### TEST 1

In [1]:
import pandas as pd

# --- helper (unchanged) -------------------------------------------------
def _rounds_from_score(score_str: str) -> int | None:
    if not isinstance(score_str, str):
        return None
    try:
        a, b = score_str.replace("-", "-").split("-")   # handles weird dash
        return int(a) + int(b)
    except ValueError:
        return None

DEFAULT_AGENT_ROLE = {  # ← same mapping as before
    # Duelists
    "raze": "Duelist", "jett": "Duelist", "reyna": "Duelist",
    "yoru": "Duelist", "phoenix": "Duelist", "neon": "Duelist",
    # Initiators
    "sova": "Initiator", "skye": "Initiator", "fade": "Initiator",
    "gekko": "Initiator", "breach": "Initiator", "kayo": "Initiator",
    # Controllers
    "omen": "Controller", "brimstone": "Controller", "astra": "Controller",
    "harbor": "Controller", "vyse": "Controller",
    # Sentinels
    "cypher": "Sentinel", "killjoy": "Sentinel", "sage": "Sentinel",
    "deadlock": "Sentinel", "chamber": "Sentinel",
}

# --- main builder (dictionary style .agg) -------------------------------
def build_player_kill_dataset(
    csv_path: str,
    agent_role_map: dict[str, str] | None = None,
    return_full: bool = False,
):
    role_map = (agent_role_map or DEFAULT_AGENT_ROLE).copy()

    df = pd.read_csv(csv_path)
    df["RoundsPlayed"] = df["MapScore"].apply(_rounds_from_score)

    agg = (df
           .groupby(["MatchID", "Player", "Agent"])
           .agg({
               "Kills": "sum",
               "ACS": "mean",
               "Assists": "sum",
               "Deaths": "sum",
               "FirstKills": "sum",
               "RoundsPlayed": "sum"
           })
           .reset_index())

    # map agent → role
    agg["Agent_lc"] = agg["Agent"].str.lower()
    agg["Role"] = agg["Agent_lc"].map(role_map).fillna("Unknown")

    # one-hot encode role
    encoded = pd.get_dummies(agg, columns=["Role"], prefix="Role")

    # split features / target
    X = encoded.drop(columns=["MatchID", "Player", "Agent",
                              "Agent_lc", "Kills"])
    y = encoded["Kills"]

    return (X, y, encoded) if return_full else (X, y)


In [8]:
# EMEA example
X, y = build_player_kill_dataset("emea.csv")

# If you want to inspect
X, y, df_dbg = build_player_kill_dataset("emea.csv", return_full=True)
print(X.head())


     ACS  Assists  Deaths  FirstKills RoundsPlayed  Role_Controller  \
0  147.0        3      14         0.0            0            False   
1  215.0        5      13         2.0            0            False   
2  304.5       18      20         3.0            0             True   
3  233.5       10      27        13.0            0            False   
4  159.5       13      20         2.0            0            False   

   Role_Duelist  Role_Initiator  Role_Sentinel  Role_Unknown  
0         False            True          False         False  
1         False           False          False          True  
2         False           False          False         False  
3          True           False          False         False  
4         False            True          False         False  


In [49]:
from xgboost import XGBRegressor
import numpy as np

n_boot = 500
preds = np.empty((n_boot, len(y)))

for i in range(n_boot):
    sample_idx = np.random.choice(len(y), replace=True, size=len(y))
    X_s, y_s = X.iloc[sample_idx], y.iloc[sample_idx]
    model = XGBRegressor(max_depth=6, n_estimators=200, learning_rate=0.1)
    model.fit(X_s, y_s)
    preds[i] = model.predict(X)

# mean prediction & 95 % CI for each row
pred_mean = preds.mean(axis=0)
pred_low  = np.percentile(preds, 2.5, axis=0)
pred_high = np.percentile(preds, 97.5, axis=0)


KeyboardInterrupt: 

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# ------------------------------------------------------------
# 1.  Build dataset
# ------------------------------------------------------------
def build_player_map_opponent_dataset(csv_path: str):
    """Return X (one-hot), y, groups (MatchID), and raw df."""
    df = pd.read_csv(
        csv_path,
        usecols=["MatchID", "Map", "Team", "Player", "Kills"]
    )

    # ---------- derive OpponentTeam ----------------------------------------
    def get_opponent(series):
        teams = series.unique()
        if len(teams) == 2:
            return series.replace({teams[0]: teams[1], teams[1]: teams[0]})
        return series  # fallback; shouldn’t happen
    df["OpponentTeam"] = (
        df.groupby(["MatchID", "Map"])["Team"]
          .transform(get_opponent)
    )

    # ---------- one-hot encode three inputs --------------------------------
    X = pd.get_dummies(
            df[["Player", "Map", "OpponentTeam"]],
            drop_first=False
        )
    y = df["Kills"]
    groups = df["MatchID"]        # keep matches intact in split
    return X, y, groups, df

# ------------------------------------------------------------
# 2.  Load any region file
#     (swap path to "amer.csv" / "apac.csv" etc.)
# ------------------------------------------------------------
X, y, groups, raw = build_player_map_opponent_dataset("emea.csv")

# ------------------------------------------------------------
# 3.  Train / test split – group-aware (80 / 20)
# ------------------------------------------------------------
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
tr_idx, te_idx = next(gss.split(X, y, groups))
X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]

# ------------------------------------------------------------
# 4.  Fit XGBoost
# ------------------------------------------------------------
model = XGBRegressor(
    max_depth=6,
    n_estimators=400,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="reg:squarederror",
    random_state=0,
    n_jobs=4
)
model.fit(X_tr, y_tr)

# ------------------------------------------------------------
# 5.  Evaluate
# ------------------------------------------------------------
pred = model.predict(X_te)
print(f"MAE : {mean_absolute_error(y_te, pred):.2f}")
print(f"R²  : {r2_score(y_te, pred):.3f}")

# ------------------------------------------------------------
# 6.  Inspect a few predictions
# ------------------------------------------------------------
preview = (raw.iloc[te_idx]
           .loc[:, ["MatchID", "Player", "Team", "OpponentTeam", "Map", "Kills"]]
           .assign(Pred=np.round(pred, 2))
           .head(15))
print("\nSample predictions:")
print(preview.to_string(index=False))


MAE : 5.84
R²  : -0.179

Sample predictions:
 MatchID    Player Team OpponentTeam      Map  Kills      Pred
       9   MrFaliN  FUT           M8   Icebox     17 15.700000
       9      xeus  FUT           M8   Icebox     19 20.360001
       9     qRaxs  FUT           M8   Icebox     13 12.430000
       9 AtaKaptan  FUT           M8   Icebox     13 13.970000
       9   yetujey  FUT           M8   Icebox     11 14.000000
       9     kamyk   M8          FUT   Icebox     22 12.330000
       9   K4DAVRA   M8          FUT   Icebox     17 13.240000
       9     Minny   M8          FUT   Icebox     16 14.990000
       9    Zyppan   M8          FUT   Icebox     11  8.860000
       9     Click   M8          FUT   Icebox     19 11.010000
       9 AtaKaptan  FUT           M8 Fracture     23 15.180000
       9      xeus  FUT           M8 Fracture     29 21.340000
       9   MrFaliN  FUT           M8 Fracture     28 16.830000
       9   yetujey  FUT           M8 Fracture     26 15.210000
       9  

In [17]:
import joblib

# after `model.fit(X_tr, y_tr)` is done
joblib.dump(model, "xgb_player_map_opponent.pkl")
X.columns.to_series().to_pickle("model_columns.pkl")   # exact column order


In [18]:
import pandas as pd
import numpy as np
import joblib

def predict_kills(player, map_name, opp_team,
                  model_path="xgb_player_map_opponent.pkl",
                  cols_path="model_columns.pkl"):
    # --- load model & column index ----------------------------
    model = joblib.load(model_path)
    cols  = pd.read_pickle(cols_path)          # Index of training columns

    # --- create 1-row DataFrame with the three inputs ----------
    row = pd.DataFrame(
        {"Player":[player], "Map":[map_name], "OpponentTeam":[opp_team]}
    )

    # one-hot encode using the *same* logic as training
    row_enc = pd.get_dummies(row, drop_first=False)

    # add any missing columns, keep correct order
    row_enc = row_enc.reindex(columns=cols, fill_value=0)

    # --- predict ----------------------------------------------
    pred = model.predict(row_enc)[0]
    return pred


In [20]:
predict_kills("nAts", "Icebox", "NAVI")


16.241686

### TEST 2

In [74]:
import pandas as pd, numpy as np, joblib
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# ── helper ──────────────────────────────────────────────────────────────
def rounds_from_score(s: str) -> float:
    try:
        a, b = s.replace("-", "-").split("-")   # handle weird dash
        return int(a) + int(b)
    except Exception:
        return np.nan                           # filled later
def build_xy(csv_path: str):
    keep = ["MatchID", "Map", "Team", "Player", "Kills", "MapScore"]
    df   = pd.read_csv(csv_path, usecols=keep)

    # 1. numeric map length
    df["RoundsPlayed"] = df["MapScore"].str.replace("-", "-").str.split("-").apply(
        lambda ab: pd.to_numeric(ab).sum() if isinstance(ab, list) else np.nan
    )
    df["RoundsPlayed"].fillna(df["RoundsPlayed"].median(), inplace=True)

    # 2. create OpponentTeam  (⇄ swap the two team strings)
    def swap(series):
        uniq = series.unique()
        if len(uniq) == 2:
            return series.replace({uniq[0]: uniq[1], uniq[1]: uniq[0]})
        return np.nan                          # safeguard
    df["OpponentTeam"] = df.groupby(["MatchID", "Map"])["Team"].transform(swap)

    # make sure we didn't accidentally introduce NaNs
    assert df["OpponentTeam"].isna().sum() == 0, "OpponentTeam has NaNs!"

    # 3. one-hot encode Player, Map, OpponentTeam
    Xcat = pd.get_dummies(df[["Player", "Map", "OpponentTeam"]], drop_first=False)
    X    = pd.concat([Xcat, df[["RoundsPlayed"]].astype(float)], axis=1)
    y    = df["Kills"]
    groups = df["MatchID"]
    return X, y, groups, df


In [78]:
import joblib
import pandas as pd

# 1) save the XGBoost model
joblib.dump(model, "kills_model.pkl")

# 2) save the exact training column order (X is your design-matrix DataFrame)
pd.Series(X.columns).to_pickle("kills_model_cols.pkl")

print("✅  kills_model.pkl and kills_model_cols.pkl written to disk.")


✅  kills_model.pkl and kills_model_cols.pkl written to disk.


In [79]:
def predict_kills(player, map_name, opponent_team, rounds_played,
                  model_path="kills_model.pkl",
                  cols_path="kills_model_cols.pkl") -> float:
    """Return predicted kills for the given inputs."""
    model = joblib.load(model_path)
    cols  = pd.read_pickle(cols_path)     # training column order

    row = pd.DataFrame({
        "Player":       [player],
        "Map":          [map_name],
        "OpponentTeam": [opponent_team],
        "RoundsPlayed": [float(rounds_played)]
    })

    row_enc = pd.get_dummies(row, drop_first=False)
    row_enc = row_enc.reindex(columns=cols, fill_value=0)   # add missing dummies

    return model.predict(row_enc)[0]

# ── demo ───────────────────────────────────────────────────────────────
est = predict_kills("kamo", "Split", "FNC", 24)
print(f"Predicted kills: {est:.1f}")


Predicted kills: 15.3


In [80]:
import pandas as pd
print(sorted(pd.read_pickle("kills_model_cols.pkl")
             .str.replace("OpponentTeam_", "")
             .loc[lambda s: s.str.startswith("OpponentTeam_")]))


[]


In [73]:
import pandas as pd

df  = pd.read_csv("emea.csv", usecols=["MatchID", "Map", "Team"])
# same transformation we used in make_xy()
df["OpponentTeam"] = (
    df.groupby(["MatchID", "Map"])["Team"]
      .transform(lambda s: s.replace(dict(zip(s.unique(), s.unique()[::-1]))) )
)

print(df[["Team", "OpponentTeam"]].head(10))
print(df["OpponentTeam"].isna().mean(), "fraction NaNs")


  Team OpponentTeam
0  VIT           GX
1  VIT           GX
2  VIT           GX
3  VIT           GX
4  VIT           GX
5   GX          VIT
6   GX          VIT
7   GX          VIT
8   GX          VIT
9   GX          VIT
0.0 fraction NaNs


### TEST 3

In [1]:
import pandas as pd
emea_df = pd.read_csv("emea.csv")
amer_df = pd.read_csv("amer.csv")
cn_df   = pd.read_csv("cn.csv")
apac_df = pd.read_csv("apac.csv")

emea_df["Region"] = "EMEA"
amer_df["Region"] = "AMER"
cn_df["Region"]   = "CN"
apac_df["Region"] = "APAC"

# combine all regions into one DataFrame
all_df = pd.concat([emea_df, amer_df, cn_df, apac_df], ignore_index=True)
all_df["MatchID"] = all_df["MatchID"].astype(str)
all_df["MatchID"] = all_df["MatchID"] + "_" + all_df["Region"]
all_df["Player"] = all_df["Player"].str.lower()
all_df["Agent"] = all_df["Agent"].str.lower()
all_df.to_csv("all_regions.csv", index=False)



In [2]:
all_df.head()

Unnamed: 0,Map,Team,Player,Agent,ACS,Kills,Deaths,Assists,FirstKills,Winner,MapScore,Date,MatchID,MatchURL,Region
0,Split,VIT,sayf,tejo,272.0,21,13,4,2.0,True,13‑9,2025-03-26,1_EMEA,https://www.vlr.gg/459826/team-vitality-vs-gia...,EMEA
1,Split,VIT,cyvoph,omen,225.0,18,12,11,1.0,True,13‑9,2025-03-26,1_EMEA,https://www.vlr.gg/459826/team-vitality-vs-gia...,EMEA
2,Split,VIT,less,cypher,166.0,13,14,5,1.0,True,13‑9,2025-03-26,1_EMEA,https://www.vlr.gg/459826/team-vitality-vs-gia...,EMEA
3,Split,VIT,kicks,breach,152.0,11,14,7,0.0,True,13‑9,2025-03-26,1_EMEA,https://www.vlr.gg/459826/team-vitality-vs-gia...,EMEA
4,Split,VIT,derke,yoru,161.0,12,18,4,7.0,True,13‑9,2025-03-26,1_EMEA,https://www.vlr.gg/459826/team-vitality-vs-gia...,EMEA


In [3]:
"""
Train an XGBoost model that predicts a player's kills on a map
from   Player  +  Map  +  OpponentTeam  +  RoundsPlayed.
Saves:
    kills_model.pkl        – trained XGBRegressor
    kills_model_cols.pkl   – ordered Index of one-hot feature columns
"""

import pandas as pd, numpy as np, joblib, os
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# -----------------------------------------------------------------------
# 0.  CONFIGURATION  (adjust if you like)
# -----------------------------------------------------------------------

CSV_PATH   = "all_regions.csv"       # <-- change to amer.csv, apac.csv ...
TEST_SIZE  = 0.20                   # 20 % of matches for evaluation
RANDOM_SEED = 42
MODEL_OUT  = "kills_model.pkl"
COLS_OUT   = "kills_model_cols.pkl"

# -----------------------------------------------------------------------
# 1.  Helper: "6-13"  ->  19 rounds
# -----------------------------------------------------------------------
def rounds_from_score(s: str) -> float:
    try:
        a, b = s.replace("-", "-").split("-")   # handles weird dash char
        return int(a) + int(b)
    except Exception:
        return np.nan

# -----------------------------------------------------------------------
# 2.  Build design matrix X, target y, and grouping key
# -----------------------------------------------------------------------
def build_xy(csv_path: str):
    usecols = ["MatchID", "Map", "Team", "Player", "Kills", "MapScore"]
    df = pd.read_csv(csv_path, usecols=usecols)

    # 2.1 numeric match length
    df["RoundsPlayed"] = df["MapScore"].apply(rounds_from_score)
    df["RoundsPlayed"].fillna(df["RoundsPlayed"].median(), inplace=True)

    # 2.2 derive OpponentTeam by swapping the two teams on each (Match,Map)
    def swap(series):
        t1, t2 = series.unique()[:2]           # exactly two unique teams
        return series.replace({t1: t2, t2: t1})
    df["OpponentTeam"] = (
        df.groupby(["MatchID", "Map"])["Team"]
          .transform(swap)
    )

    # 2.3 one-hot encode the 3 categoricals; keep numeric RoundsPlayed
    X_cat = pd.get_dummies(df[["Player", "Map", "OpponentTeam"]],
                           drop_first=False)
    X = pd.concat([X_cat, df[["RoundsPlayed"]].astype(float)], axis=1)
    y = df["Kills"]
    groups = df["MatchID"]                     # keep matches intact in split
    return X, y, groups

X, y, groups = build_xy(CSV_PATH)

# -----------------------------------------------------------------------
# 3.  Group-aware train/test split  (no leakage between same match)
# -----------------------------------------------------------------------
gss = GroupShuffleSplit(test_size=TEST_SIZE,
                        n_splits=1,
                        random_state=RANDOM_SEED)
tr_idx, te_idx = next(gss.split(X, y, groups))
X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]

# -----------------------------------------------------------------------
# 4.  Fit XGBoost regressor
# -----------------------------------------------------------------------
model = XGBRegressor(
    max_depth=5,
    n_estimators=400,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="reg:squarederror",
    n_jobs=4,
    random_state=RANDOM_SEED
).fit(X_tr, y_tr)

# -----------------------------------------------------------------------
# 5.  Evaluate
# -----------------------------------------------------------------------
print(f"Test MAE : {mean_absolute_error(y_te, model.predict(X_te)):.2f}")
print(f"Test R²  : {r2_score(y_te,      model.predict(X_te)):.3f}")

# -----------------------------------------------------------------------
# 6.  Save artefacts
# -----------------------------------------------------------------------
joblib.dump(model, MODEL_OUT)
pd.Series(X.columns).to_pickle(COLS_OUT)
print(f"✅  Saved: {MODEL_OUT}, {COLS_OUT}  ({len(X.columns)} features)")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["RoundsPlayed"].fillna(df["RoundsPlayed"].median(), inplace=True)


Test MAE : 4.48
Test R²  : -0.028
✅  Saved: kills_model.pkl, kills_model_cols.pkl  (310 features)


In [4]:


MODEL_PATH = "kills_model.pkl"
COLS_PATH  = "kills_model_cols.pkl"

def predict_kills(player: str,
                  map_name: str,
                  opponent_team: str,
                  rounds_played: int | float,
                  model_path: str = MODEL_PATH,
                  cols_path: str = COLS_PATH) -> float:
    """Return estimated kills (float)."""
    # 1. load model + training column order
    model = joblib.load(model_path)
    cols  = pd.read_pickle(cols_path)   # pandas Index

    # 2. build 1-row DataFrame with the four inputs
    row = pd.DataFrame({
        "Player":       [player.lower()],
        "Map":          [map_name],
        "OpponentTeam": [opponent_team],
        "RoundsPlayed": [float(rounds_played)]
    })

    # 3. one-hot encode & align columns
    row_enc = pd.get_dummies(row, drop_first=False)
    row_enc = row_enc.reindex(columns=cols, fill_value=0)

    # 4. predict
    return model.predict(row_enc)[0]

# # ── demo ────────────────────────────────────────────────────────────────
# if __name__ == "__main__":
#     est = predict_kills("paTiTek", "Split", "FNC", 23)
#     print(f"Predicted kills: {est:.1f}")


In [5]:
import pandas as pd, numpy as np, re, joblib
from pathlib import Path

def score_kills_csv(csv_path,
                    model_path="kills_model.pkl",
                    cols_path="kills_model_cols.pkl",
                    out_path=None):

    df = pd.read_csv(csv_path)

    # ---- 0) canonicalise the four must-have columns ------------------
    ren = {
        "Match": "MatchID",
        r"map(|name)|map_title":               "Map",
        r"team$|my[_\s]?team|squad":           "Team",
        r"player(|name)|gamer(tag)?|nick":     "Player",
    }
    for pat, new in ren.items():
        for c in df.columns:
            if re.fullmatch(pat, c, flags=re.I):
                df = df.rename(columns={c: new})

    required = {"MatchID", "Map", "Team", "Player"}
    if not required.issubset(df.columns):
        missing = required - set(df.columns)
        raise ValueError(f"CSV still missing columns: {missing}")

    # ---- 1) RoundsPlayed (parse if absent) ---------------------------
    if "RoundsPlayed" not in df.columns:
        if "MapScore" not in df.columns:
            raise ValueError("Need RoundsPlayed or MapScore column.")
        df["RoundsPlayed"] = (
            df["MapScore"].astype(str)
              .apply(lambda s: sum(map(int, re.findall(r"\d+", s)[:2]))
                                  if re.search(r"\d+.*\d+", s) else np.nan)
        )
    df["RoundsPlayed"] = pd.to_numeric(df["RoundsPlayed"], errors="coerce")

    # ---- 2) OpponentTeam by swapping ---------------------------------
    def swap(s):
        u = s.unique()
        if len(u) != 2:
            raise ValueError("Each MatchID–Map must have exactly 2 teams")
        return s.replace({u[0]: u[1], u[1]: u[0]})
    df["OpponentTeam"] = df.groupby(["MatchID", "Map"])["Team"].transform(swap)

    # ---- 3) build feature matrix & score -----------------------------
    X = pd.get_dummies(df[["Player", "Map", "OpponentTeam", "RoundsPlayed"]],
                       drop_first=False)
    X = X.reindex(columns=pd.read_pickle(cols_path), fill_value=0)

    df["PredKills"] = joblib.load(model_path).predict(X)

    if out_path:
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(out_path, index=False)

    return df


In [6]:
scored = score_kills_csv("test.csv", out_path="test_scored.csv")
est = predict_kills("Xeppa", "Lotus", "MIBR", 28)
print(f"Predicted kills: {est:.1f}")

Predicted kills: 14.7


### TESTING ON BETTING DATAFRAME

In [7]:
from datetime import datetime, timedelta
import joblib


In [8]:
%run ../aryan/elo_implementation.ipynb

FNATIC Ascent elo on 2025-02-15: 100

TEAM LIQUID map ratings (as of 2025-02-15):
{}

Final Elo table:
Team      Map    Elo
100T   Ascent 106.75
100T Fracture 110.62
100T    Haven  61.20
100T   Icebox  99.68
100T    Pearl  79.80
100T    Split 103.44
  2G   Ascent 110.85
  2G    Haven  83.79
  2G   Icebox  78.48
  2G    Lotus  93.78
  2G    Pearl  96.48
  AG Fracture 100.60
  AG    Lotus  87.63
  AG    Pearl  55.16
  AG    Split  95.40
 APK   Ascent  84.24
 APK    Haven  95.40
 APK    Lotus  68.41
 APK    Split  85.97
 BBL   Ascent 104.60
 BBL Fracture  92.59
 BBL    Haven 133.02
 BBL   Icebox  95.15
 BBL    Lotus 111.19
 BBL    Pearl 110.63
 BLG   Ascent 120.20
 BLG Fracture 101.94
 BLG    Haven  75.53
 BLG   Icebox 107.35
 BLG    Lotus 125.47
 BLG    Pearl 123.26
 BLG    Split  87.60
 BME   Ascent  84.52
 BME Fracture  85.01
 BME    Haven 104.15
 BME   Icebox 109.87
 BME    Lotus 105.99
 BME    Pearl 107.20
 BME    Split  85.57
  C9   Ascent 109.49
  C9 Fracture 103.96
  C9    Haven 1

In [9]:
vct_teams_fullname_to_abbr = {
    # VCT Americas
    "100 THIEVES": "100T",
    "CLOUD9": "C9",
    "EVIL GENIUSES": "EG",
    "FURIA": "FURIA",
    "G2 ESPORTS": "G2",
    "KRÜ ESPORTS": "KRÜ",
    "LEVIATÁN": "LEV",
    "LOUD": "LOUD",
    "MIBR": "MIBR",
    "NRG": "NRG",
    "SENTINELS": "SEN",
    "2GAME ESPORTS": "2G",

    # VCT EMEA
    "BBL ESPORTS": "BBL",
    "FNATIC": "FNC",
    "FUT ESPORTS": "FUT",
    "GIANTX": "GX",
    "TEAM HERETICS": "HER",
    "KARMINE CORP": "KC",
    "KOI": "KOI",
    "NATUS VINCERE": "NAVI",
    "TEAM LIQUID": "TL",
    "TEAM VITALITY": "VIT",
    "GENTLE MATES": "GM",
    "APEKS" : "APK",

    # VCT Pacific (APAC)
    "BLEED ESPORTS": "BLEED",
    "DETONATION FOCUSME": "DFM",
    "DRX": "DRX",
    "GLOBAL ESPORTS": "GE",
    "GEN.G ESPORTS": "GEN.G",
    "PAPER REX": "PRX",
    "REX REGUM QEON": "RRQ",
    "T1": "T1",
    "TALON ESPORTS": "TALON",
    "TEAM SECRET": "TS",
    "ZETA DIVISION": "ZETA",
    "BOOM ESPORTS": "BME",

    # VCT China
    "ALL GAMERS": "AG",
    "BILIBILI GAMING": "BLG",
    "EDWARD GAMING": "EDG",
    "FUNPLUS PHOENIX": "FPX",
    "JD GAMING": "JDG",
    "NOVA ESPORTS": "NOVA",
    "TITAN ESPORTS CLUB": "TEC",
    "TRACE ESPORTS": "TRACE",
    "TYLOO": "TYLOO",
    "WOLVES ESPORTS": "WOLVES",
    "DRAGON RANGER GAMING": "DRG",
    "XI LAI GAMING": "XLG"

}
def get_team_abbreviation(team_name: str) -> str:
    """Return the team abbreviation for a given team name."""
    return vct_teams_fullname_to_abbr.get(team_name.upper(), team_name.upper())


In [13]:
# --- Load your dataframes here ---
betting_df = pd.read_csv('../leo/valorant_projections_2025-05-24_18-20-05.csv')          # The betting lines dataframe
stats_df = pd.read_csv('../anthony/all_regions.csv')            # The match-level stats dataframe

def data_frame_preprocessing(betting_df, stats_df):
    """
    Preprocess the betting and stats dataframes to prepare for kill predictions.
    
    Args:
        betting_df (pd.DataFrame): DataFrame containing betting lines.
        stats_df (pd.DataFrame): DataFrame containing match statistics.
        
    Returns:
        pd.DataFrame: Processed betting DataFrame with predicted kills.
    """

    # --- Step 1: Normalize player names ---
    betting_df['player_name'] = betting_df['player_name'].str.lower()

    # --- Step 2: Determine the most recent team for each player ---
    # Convert player column to lowercase in stats_df for merging
    stats_df['Player_lower'] = stats_df['Player'].str.lower()

    # Sort by date to get the most recent match
    stats_df['Date'] = pd.to_datetime(stats_df['Date'])
    most_recent_team = stats_df.sort_values('Date').drop_duplicates('Player_lower', keep='last')
    player_team_map = dict(zip(most_recent_team['Player_lower'], most_recent_team['Team']))

    # Add team info to betting_df
    betting_df['team'] = betting_df['player_name'].map(player_team_map)

    # --- Step 3: Extract opponent team ---
    betting_df['opponent_team'] = betting_df['opponent'].str.extract(r'^(.*?)\s+MAP', expand=False)


    # --- Step 4: Set match date to tomorrow ---
    # All betting lines are assumed to be for tomorrow
    return betting_df


# --- Step 5: Predict kills ---
def get_kill_prediction(row):
    player = row['player_name'].title()  # Convert to match stats_df
    team = row['team']
    team = vct_teams_fullname_to_abbr.get(str(team).upper(), team)
    opponent = row['opponent_team']
    opponent = vct_teams_fullname_to_abbr.get(str(opponent).upper(), opponent)
    # Normalize opponent name
    match_date = (datetime.today() + timedelta(days=1)).strftime('%Y-%m-%d')
    stat_type = row['stat_type']

    if pd.isna(team) or pd.isna(opponent):
        return None

    try:
        map_result = simulate_map_selection(team, opponent, match_date)
    except Exception as e:
        print(f"Map sim failed for {team} vs {opponent}: {e}")
        return None

    # DEFAULT maps_1-2_kills
    maps_to_play = [map_result["PickA"], map_result["PickB"]]

    if stat_type == "maps_1-3_kills" or stat_type == "map_1_2_3_kills":
        remaining_pool = [m for m in ALL_MAPS if m not in {
            map_result["BanA"], map_result["BanB"], map_result["PickA"], map_result["PickB"]
        }]
        if remaining_pool:
            try:
                decider_map = max(remaining_pool, key=lambda m: elo_hist.rating(team, m, pd.to_datetime(match_date)))
                maps_to_play.append(decider_map)
            except:
                pass  # If Elo fails, just skip decider
    elif stat_type == "map_3_kills":
        # Only simulate the decider map (3rd map)
        remaining_pool = [m for m in ALL_MAPS if m not in {
            map_result["BanA"], map_result["BanB"], map_result["PickA"], map_result["PickB"]
        }]
        maps_to_play = []  # reset to ensure only 1 map
        if remaining_pool:
            try:
                decider_map = max(remaining_pool, key=lambda m: elo_hist.rating(team, m, pd.to_datetime(match_date)))
                maps_to_play.append(decider_map)
            except:
                pass  # If Elo fails, just skip prediction


    total_kills = 0
    for map_name in maps_to_play:
        try:
            rounds_played = predict_map_length_by_date(team, opponent, map_name, match_date)
            kills = predict_kills(player, map_name, opponent, rounds_played)
            total_kills += kills
        except Exception as e:
            print(f"Kill prediction failed for {player} on {map_name}: {e}")
            continue

    return total_kills

betting_df = data_frame_preprocessing(betting_df, stats_df)

betting_df['predicted_kills'] = betting_df.apply(get_kill_prediction, axis=1)

# --- Output ---
print(betting_df[['player_name', 'opponent', 'stat_type', 'line', 'predicted_kills']])




   player_name             opponent         stat_type  line  predicted_kills
0       artzin  Evil Geniuses MAP 3       map_3_kills  14.0        15.576711
1        aspas  Evil Geniuses MAP 3       map_3_kills  18.0        19.025024
2     cortezia  Evil Geniuses MAP 3       map_3_kills  15.0        16.299583
3       derrek           MIBR MAP 3       map_3_kills  14.0        15.285875
4       nature           MIBR MAP 3       map_3_kills  12.5        15.285875
..         ...                  ...               ...   ...              ...
72      serial                  NaN  maps_1_2_3_kills  41.5              NaN
73   something                  NaN  maps_1_2_3_kills  46.5              NaN
74   something                  NaN  maps_1_2_3_kills  46.5              NaN
75    suygetsu                  NaN  maps_1_2_3_kills  44.5              NaN
76    suygetsu                  NaN  maps_1_2_3_kills  44.5              NaN

[77 rows x 5 columns]


In [17]:
betting_df[0:25]

Unnamed: 0,player_name,opponent,stat_type,line,sportsbook,team,opponent_team,predicted_kills
0,artzin,Evil Geniuses MAP 3,map_3_kills,14.0,Prizepicks,MIBR,Evil Geniuses,15.576711
1,aspas,Evil Geniuses MAP 3,map_3_kills,18.0,Prizepicks,MIBR,Evil Geniuses,19.025024
2,cortezia,Evil Geniuses MAP 3,map_3_kills,15.0,Prizepicks,MIBR,Evil Geniuses,16.299583
3,derrek,MIBR MAP 3,map_3_kills,14.0,Prizepicks,EG,MIBR,15.285875
4,nature,MIBR MAP 3,map_3_kills,12.5,Prizepicks,EG,MIBR,15.285875
5,supamen,MIBR MAP 3,map_3_kills,14.5,Prizepicks,EG,MIBR,15.915419
6,verno,Evil Geniuses MAP 3,map_3_kills,14.5,Prizepicks,MIBR,Evil Geniuses,15.717527
7,xenom,Evil Geniuses MAP 3,map_3_kills,14.5,Prizepicks,MIBR,Evil Geniuses,16.487379
8,yay,MIBR MAP 3,map_3_kills,16.5,Prizepicks,EG,MIBR,20.201788
9,avez,Team Liquid MAPS 1-3,maps_1-3_kills,45.5,Prizepicks,KC,Team Liquid,40.238153


In [152]:
test = {
    "player_name": "mitch",
    "team" : "CLOUD9",
    "opponent_team": "NRG ESPORTS",
    "stat_type": "maps_1-2_kills",
}
test = pd.DataFrame([test])
test["predicted_kills"] = test.apply(get_kill_prediction, axis=1)
test

Unnamed: 0,player_name,team,opponent_team,stat_type,predicted_kills
0,mitch,CLOUD9,NRG ESPORTS,maps_1-2_kills,28.06881


In [18]:
# Test data loading
test_1 = pd.read_csv('../pp_lines/prizepicks_scraper.py_2025-05-16_17-58-23.csv')
test_2 = pd.read_csv('../pp_lines/prizepicks_scraper.py_2025-05-17_14-14-52.csv')
test_df = pd.concat([test_1, test_2], ignore_index=True)

# Renaming columns to match the expected format
test_df.rename(columns={
    "players": "player_name",
    "opponents": "opponent",
}, inplace=True)

test_df = data_frame_preprocessing(test_df, stats_df)

# running predictions
test_df['predicted_kills'] = test_df.apply(get_kill_prediction, axis=1)
test_df[["player_name", "opponent", "stat_type", "line", "predicted_kills"]][50:75]

Unnamed: 0,player_name,opponent,stat_type,line,predicted_kills
50,boostio,NRG MAPS 1-2,MAPS 1-2 Kills,24.5,24.956358
51,eeiu,NRG MAPS 1-2,MAPS 1-2 Kills,30.5,30.199291
52,zander,NRG MAPS 1-2,MAPS 1-2 Kills,28.5,31.634556
53,skuba,100 Thieves MAPS 1-2,MAPS 1-2 Kills,30.5,
54,mada,100 Thieves MAPS 1-2,MAPS 1-2 Kills,34.5,29.505079
55,s0m,100 Thieves MAPS 1-2,MAPS 1-2 Kills,29.5,28.572479
56,ethan,100 Thieves MAPS 1-2,MAPS 1-2 Kills,28.5,29.820284
57,brawk,100 Thieves MAPS 1-2,MAPS 1-2 Kills,28.5,29.492775
58,oxy,KRÜ Esports MAPS 1-2,MAPS 1-2 Kills,38.5,30.560198
59,v1c,KRÜ Esports MAPS 1-2,MAPS 1-2 Kills,29.5,30.459755
