In [1]:
# ✅ CELL 1 — LIBRARIES & IMPORTS

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

from lightgbm import LGBMRegressor, early_stopping
import shap
import matplotlib.pyplot as plt

print("✅ Libraries loaded.")


✅ Libraries loaded.


In [None]:
# ✅ CELL 2 — LOAD DATA

df = pd.read_csv('/content/sample_data/mw_pw_profiles.csv')
print("Raw shape:", df.shape)

# Drop irrelevant/text columns
df.drop([
    'gender', 'balls_per_over', 'series_name', 'name_x', 'name_y',
    'unique_name', 'key_bcci', 'key_bcci_2', 'key_bigbash', 'key_cricbuzz',
    'key_cricheroes', 'key_crichq', 'key_cricinfo', 'key_cricinfo_2',
    'key_cricinfo_3', 'key_cricingif', 'key_cricketarchive',
    'key_cricketarchive_2', 'key_cricketworld', 'key_nvplay',
    'key_nvplay_2', 'key_opta', 'key_opta_2', 'key_pulse', 'key_pulse_2',
    'full_name', 'teams'
], axis=1, inplace=True)

# Label encode player_id and player_team ONLY
le = LabelEncoder()
df['player_id'] = le.fit_transform(df['player_id'])
df['player_team'] = le.fit_transform(df['player_team'])

print("✅ player_id and player_team encoded.")

# ✅ Keep opposition_team as TEXT for rolling!
# We'll encode/drop later.

# One-hot match type
df = pd.get_dummies(df, columns=['match_type'], drop_first=True, dtype=int)

print("✅ After basic clean:", df.shape)
df.head()


  df = pd.read_csv('/content/sample_data/mw_pw_profiles.csv')


Raw shape: (333848, 58)
✅ player_id and player_team encoded.
✅ After basic clean: (333848, 35)


Unnamed: 0,player_id,match_id,start_date,runs_scored,player_out,balls_faced,fours_scored,sixes_scored,catches_taken,run_out_direct,...,bowling_style,playing_role,fantasy_score_batting,fantasy_score_bowling,fantasy_score_total,match_type_MDM,match_type_ODI,match_type_ODM,match_type_T20,match_type_Test
0,922,1158348,21-08-2018,10.0,1.0,30.0,1.0,0.0,1.0,0.0,...,,,5.0,8.0,17.0,0,0,0,1,0
1,922,1182644,05-05-2019,8.0,1.0,18.0,1.0,0.0,0.0,0.0,...,,,3.0,0.0,7.0,0,0,0,1,0
2,922,1275107,09-09-2021,1.0,0.0,5.0,0.0,0.0,0.0,0.0,...,,,1.0,0.0,5.0,0,0,0,1,0
3,922,1275113,15-09-2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,0.0,4.0,0,0,0,1,0
4,922,1275125,14-09-2021,1.0,1.0,15.0,0.0,0.0,0.0,0.0,...,,,-5.0,0.0,-1.0,0,0,0,1,0


In [None]:
# ✅ CELL 3 — BASE RATE FEATURES

df['strike_rate'] = (df['runs_scored'] / df['balls_faced']) * 100
df['economy_rate'] = (df['runs_conceded'] / df['balls_bowled']) * 100
df['economy'] = (df['runs_conceded'] / df['balls_bowled']) * 6

df.drop([
    'balls_faced', 'runs_conceded', 'balls_bowled',
    'player_out', 'dot_balls_as_batsman', 'dot_balls_as_bowler',
    'fantasy_score_batting', 'fantasy_score_bowling', 'out_kind',
    'order_seen'
], axis=1, inplace=True)

df['start_date'] = pd.to_datetime(df['start_date'], format="%d-%m-%Y")

print("✅ Base features done:", df.shape)
df.head()


✅ Base features done: (333848, 28)


Unnamed: 0,player_id,match_id,start_date,runs_scored,fours_scored,sixes_scored,catches_taken,run_out_direct,run_out_throw,stumpings_done,...,playing_role,fantasy_score_total,match_type_MDM,match_type_ODI,match_type_ODM,match_type_T20,match_type_Test,strike_rate,economy_rate,economy
0,922,1158348,2018-08-21,10.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,17.0,0,0,0,1,0,33.333333,,
1,922,1182644,2019-05-05,8.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,7.0,0,0,0,1,0,44.444444,,
2,922,1275107,2021-09-09,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,5.0,0,0,0,1,0,20.0,,
3,922,1275113,2021-09-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,4.0,0,0,0,1,0,,,
4,922,1275125,2021-09-14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,-1.0,0,0,0,1,0,6.666667,,


In [None]:
# ✅ CELL 4 — ROLLING EWM

features_to_span = [
    'runs_scored', 'fours_scored', 'sixes_scored', 'catches_taken',
    'run_out_direct', 'run_out_throw', 'stumpings_done', 'wickets_taken',
    'bowled_done', 'lbw_done', 'maidens', 'strike_rate', 'economy_rate', 'economy'
]

spans = [1, 5, 10]

for feat in features_to_span:
    for span in spans:
        df[f"{feat}_{span}"] = (
            df.groupby('player_id')[feat]
            .transform(lambda x: x.shift(1).ewm(span=span, adjust=False).mean())
        )

print("✅ Rolling EWM done:", df.shape)
df.head()


✅ Rolling EWM done: (333848, 70)


Unnamed: 0,player_id,match_id,start_date,runs_scored,fours_scored,sixes_scored,catches_taken,run_out_direct,run_out_throw,stumpings_done,...,maidens_10,strike_rate_1,strike_rate_5,strike_rate_10,economy_rate_1,economy_rate_5,economy_rate_10,economy_1,economy_5,economy_10
0,922,1158348,2018-08-21,10.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,
1,922,1182644,2019-05-05,8.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,33.333333,33.333333,33.333333,,,,,,
2,922,1275107,2021-09-09,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,44.444444,37.037037,35.353535,,,,,,
3,922,1275113,2021-09-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,31.358025,32.561983,,,,,,
4,922,1275125,2021-09-14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,31.358025,32.561983,,,,,,


In [None]:
# ✅ CELL 5 — ADVANCED + ROUND 4

print("✅ Adding advanced ratios, slopes, momentum, opposition...")

# Safe divide helper
def safe_div(num, den):
    return (num / den).replace([np.inf, -np.inf], 0).fillna(0)

# Ratios
df['runs_scored_1_vs_5'] = safe_div(df['runs_scored_1'], df['runs_scored_5'])
df['runs_scored_1_vs_10'] = safe_div(df['runs_scored_1'], df['runs_scored_10'])
df['runs_scored_5_vs_10'] = safe_div(df['runs_scored_5'], df['runs_scored_10'])

# Career avg runs
df['career_avg_runs'] = (
    df.groupby('player_id')['runs_scored']
    .transform(lambda x: x.shift(1).expanding().mean())
).fillna(0)

# Trend slope
def calc_trend_slope(x):
    x = x.dropna()
    if len(x) < 2: return 0
    X = np.arange(len(x)).reshape(-1, 1)
    y = x.values
    return LinearRegression().fit(X, y).coef_[0]

df['trend_slope_5'] = (
    df.groupby('player_id')['runs_scored']
    .transform(lambda x: x.shift(1).rolling(5, min_periods=2).apply(calc_trend_slope, raw=False))
).fillna(0)

df['days_since_last'] = df.groupby('player_id')['start_date'].diff().dt.days.fillna(0)

# Strike rate lags & diffs
df['strike_rate_5'] = df.groupby('player_id')['strike_rate'].transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))
df['strike_rate_10'] = df.groupby('player_id')['strike_rate'].transform(lambda x: x.rolling(10, min_periods=1).mean().shift(1))
df['strike_rate_momentum'] = df['strike_rate_5'] / (df['strike_rate_10'] + 1e-6)
df['career_avg_strike_rate'] = df.groupby('player_id')['strike_rate'].transform('mean')
df['strike_rate_diff_5'] = df['strike_rate_5'] - df['career_avg_strike_rate']

# Economy lags & diffs
df['economy_rate_5'] = df.groupby('player_id')['economy_rate'].transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))
df['economy_rate_10'] = df.groupby('player_id')['economy_rate'].transform(lambda x: x.rolling(10, min_periods=1).mean().shift(1))
df['economy_rate_momentum'] = df['economy_rate_5'] / (df['economy_rate_10'] + 1e-6)
df['economy_rate_diff_5'] = df['economy_rate_5'] - df['economy_rate']

# Interactions
df['strike_rate_T20'] = df['strike_rate'] * df['match_type_T20']
df['strike_rate_Test'] = df['strike_rate'] * df['match_type_Test']
df['economy_rate_ODI'] = df['economy_rate'] * df['match_type_ODI']

# ✅ Opposition trends
df['opp_runs_conceded_5'] = df.groupby('opposition_team')['runs_scored'].transform(
    lambda x: x.rolling(5, min_periods=1).mean().shift(1)
)
df['opp_wickets_conceded_5'] = df.groupby('opposition_team')['wickets_taken'].transform(
    lambda x: x.rolling(5, min_periods=1).mean().shift(1)
)

print("✅ Advanced + momentum + opposition features done:", df.shape)

# Drop opposition_team now
df.drop(['opposition_team'], axis=1, inplace=True)
df.fillna(0, inplace=True)


✅ Adding advanced ratios, slopes, momentum, opposition...
✅ Advanced + momentum + opposition features done: (333848, 86)


  df.fillna(0, inplace=True)


In [None]:
# ✅ CELL 6: FINAL TRAIN-TEST SPLIT (LEAK-PROOF)

print("Splitting train/test...")

# Make sure start_date is datetime
df['start_date'] = pd.to_datetime(df['start_date'])

cutoff_date = pd.to_datetime('2024-07-01')

# ✅ 1️⃣ Train/test split
train_df = df[df['start_date'] < cutoff_date].copy()
test_df = df[df['start_date'] >= cutoff_date].copy()

print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")

# ✅ 2️⃣ Align columns if needed
for col in set(train_df.columns) - set(test_df.columns):
    test_df[col] = 0
for col in set(test_df.columns) - set(train_df.columns):
    train_df[col] = 0
test_df = test_df[train_df.columns]

# ✅ 3️⃣ Remove ALL leaky columns (but KEEP the target!)
bad_cols = [col for col in train_df.columns if (
    ('fantasy' in col.lower() and col != 'fantasy_score_total')
    or col in [
        'runs_scored', 'wickets_taken', 'fours_scored', 'sixes_scored',
        'bowled_done', 'lbw_done', 'maidens', 'strike_rate', 'economy_rate', 'economy'
    ]
)]
print("🚫 Dropping leakers BEFORE split:", bad_cols)

train_df = train_df.drop(bad_cols, axis=1, errors='ignore')
test_df = test_df.drop(bad_cols, axis=1, errors='ignore')

# ✅ 4️⃣ Now safely split out X/y
X_train = train_df.drop(['fantasy_score_total', 'match_id', 'start_date'], axis=1, errors='ignore')
y_train = train_df['fantasy_score_total']

X_test = test_df.drop(['fantasy_score_total', 'match_id', 'start_date'], axis=1, errors='ignore')
y_test = test_df['fantasy_score_total']

# ✅ Final check: should be empty lists
print(f"Remaining fantasy cols: {[col for col in X_train.columns if 'fantasy' in col.lower()]}")
print(f"Remaining direct stat cols: {[col for col in X_train.columns if col in ['runs_scored', 'wickets_taken', 'fours_scored', 'sixes_scored', 'bowled_done', 'lbw_done', 'maidens']]}")


Splitting train/test...
Train shape: (299006, 85) | Test shape: (34842, 85)
🚫 Dropping leakers BEFORE split: ['runs_scored', 'fours_scored', 'sixes_scored', 'wickets_taken', 'bowled_done', 'lbw_done', 'maidens', 'strike_rate', 'economy_rate', 'economy']
Remaining fantasy cols: []
Remaining direct stat cols: []


In [None]:
!pip install pyswarm

Collecting pyswarm
  Downloading pyswarm-0.6.tar.gz (4.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyswarm
  Building wheel for pyswarm (setup.py) ... [?25l[?25hdone
  Created wheel for pyswarm: filename=pyswarm-0.6-py3-none-any.whl size=4463 sha256=7c7e6a56e2a22e2e61d53363632a0c5f5ed9ae5cfbb78d7cef041d216cbff688
  Stored in directory: /root/.cache/pip/wheels/bb/4f/ec/8970b83323e16aa95034da175454843947376614d6d5e9627f
Successfully built pyswarm
Installing collected packages: pyswarm
Successfully installed pyswarm-0.6


In [None]:
from pyswarm import pso
from sklearn.metrics import mean_squared_error
import numpy as np

# List of all features
feature_names = [f for f in X_train.columns if f not in ['strike_rate', 'economy_rate', 'economy']]
num_features = len(feature_names)

# Objective function for PSO: mask selects features
def objective(mask):
    # Convert float mask to binary (0 or 1)
    mask_bin = np.round(mask).astype(int)
    if np.sum(mask_bin) == 0:
        return 1e6  # Penalize empty feature subset

    # Select features based on mask
    selected_features = [f for i, f in enumerate(feature_names) if mask_bin[i]==1]

    # Train simple LightGBM on selected features
    model = LGBMRegressor(
        colsample_bytree=0.8, learning_rate=0.01, max_depth=10,
        n_estimators=300, num_leaves=50, subsample=0.7, random_state=42
    )
    model.fit(X_train[selected_features], y_train)
    y_pred = model.predict(X_test[selected_features])

    # Return RMSE as fitness score
    return np.sqrt(mean_squared_error(y_test, y_pred))

# Lower and upper bounds: each feature can be 0 or 1
lb = [0] * num_features
ub = [1] * num_features

# Run PSO
best_mask, best_score = pso(objective, lb, ub, swarmsize=20, maxiter=10)

# Convert to binary and get selected features
best_mask_bin = np.round(best_mask).astype(int)
best_features = [f for i, f in enumerate(feature_names) if best_mask_bin[i]==1]

print("✅ PSO selected features:", best_features)
print(f"✅ PSO best RMSE: {best_score:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6638
[LightGBM] [Info] Number of data points in the train set: 37942, number of used features: 30
[LightGBM] [Info] Start training from score 44.503532
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6694
[LightGBM] [Info] Number of data points in the train set: 37942, number of used features: 37
[LightGBM] [Info] Start training from score 44.503532
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [None]:
best_lgbm = LGBMRegressor(
    colsample_bytree=0.8, learning_rate=0.01, max_depth=10,
    n_estimators=1000, num_leaves=50, subsample=0.7, random_state=42
)
best_lgbm.fit(X_train[best_features], y_train)

y_pred = best_lgbm.predict(X_test[best_features])

# Metrics
print("✅ Final LightGBM with PSO-selected features:")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"R²: {r2_score(y_test, y_pred):.4f}")


NameError: name 'best_features' is not defined

In [None]:
# 1️⃣ Get the final list of feature column names used in LightGBM training
final_features = X_train.columns.tolist()

# 2️⃣ Create a new DataFrame with ONLY those features from the full dataset
features_df = df[final_features].copy()


In [None]:
# ✅ CELL 7 — FINAL MODEL TRAIN & METRICS

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("Training final LightGBM with Round 4 features...")

best_lgbm = LGBMRegressor(
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=1000,
    num_leaves=50,
    subsample=0.7,
    random_state=42
)

best_lgbm.fit(X_train, y_train)

y_pred = best_lgbm.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\n✅ FINAL LightGBM Performance:")
print(f"  MAE : {mae:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  R²  : {r2:.4f}")


In [None]:
# 📌 CELL 9: SHAP EXPLAIN

import shap

print("Running SHAP explain...")

# ✅ Make sure your SHAP version matches LightGBM
explainer = shap.Explainer(best_lgbm)
shap_values = explainer(X_test)

# ✅ SHAP summary plot
shap.summary_plot(shap_values, X_test, max_display=15)


In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("🏷️ Running PRUNE test: with vs. without player_id ...")

# 1️⃣ Pick your top SHAP features — for demo, let's say top 15
top_features = [
    'strike_rate', 'economy_rate', 'economy_rate_diff_5',
    'opp_runs_conceded_5', 'strike_rate_momentum', 'trend_slope_5',
    'days_since_last', 'catches_taken_10', 'career_avg_strike_rate',
    'catches_taken_5', 'strike_rate_diff_5', 'runs_scored_5_vs_10',
    'player_id', 'career_avg_runs', 'strike_rate_10'
]

# 2️⃣ Split WITH player_id
X_train_with_id = X_train[top_features].copy()
X_test_with_id = X_test[top_features].copy()

# 3️⃣ Split WITHOUT player_id
top_features_no_id = [f for f in top_features if f != 'player_id']
X_train_no_id = X_train[top_features_no_id].copy()
X_test_no_id = X_test[top_features_no_id].copy()

# 4️⃣ Train LightGBM WITH player_id
model_with_id = LGBMRegressor(
    colsample_bytree=0.8, learning_rate=0.01, max_depth=10,
    n_estimators=1000, num_leaves=50, subsample=0.7, random_state=42
)
model_with_id.fit(X_train_with_id, y_train)
y_pred_with_id = model_with_id.predict(X_test_with_id)

# 5️⃣ Train LightGBM WITHOUT player_id
model_no_id = LGBMRegressor(
    colsample_bytree=0.8, learning_rate=0.01, max_depth=10,
    n_estimators=1000, num_leaves=50, subsample=0.7, random_state=42
)
model_no_id.fit(X_train_no_id, y_train)
y_pred_no_id = model_no_id.predict(X_test_no_id)

# 6️⃣ Compare
print("\n✅ PRUNED LightGBM WITH player_id:")
print(f"  MAE : {mean_absolute_error(y_test, y_pred_with_id):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_with_id)):.4f}")
print(f"  R²  : {r2_score(y_test, y_pred_with_id):.4f}")

print("\n✅ PRUNED LightGBM WITHOUT player_id:")
print(f"  MAE : {mean_absolute_error(y_test, y_pred_no_id):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_no_id)):.4f}")
print(f"  R²  : {r2_score(y_test, y_pred_no_id):.4f}")


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("⚡️ Running STACK vs BEST single LightGBM...")

# ✅ 1️⃣ Split a holdout from X_train to train meta model on out-of-folds
X_base_train, X_meta, y_base_train, y_meta = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# ✅ 2️⃣ First, train base LightGBM on base split
base_lgbm = LGBMRegressor(
    colsample_bytree=0.8, learning_rate=0.01, max_depth=10,
    n_estimators=1000, num_leaves=50, subsample=0.7, random_state=42
)
base_lgbm.fit(X_base_train, y_base_train)

# ✅ 3️⃣ Get base preds for meta training + test
meta_train_preds = base_lgbm.predict(X_meta)
meta_test_preds = base_lgbm.predict(X_test)

# ✅ 4️⃣ Meta model: simple ridge to blend base preds & stats
# You can add more meta features if you want — here we keep it clean
meta_X_train = np.vstack([meta_train_preds]).T
meta_X_test = np.vstack([meta_test_preds]).T

meta_model = Ridge(alpha=1.0)
meta_model.fit(meta_X_train, y_meta)

# ✅ Final stack prediction
stack_preds = meta_model.predict(meta_X_test)

# ✅ Also fit a single best LightGBM on full X_train for direct comparison
best_lgbm_direct = LGBMRegressor(
    colsample_bytree=0.8, learning_rate=0.01, max_depth=10,
    n_estimators=1000, num_leaves=50, subsample=0.7, random_state=42
)
best_lgbm_direct.fit(X_train, y_train)
best_lgbm_preds = best_lgbm_direct.predict(X_test)

# ✅ Metrics side by side
print("\n✅ STACKED LightGBM + Ridge meta:")
print(f"  MAE : {mean_absolute_error(y_test, stack_preds):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test, stack_preds)):.4f}")
print(f"  R²  : {r2_score(y_test, stack_preds):.4f}")

print("\n✅ SINGLE LightGBM (all features, full data):")
print(f"  MAE : {mean_absolute_error(y_test, best_lgbm_preds):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test, best_lgbm_preds)):.4f}")
print(f"  R²  : {r2_score(y_test, best_lgbm_preds):.4f}")


In [None]:
import pandas as pd

def predict_22_players_using_all_history(player_ids, full_df, model, X_train_cols, agg='mean'):
    """
    Args:
        player_ids (list[int]): The 22 player IDs.
        full_df (pd.DataFrame): The full historical dataset with engineered features.
        model: The trained LightGBM model.
        X_train_cols (list): Training feature columns.
        agg (str): How to aggregate ('mean', 'sum', 'latest').

    Returns:
        pd.DataFrame: player_id and final predicted score, sorted high to low.
    """

    # ✅ 1. Filter only rows for given player IDs
    players_df = full_df[full_df['player_id'].isin(player_ids)].copy()
    if players_df.empty:
        raise ValueError("❌ None of the given player_ids found in the dataframe!")

    # ✅ 2. Fill any missing feature columns
    for col in X_train_cols:
        if col not in players_df.columns:
            players_df[col] = 0

    X = players_df[X_train_cols]

    # ✅ 3. Predict for every row
    preds = model.predict(X)
    players_df['predicted'] = preds

    # ✅ 4. Aggregate per player
    if agg == 'mean':
        final_scores = players_df.groupby('player_id')['predicted'].mean().reset_index()
    elif agg == 'sum':
        final_scores = players_df.groupby('player_id')['predicted'].sum().reset_index()
    elif agg == 'latest':
        players_df = players_df.sort_values('start_date')
        final_scores = players_df.groupby('player_id').tail(1)[['player_id', 'predicted']]
    else:
        raise ValueError(f"❌ Unknown aggregation method: {agg}")

    # ✅ 5. Sort high to low
    final_scores = final_scores.sort_values('predicted', ascending=False).reset_index(drop=True)

    return final_scores


In [None]:
result = predict_22_players_using_all_history(
    player_ids=[1,2,3,4,5,6,7,8,9,10, 11,12,13,14,15,16,17,18,19,20,21,22],
    full_df=df,
    model=best_lgbm,
    X_train_cols=X_train.columns.tolist(),
    agg='mean'  # or 'sum' or 'latest'
)

print(result)


In [None]:
import pandas as pd
import numpy as np

# ✅ 1️⃣ Top-11 MAE calculation function
def top11_mae(y_true, y_pred, player_names):
    df = pd.DataFrame({
        'actual': y_true,
        'predicted': y_pred,
        'name': player_names
    })

    top11_actual = df.sort_values('actual', ascending=False).head(11)
    top11_pred = df.sort_values('predicted', ascending=False).head(11)

    merged = pd.merge(
        top11_actual[['name', 'actual']],
        top11_pred[['name', 'predicted']],
        how='outer',
        on='name'
    ).fillna(0)

    merged['abs_diff'] = np.abs(merged['actual'] - merged['predicted'])

    mae_sum = merged['abs_diff'].sum()

    print("\n✅ Top-11 Comparison Table:")
    print(merged)

    return mae_sum

# ✅ 2️⃣ Inference pipeline function
def predict_top11_for_match(match_id, cutoff_date, model, full_df):
    match_players = full_df[full_df['match_id'] == match_id].copy()
    print(f"🔍 Found {len(match_players)} rows for match_id {match_id}")

    match_players = match_players[match_players['start_date'] < pd.to_datetime(cutoff_date)]
    print(f"🔍 After cutoff: {len(match_players)} rows remain")

    if match_players.empty:
        raise ValueError(f"❌ No valid players for match {match_id} before {cutoff_date}")

    X_cols = X_train.columns.tolist()

    # Fill missing
    missing_cols = set(X_cols) - set(match_players.columns)
    print(f"🧩 Missing cols for this match: {missing_cols}")

    for col in missing_cols:
        match_players[col] = 0

    X_match = match_players[X_cols].copy()

    y_pred = model.predict(X_match)
    y_true = match_players['fantasy_score_total'].values
    player_names = match_players['player_id'].values

    result = pd.DataFrame({
        'player_id': player_names,
        'predicted': y_pred,
        'actual': y_true
    }).sort_values('predicted', ascending=False).head(11)

    print("\n✅ Top-11 Predicted Players:")
    print(result)

    return result


In [None]:
import pandas as pd
import numpy as np

K = 11

def evaluate_match(df, k=K):
    # ✅ 1. Actual total for best possible team
    actual_top_k = df['fantasy_score_total'].nlargest(k).values
    actual_top_k_sum = actual_top_k.sum()

    # ✅ 2. Indices of predicted top k by model
    idx_pred_top_k = df['predicted_fantasy_score_total_exp'].nlargest(k).index

    # ✅ 3. Actual scores of the predicted top k players
    model1_actual_sum = df.loc[idx_pred_top_k, 'fantasy_score_total'].sum()

    # ✅ 4. Model overlap % (same as before)
    overlap_pct = model1_actual_sum / actual_top_k_sum if actual_top_k_sum != 0 else 0

    # ✅ 5. Predicted scores of predicted top k players
    predicted_top_k = df.loc[idx_pred_top_k, 'predicted_fantasy_score_total_exp'].values

    # ✅ 6. MAPE: mean absolute % difference between rank-wise top k
    abs_diff = abs(actual_top_k - predicted_top_k).sum()
    mape = abs_diff / actual_top_k_sum if actual_top_k_sum != 0 else 0

    return pd.Series({
        'actual_top_11_sum': actual_top_k_sum,
        'model1_actual_sum': model1_actual_sum,
        'model1_%_of_optimal': overlap_pct,
        'abs_diff': abs_diff,
        'mape': mape,  # This is the per-match MAPE, but not what you want finally
    })

# Add predictions
test_df_with_preds = test_df.copy()
test_df_with_preds['predicted_fantasy_score_total_exp'] = best_lgbm.predict(X_test)

# Apply per match
evaluation_df = (
    test_df_with_preds
    .groupby('match_id', group_keys=False)
    .apply(evaluate_match)
    .reset_index()
)

# ✅ Compute final overall MAPE: total abs diff over total actual sum
total_abs_diff = evaluation_df['abs_diff'].sum()
total_actual_top_11_sum = evaluation_df['actual_top_11_sum'].sum()
overall_mape = 100*total_abs_diff / total_actual_top_11_sum if total_actual_top_11_sum != 0 else 0

print(evaluation_df.head())
print(f"✅ Overall MAPE for the entire model: {overall_mape:.4f}")


ValueError: Number of features of the model must match the input. Model n_features_ is 34 and input n_features is 72

In [None]:
!pip install catboost

In [None]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

base_learners = [
    ('linear', LinearRegression()),
    ('bayes_ridge', BayesianRidge()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('extratrees', ExtraTreesRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, random_state=42)),
    ('lgbm', LGBMRegressor(n_estimators=100, random_state=42)),
    ('catboost', CatBoostRegressor(verbose=0, random_state=42)),
    ('svm', SVR()),
    ('knn', KNeighborsRegressor()),
    ('mlp', MLPRegressor(max_iter=500, random_state=42))
]


In [None]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)

model_scores = []

for name, model in base_learners:
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    mean_rmse = np.mean(np.sqrt(-scores))
    model_scores.append((name, mean_rmse))
    print(f"{name}: Mean CV RMSE = {mean_rmse:.4f}")

# Sort by lowest RMSE
model_scores.sort(key=lambda x: x[1])
selected_models = [name for name, score in model_scores[:6]]
print("\nSelected top 6 models:", selected_models)


In [None]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge

selected_learners = [
    ('lgbm', LGBMRegressor(n_estimators=100, random_state=42)),
    ('catboost', CatBoostRegressor(verbose=0, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('extratrees', ExtraTreesRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, random_state=42)),
    ('bayes_ridge', BayesianRidge())
]


In [None]:
from sklearn.ensemble import StackingRegressor

# Filter the selected base learners
#selected_learners = [learner for learner in base_learners if learner[0] in selected_models]

# Meta model: LightGBM
meta_model = LGBMRegressor(n_estimators=100, random_state=42)

# Stacking Regressor
stacked_model = StackingRegressor(
    estimators=selected_learners,
    final_estimator=meta_model,
    cv=5,
    passthrough=False,  # True passes original features too, try both later!
    n_jobs=-1
)

# Fit
stacked_model.fit(X_train, y_train)

# Predict on test set
y_pred_stacked = stacked_model.predict(X_test)


In [None]:
import numpy as np

# Custom metric again
top_11_pred_idx = np.argsort(y_pred_stacked)[-11:]
top_11_actual_idx = np.argsort(y_test)[-11:]

actual_scores_of_pred_top11 = y_test.iloc[top_11_pred_idx].sum()
actual_scores_of_actual_top11 = y_test.iloc[top_11_actual_idx].sum()

diff_ratio = (actual_scores_of_pred_top11 - actual_scores_of_actual_top11) / actual_scores_of_actual_top11

print(f"Stacked Ensemble Top-11 Diff Ratio: {diff_ratio:.4f}")


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(y_test, bins=20, alpha=0.7, label='Actual')
plt.title('Actual Scores')
plt.xlabel('Score')

plt.subplot(1, 2, 2)
plt.hist(y_pred_stacked, bins=20, alpha=0.7, label='Predicted', color='orange')
plt.title('Stacked Model Predicted Scores')
plt.xlabel('Score')

plt.tight_layout()
plt.show()


In [None]:
results = []

for name, model in selected_learners:
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Custom metric
    top_11_pred_idx = np.argsort(y_pred)[-11:]
    top_11_actual_idx = np.argsort(y_test)[-11:]

    actual_scores_pred_top11 = y_test.iloc[top_11_pred_idx].sum()
    actual_scores_actual_top11 = y_test.iloc[top_11_actual_idx].sum()

    diff_ratio = (actual_scores_pred_top11 - actual_scores_actual_top11) / actual_scores_actual_top11

    results.append((name, diff_ratio))
    print(f"{name} - Top-11 Diff Ratio: {diff_ratio:.4f}")

# Sort results best to worst
results.sort(key=lambda x: abs(x[1]))
print("\n=== Single Base Model Results ===")
for name, score in results:
    print(f"{name}: Diff Ratio = {score:.4f}")


In [None]:
import numpy as np

# --- Assume y_test and y_pred are defined ---
# For best LGBM model
y_pred_lgbm = best_lgbm.predict(X_test)

K = 11

# ✅ 1. Get actual top K scores (true top 11)
actual_top_k = y_test.nlargest(K).values
actual_top_k_sum = actual_top_k.sum()

# ✅ 2. Get indices of predicted top K
idx_pred_top_k = np.argsort(y_pred_lgbm)[-K:]

# ✅ 3. Get actual scores of the predicted top K
predicted_top_k_actuals = y_test.iloc[idx_pred_top_k].values
predicted_top_k_actuals_sum = predicted_top_k_actuals.sum()

# ✅ 4. Diff Ratio
diff_ratio = (predicted_top_k_actuals_sum - actual_top_k_sum) / actual_top_k_sum

# ✅ 5. Top-11 MAE (Sum-based)
# Just the total absolute difference divided by 11
mae_sum_based = abs(predicted_top_k_actuals_sum - actual_top_k_sum) / K

# ✅ 6. Top-11 MAE (Rank-wise)
# Sort both for rank-wise alignment
actual_top_k_sorted = np.sort(actual_top_k)[::-1]
predicted_top_k_actuals_sorted = np.sort(predicted_top_k_actuals)[::-1]

rankwise_abs_diff = np.abs(actual_top_k_sorted - predicted_top_k_actuals_sorted)
mae_rankwise = rankwise_abs_diff.mean()

# ✅ Print nicely
print("🔹 LGBM Best Model Metrics 🔹")
print(f"Top-11 Diff Ratio: {diff_ratio:.4f}")
print(f"Top-11 MAE (Sum-based): {mae_sum_based:.4f}")
print(f"Top-11 MAE (Rank-wise): {mae_rankwise:.4f}")
print("Actual top 11 sorted:", actual_top_k_sorted)
print("Predicted top 11 actuals sorted:", predicted_top_k_actuals_sorted)



In [None]:
# Scale predictions
y_pred_scaled = 0.5 * y_pred_lgbm

# Get indices of new predicted top 11
idx_pred_top_k_scaled = np.argsort(y_pred_scaled)[-K:]
predicted_top_k_actuals_scaled = y_test.iloc[idx_pred_top_k_scaled].values

# Same metrics
predicted_top_k_actuals_sum_scaled = predicted_top_k_actuals_scaled.sum()
diff_ratio_scaled = (predicted_top_k_actuals_sum_scaled - actual_top_k_sum) / actual_top_k_sum
mae_sum_based_scaled = abs(predicted_top_k_actuals_sum_scaled - actual_top_k_sum) / K

actual_top_k_sorted = np.sort(actual_top_k)[::-1]
predicted_top_k_actuals_sorted_scaled = np.sort(predicted_top_k_actuals_scaled)[::-1]
rankwise_abs_diff_scaled = np.abs(actual_top_k_sorted - predicted_top_k_actuals_sorted_scaled)
mae_rankwise_scaled = rankwise_abs_diff_scaled.mean()

print("🔹 Scaled Predictions 🔹")
print(f"Top-11 Diff Ratio: {diff_ratio_scaled:.4f}")
print(f"Top-11 MAE (Sum-based): {mae_sum_based_scaled:.4f}")
print(f"Top-11 MAE (Rank-wise): {mae_rankwise_scaled:.4f}")


In [None]:
import gradio as gr
import pandas as pd

# Use your real helper!
def select_best_team(player_ids_text):
    try:
        # 1️⃣ Parse input
        player_ids = [int(pid.strip()) for pid in player_ids_text.split(',')]
        if len(player_ids) != 22:
            return f"❌ Please enter exactly 22 player IDs, you gave {len(player_ids)}"

        # 2️⃣ Call your real function
        final_scores = predict_22_players_using_all_history(
            player_ids=player_ids,
            full_df=df,
            model=best_lgbm,
            X_train_cols=X_train.columns.tolist(),
            agg='mean'  # Or 'latest' or 'sum'
        )

        # 3️⃣ Pick top 11
        selected_team = final_scores.head(11)

        # 4️⃣ Format output
        result = "✅ **Selected Best 11:**\n\n"
        for idx, row in selected_team.iterrows():
            result += f"• Player ID: {row['player_id']} → Predicted Score: {row['predicted']:.1f}\n"

        return result

    except Exception as e:
        return f"❌ Error: {e}"

# ✅ Gradio Interface
demo = gr.Interface(
    fn=select_best_team,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Example: 101, 102, 103, ..., 122",
        label="Enter 22 Player IDs (comma-separated)"
    ),
    outputs=gr.Markdown(),
    title="🏏 Fantasy Team Selector",
    description=(
        "Enter **22 player IDs** for your match squad. "
        "This app predicts scores using your trained model "
        "and selects the best possible **11-player team**."
    ),
    examples=[
        ["1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22"]
    ]
)

demo.launch()


In [None]:
class Dream11Env:
    def __init__(self, player_pool, budget_limit=100):
        self.player_pool = player_pool
        self.budget_limit = budget_limit
        self.reset()

    def reset(self):
        self.selected_players = []
        self.done = False
        return self._get_state()

    def _get_state(self):
        mask = self.player_pool['player_id'].isin(self.selected_players).astype(int)

        # ✅ Keep only important features to reduce state size
        important_columns = ['fantasy_score_total', 'avg_points', 'recent_form']
        state = self.player_pool[important_columns].copy()
        state['selected'] = mask

        state = state.apply(pd.to_numeric, errors='coerce').fillna(0)
        return state.values.flatten().astype(np.float32)

    def step(self, action):
        if self.done or action in self.selected_players:
            return self._get_state(), -100.0, self.done, {}  # Invalid: duplicate or finished

        self.selected_players.append(action)

        if len(self.selected_players) == 11:
            self.done = True

            team_df = self.player_pool[self.player_pool['player_id'].isin(self.selected_players)]
            total_points = team_df['fantasy_score_total'].sum()

            if len(set(team_df['team'])) > 7:
                return self._get_state(), -500.0, self.done, {}  # Invalid: too many teams

            return self._get_state(), total_points, self.done, {}

        return self._get_state(), 0.0, self.done, {}


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim)
        )

    def forward(self, x):
        return self.net(x)

class DQNAgent:
    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99):
        self.model = DQN(state_dim, action_dim)
        self.target = DQN(state_dim, action_dim)
        self.target.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.memory = deque(maxlen=1000)  # ✅ Smaller buffer
        self.gamma = gamma
        self.action_dim = action_dim

    def act(self, state, epsilon):
        if random.random() < epsilon:
            return random.randint(0, self.action_dim - 1)
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state)
        return q_values.argmax().item()

    def push(self, transition):
        self.memory.append(transition)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def train_step(self, batch_size):
        if len(self.memory) < batch_size:
            return

        batch = self.sample(batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.FloatTensor(np.array(states))
        actions = torch.LongTensor(np.array(actions)).unsqueeze(1)
        rewards = torch.FloatTensor(np.array(rewards)).unsqueeze(1)
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(np.array(dones)).unsqueeze(1)

        q_values = self.model(states).gather(1, actions)
        with torch.no_grad():
            q_next = self.target(next_states).max(1)[0].unsqueeze(1)
            target_q = rewards + self.gamma * q_next * (1 - dones)

        loss = nn.MSELoss()(q_values, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ✅ Free memory
        del states, actions, rewards, next_states, dones, q_values, q_next, target_q, loss
        torch.cuda.empty_cache()

    def update_target(self):
        self.target.load_state_dict(self.model.state_dict())


In [None]:
import time

env = Dream11Env(player_pool=df, budget_limit=100)
state_dim = env._get_state().shape[0]
action_dim = len(df)

agent = DQNAgent(state_dim, action_dim)

num_episodes = 500
batch_size = 64
reward_history = []

for episode in range(num_episodes):
    start_time = time.time()

    state = env.reset()
    total_reward = 0

    while True:
        action = agent.act(state, epsilon=0.1)
        next_state, reward, done, _ = env.step(action)

        agent.push((state, action, reward, next_state, done))
        agent.train_step(batch_size)

        state = next_state
        total_reward += reward

        if done:
            break

    reward_history.append(total_reward)

    if episode % 20 == 0:
        agent.update_target()
        print(f"Episode {episode}: Total Reward = {total_reward:.2f} "
              f"(Time: {time.time() - start_time:.2f} sec)")


KeyError: "['avg_points', 'recent_form'] not in index"

Episode 0: Total Reward = 82442.00
