In [1]:
import pandas as pd
import numpy as np
import pickle

print("üîÑ Loading IPL datasets...")

deliveries = pd.read_csv("/Users/arpit/Desktop/IPL/deliveries.csv")
matches = pd.read_csv("/Users/arpit/Desktop/IPL/matches.csv")

print(f"Shape of deliveries (rows, cols): {deliveries.shape}")
print(f"Shape of matches (rows, cols): {matches.shape}")


üîÑ Loading IPL datasets...
Shape of deliveries (rows, cols): (278205, 20)
Shape of matches (rows, cols): (1169, 28)


In [2]:
# Function to auto-detect column name variations
def detect(df, candidates, name):
    for c in candidates:
        if c in df.columns:
            print(f"‚úî Using column '{c}' for {name}")
            return c

# Detect match_id
match_id_col = detect(matches, ["id", "match_id", "matchId", "matchid"], "match_id")

# Detect season column
season_col = detect(matches, ["season", "Season", "year"], "season")

# Detect team1 / team2
team1_col = detect(matches, ["team1", "Team1", "batting_team"], "team1")
team2_col = detect(matches, ["team2", "Team2", "bowling_team"], "team2")

# Detect winner
winner_col = detect(matches, ["winner", "Winner", "match_winner"], "winner")

# Detect venue
venue_col = detect(matches, ["venue", "Venue", "city"], "venue")


‚úî Using column 'matchId' for match_id
‚úî Using column 'season' for season
‚úî Using column 'team1' for team1
‚úî Using column 'team2' for team2
‚úî Using column 'winner' for winner
‚úî Using column 'venue' for venue


In [3]:
# ============================================================
# Basically here we are detecting match ID columns in both datasets
# ============================================================

# Detect match ID column in deliveries.csv
deliveries_match_id = None
for c in ["match_id", "matchId", "id", "matchid", "matchID", "Match_Id"]:
    if c in deliveries.columns:
        deliveries_match_id = c
        print(f"‚úî deliveries match_id column = {c}")
        break

if deliveries_match_id is None:
    raise KeyError("‚ùå Could not find match_id column in deliveries.csv")


# Detect match ID column in matches.csv
matches_match_id = None
for c in ["id", "match_id", "matchId", "matchid", "mid"]:
    if c in matches.columns:
        matches_match_id = c
        print(f"‚úî matches match_id column = {c}")
        break

if matches_match_id is None:
    raise KeyError("‚ùå Could not find match_id column in matches.csv")


‚úî deliveries match_id column = matchId
‚úî matches match_id column = matchId


In [4]:
# Here, we create a smaller version of matches.csv with only the relevant columns
# to be used for training the win predictor model.
cols_to_use = [
    match_id_col,  # Detected in Cell 2
    season_col,    # Detected in Cell 2
    team1_col,     # Detected in Cell 2
    team2_col,     # Detected in Cell 2
    winner_col,    # Detected in Cell 2
    venue_col      # Detected in Cell 2
]
if 'method' in matches.columns:
    cols_to_use.append('method')
    print("‚úî 'method' column found and will be included.")
else:
    print("‚ùå 'method' column not found in matches.csv.")

matches_small = matches[cols_to_use].copy()
matches_small.rename(columns={match_id_col: 'match_id'}, inplace=True)

print(f"‚úÖ 'matches_small' DataFrame created with columns: {matches_small.columns.tolist()}")
matches_small.head()

‚úî 'method' column found and will be included.
‚úÖ 'matches_small' DataFrame created with columns: ['match_id', 'season', 'team1', 'team2', 'winner', 'venue', 'method']


Unnamed: 0,match_id,season,team1,team2,winner,venue,method
0,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,
1,335983,2007/08,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,"Punjab Cricket Association Stadium, Mohali",
2,335984,2007/08,Delhi Daredevils,Rajasthan Royals,Delhi Daredevils,Feroz Shah Kotla,
3,335986,2007/08,Kolkata Knight Riders,Deccan Chargers,Kolkata Knight Riders,Eden Gardens,
4,335985,2007/08,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,Wankhede Stadium,


In [5]:
df = deliveries.merge(matches_small, left_on=deliveries_match_id, right_on="match_id", how="left")
print("üîó Merge successful!")
df.head()


üîó Merge successful!


Unnamed: 0,matchId,inning,over_ball,over,ball,batting_team,bowling_team,batsman,non_striker,bowler,...,dismissal_kind,player_dismissed,date,match_id,season,team1,team2,winner,venue,method
0,335982,1,0.1,0,1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,...,,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,
1,335982,1,0.2,0,2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,
2,335982,1,0.3,0,3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,
3,335982,1,0.4,0,4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,
4,335982,1,0.5,0,5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,


In [6]:

# CELL 5 ‚Äî COMPUTE TARGET SCORE FOR EACH MATCH
possible_total_cols = [
    "total_runs", 
    "total", 
    "runs", 
    "runs_total", 
    "ball_runs", 
    "batsman_runs"   
]

total_runs_col = None
for col in possible_total_cols:
    if col in df.columns:
        total_runs_col = col
        print(f"‚úî Using '{col}' as total runs column")
        break


# Compute total runs per inning
inning_totals = (
    df.groupby(["match_id", "inning"])[total_runs_col]
      .sum()
      .reset_index()
      .rename(columns={total_runs_col: "inning_runs"})
)

# First innings = target for second innings
target_df = inning_totals[inning_totals["inning"] == 1][["match_id", "inning_runs"]]
target_df.columns = ["match_id", "target"]

# Merge into main df
df = df.merge(target_df, on="match_id", how="left")

print("üéØ Target scores merged successfully!")
df.head()


‚úî Using 'batsman_runs' as total runs column
üéØ Target scores merged successfully!


Unnamed: 0,matchId,inning,over_ball,over,ball,batting_team,bowling_team,batsman,non_striker,bowler,...,player_dismissed,date,match_id,season,team1,team2,winner,venue,method,target
0,335982,1,0.1,0,1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
1,335982,1,0.2,0,2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
2,335982,1,0.3,0,3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
3,335982,1,0.4,0,4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
4,335982,1,0.5,0,5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205


In [7]:

def detect_col(df, candidates, label):
  
    for c in candidates:
        if c in df.columns:
            print(f"‚úî Using '{c}' for {label}")
            return c
        

        
# Auto detect OVER column
over_col = detect_col(
    df,
    ["over", "overs", "ball_over", "Over"],
    "over"
)

# Auto detect BALL column
ball_col = detect_col(
    df,
    ["ball", "delivery", "balls", "Ball"],
    "ball"
)

# Auto detect WICKET column
wicket_col = detect_col(
    df,
    ["player_dismissed", "dismissed_player", "isWicketDelivery", "wicket", "wickets"],
    "wicket/dismissal"
)

print("\n‚úî FINAL detected columns:")
print("‚Ä¢ over       =", over_col)
print("‚Ä¢ ball       =", ball_col)
print("‚Ä¢ wicket     =", wicket_col)

# Filter second innings
df2 = df[df["inning"] == 2].copy()

print("\nüìå Using only SECOND INNINGS data for win probability model")
df2.head()


‚úî Using 'over' for over
‚úî Using 'ball' for ball
‚úî Using 'player_dismissed' for wicket/dismissal

‚úî FINAL detected columns:
‚Ä¢ over       = over
‚Ä¢ ball       = ball
‚Ä¢ wicket     = player_dismissed

üìå Using only SECOND INNINGS data for win probability model


Unnamed: 0,matchId,inning,over_ball,over,ball,batting_team,bowling_team,batsman,non_striker,bowler,...,player_dismissed,date,match_id,season,team1,team2,winner,venue,method,target
124,335982,2,0.1,0,1,Royal Challengers Bangalore,Kolkata Knight Riders,R Dravid,W Jaffer,AB Dinda,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
125,335982,2,0.2,0,2,Royal Challengers Bangalore,Kolkata Knight Riders,W Jaffer,R Dravid,AB Dinda,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
126,335982,2,0.3,0,3,Royal Challengers Bangalore,Kolkata Knight Riders,W Jaffer,R Dravid,AB Dinda,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
127,335982,2,0.4,0,4,Royal Challengers Bangalore,Kolkata Knight Riders,W Jaffer,R Dravid,AB Dinda,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205
128,335982,2,0.5,0,5,Royal Challengers Bangalore,Kolkata Knight Riders,R Dravid,W Jaffer,AB Dinda,...,,2008-04-18,335982,2007/08,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders,M Chinnaswamy Stadium,,205


In [8]:
df2.columns.tolist()


['matchId',
 'inning',
 'over_ball',
 'over',
 'ball',
 'batting_team',
 'bowling_team',
 'batsman',
 'non_striker',
 'bowler',
 'batsman_runs',
 'extras',
 'isWide',
 'isNoBall',
 'Byes',
 'LegByes',
 'Penalty',
 'dismissal_kind',
 'player_dismissed',
 'date',
 'match_id',
 'season',
 'team1',
 'team2',
 'winner',
 'venue',
 'method',
 'target']

In [9]:

# CELL 7 ‚Äî FEATURE ENGINEERING 

# Use only 2nd innings for win probability model
df2 = df[df["inning"] == 2].copy()

# Total runs scored in 1st innings
first_innings_total = (
    df[df["inning"] == 1]
    .groupby("match_id")["batsman_runs"].sum()
    + df[df["inning"] == 1]
    .groupby("match_id")["extras"].sum()
)

# Map target into df2
df2["target"] = df2["match_id"].map(first_innings_total)


# 2Ô∏è‚É£ TOTAL RUNS FOR EACH BALL
df2["total_runs"] = df2["batsman_runs"] + df2["extras"]

# Runs accumulated before this ball
df2["runs_before"] = df2.groupby("match_id")["total_runs"].transform("cumsum")


# 3Ô∏è‚É£ WICKETS BEFORE CURRENT BALL
df2["wickets_before"] = (
    df2["player_dismissed"].notna()
    .groupby(df2["match_id"])
    .transform("cumsum")
)

# 4Ô∏è‚É£ BALL COUNT
df2["ball_number"] = (df2["over"] - 1) * 6 + df2["ball"]

# 5Ô∏è‚É£ CHASE FEATURES
df2["runs_left"] = df2["target"] - df2["runs_before"]
df2["balls_left"] = 120 - df2["ball_number"]

df2["run_rate"] = df2["runs_before"] / (df2["ball_number"] / 6 + 1e-9)
df2["rr_required"] = df2["runs_left"] / (df2["balls_left"] / 6 + 1e-9)


# 6Ô∏è‚É£ MATCH PHASES
df2["is_powerplay"] = (df2["over"] <= 6).astype(int)
df2["is_middle"]    = ((df2["over"] >= 7) & (df2["over"] <= 15)).astype(int)
df2["is_death"]     = (df2["over"] >= 16).astype(int)

# Wickets in hand
df2["wickets_in_hand"] = 10 - df2["wickets_before"]
df2.head()


Unnamed: 0,matchId,inning,over_ball,over,ball,batting_team,bowling_team,batsman,non_striker,bowler,...,wickets_before,ball_number,runs_left,balls_left,run_rate,rr_required,is_powerplay,is_middle,is_death,wickets_in_hand
124,335982,2,0.1,0,1,Royal Challengers Bangalore,Kolkata Knight Riders,R Dravid,W Jaffer,AB Dinda,...,0,-5,221,125,-1.2,10.608,1,0,0,10
125,335982,2,0.2,0,2,Royal Challengers Bangalore,Kolkata Knight Riders,W Jaffer,R Dravid,AB Dinda,...,0,-4,220,124,-3.0,10.645161,1,0,0,10
126,335982,2,0.3,0,3,Royal Challengers Bangalore,Kolkata Knight Riders,W Jaffer,R Dravid,AB Dinda,...,0,-3,220,123,-4.0,10.731707,1,0,0,10
127,335982,2,0.4,0,4,Royal Challengers Bangalore,Kolkata Knight Riders,W Jaffer,R Dravid,AB Dinda,...,0,-2,219,122,-9.0,10.770492,1,0,0,10
128,335982,2,0.5,0,5,Royal Challengers Bangalore,Kolkata Knight Riders,R Dravid,W Jaffer,AB Dinda,...,0,-1,218,121,-24.0,10.809917,1,0,0,10


In [10]:
# WIN/LOSS LABEL

# Team chasing wins if their batting_team matches match winner
df2["won"] = (df2["winner"] == df2["batting_team"]).astype(int)

# Runs_left must NOT be negative
df2 = df2[df2["runs_left"] >= 0]

# Balls_left must NOT be negative
df2 = df2[df2["balls_left"] >= 0]

# Remove any rows with missing essential fields
df2 = df2.dropna(subset=[
    "runs_before",
    "runs_left",
    "balls_left",
    "won"
])

print("‚úÖ Win/Loss labels created successfully!")
print("Final training dataset shape:", df2.shape)


df2[[
    "match_id", 
    "over", 
    "ball", 
    "runs_before", 
    "runs_left", 
    "balls_left", 
    "won"
]].head()


‚úÖ Win/Loss labels created successfully!
Final training dataset shape: (133281, 41)


Unnamed: 0,match_id,over,ball,runs_before,runs_left,balls_left,won
124,335982,0,1,1,221,125,0
125,335982,0,2,2,220,124,0
126,335982,0,3,2,220,123,0
127,335982,0,4,3,219,122,0
128,335982,0,5,4,218,121,0


In [11]:
#ADVANCED FEATURE ENGINEERING

print("üîÑ Engineering 'momentum' features...")

df2['is_wicket'] = df2['player_dismissed'].notna().astype(int)


grouped = df2.groupby('match_id')
window_size = 12

df2['runs_last_12_balls'] = grouped['total_runs'].rolling(window=window_size, min_periods=1).sum().shift(1).reset_index(0, drop=True)
df2['wickets_last_12_balls'] = grouped['is_wicket'].rolling(window=window_size, min_periods=1).sum().shift(1).reset_index(0, drop=True)
df2['runs_last_12_balls'] = df2['runs_last_12_balls'].fillna(0)
df2['wickets_last_12_balls'] = df2['wickets_last_12_balls'].fillna(0)

print("‚úÖ 'runs_last_12_balls' and 'wickets_last_12_balls' created.")


üîÑ Engineering 'momentum' features...
‚úÖ 'runs_last_12_balls' and 'wickets_last_12_balls' created.


In [None]:
import numpy as np
import pandas as pd 
import time, joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb

print("üîÑ Starting boosted training...")
t0 = time.time()

features = [
    "runs_before", "wickets_in_hand", "run_rate",
    "runs_left", "balls_left", "rr_required",
    "is_powerplay", "is_middle", "is_death",
      "runs_last_12_balls", "wickets_last_12_balls"
]

X = df2[features].copy()
y = df2["won"].copy()

#removes infinite values and rows having NaN values
X = X.replace([np.inf, -np.inf], np.nan)
mask = X.notna().all(axis=1)
X = X[mask].copy()
y = y[mask].copy()

# 2) Advanced Cricket Feature Engineering
def boost_features(df):
    df = df.copy()

    # Base helper
    df["balls_left_clip"] = df["balls_left"].clip(lower=1)

    # Basic derived
    df["runs_per_ball_left"] = df["runs_left"] / df["balls_left_clip"]
    df["required_rate_diff"] = df["rr_required"] - df["run_rate"]
    df["runs_before_per_wicket"] = df["runs_before"] / df["wickets_in_hand"].clip(lower=1)
    df["pressure_index"] = df["rr_required"] * (1 + (9 - df["wickets_in_hand"]) / 9)

    # NEW HIGH-IMPACT FEATURES
    df["overs_bowled"] = (120 - df["balls_left"]) / 6.0
    df["wickets_lost"] = 10 - df["wickets_in_hand"]
    df["momentum"] = df["run_rate"] - df["rr_required"]
    df["pressure_ratio"] = df["rr_required"] / df["run_rate"].clip(lower=0.1)
    df["danger_zone"] = ((df["rr_required"] > 9) & (df["wickets_in_hand"] <= 5)).astype(int)

    df["chase_difficulty"] = df["runs_left"] / df["balls_left_clip"] * (df["wickets_lost"] + 1)

    df["phase_numeric"] = (
        0 * df["is_powerplay"] +
        1 * df["is_middle"] +
        2 * df["is_death"]
    )

    df["rr_required_sq"] = df["rr_required"] ** 2

    df.drop(columns=["balls_left_clip"], inplace=True)
    return df

X = boost_features(X)

all_cols = X.columns.tolist()
print(f"> Total engineered features: {len(all_cols)}")


# 3) Split
X_train, X_val, y_train, y_val = train_test_split(
    X[all_cols], y,
    test_size=0.15,
    stratify=y,
    random_state=42 #creates the same split every time code runs
)

# 4) Train one boosted model

model = lgb.LGBMClassifier(
    n_estimators=3000, #number of trees creates
    learning_rate=0.02,
    num_leaves=63, #contols accuracy
    min_child_samples=10, #prevent overfitting
    subsample=0.9, #randomly uses 90% of data
    colsample_bytree=0.9, #randonly uses 90% of features
    random_state=42,
    n_jobs=-1 # for faster training
)

print("üöÄ Training boosted model with early stopping...")
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="binary_logloss",
    callbacks=[lgb.early_stopping(stopping_rounds=70, verbose=False)]
)


# 5) Evaluate

y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"üéØ Final validation accuracy: {acc * 100:.2f}%")

joblib.dump(model, "boosted_lgb_model.joblib")
print("üíæ Model saved: boosted_lgb_model.joblib")

print(f"‚è± Total time: {time.time()-t0:.1f}s")


üîÑ Starting boosted training...
> Total engineered features: 21
üöÄ Training boosted model with early stopping...
[LightGBM] [Info] Number of positive: 57793, number of negative: 55495
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3324
[LightGBM] [Info] Number of data points in the train set: 113288, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.510142 -> initscore=0.040575
[LightGBM] [Info] Start training from score 0.040575
üéØ Final validation accuracy: 82.20%
üíæ Model saved: boosted_lgb_model.joblib
‚è± Total time: 29.2s


In [19]:
import lightgbm as lgb

win_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=-1,
    random_state=42
)

win_model.fit(X_train, y_train)
print("Model trained!")
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
import pickle

# Save model
with open("win_model.pkl", "wb") as f:
    pickle.dump(win_model, f)

# Save features
with open("win_features.pkl", "wb") as f:
    pickle.dump(features, f)

# Save scaler
with open("win_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Saved all required files!")


[LightGBM] [Info] Number of positive: 57793, number of negative: 55495
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3324
[LightGBM] [Info] Number of data points in the train set: 113288, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.510142 -> initscore=0.040575
[LightGBM] [Info] Start training from score 0.040575
Model trained!
Saved all required files!
