In [131]:
import numpy as np
import pandas as pd
import pickle

In [132]:
match = pd.read_csv("matches.csv")
delivery = pd.read_csv("deliveries.csv")

In [133]:
total_score_df = (
    delivery
    .groupby(['match_id','inning'])['total_runs']
    .sum()
    .reset_index()
)

total_score_df = total_score_df[total_score_df['inning'] == 1]

In [134]:
match_df = match.merge(
    total_score_df[['match_id','total_runs']],
    left_on='id',
    right_on='match_id'
)

In [135]:
match_df['team1'] = match_df['team1'].str.replace('Delhi Daredevils','Delhi Capitals')
match_df['team2'] = match_df['team2'].str.replace('Delhi Daredevils','Delhi Capitals')

match_df['team1'] = match_df['team1'].str.replace('Deccan Chargers','Sunrisers Hyderabad')
match_df['team2'] = match_df['team2'].str.replace('Deccan Chargers','Sunrisers Hyderabad')

In [136]:
teams = [
    'Sunrisers Hyderabad','Mumbai Indians','Royal Challengers Bangalore',
    'Kolkata Knight Riders','Kings XI Punjab','Chennai Super Kings',
    'Rajasthan Royals','Delhi Capitals'
]

match_df = match_df[
    (match_df['team1'].isin(teams)) &
    (match_df['team2'].isin(teams))
]

In [137]:
match_df = match_df[match_df['dl_applied'] == 0]
match_df = match_df[['match_id','city','winner','total_runs']]

In [138]:
delivery_df = match_df.merge(delivery, on='match_id').copy()

delivery_df = delivery_df[delivery_df['inning'] == 2]

In [139]:
delivery_df.rename(columns={
    'total_runs_x':'target',
    'total_runs_y':'ball_runs'
}, inplace=True)

In [140]:
delivery_df['current_score'] = delivery_df.groupby('match_id')['ball_runs'].cumsum()

In [141]:
delivery_df['runs_left'] = delivery_df['target'] - delivery_df['current_score']

In [142]:
delivery_df['balls_left'] = 120 - ((delivery_df['over']-1)*6 + delivery_df['ball'])

In [143]:
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].notna().astype(int)

delivery_df['wickets'] = 10 - delivery_df.groupby('match_id')['player_dismissed'].cumsum()

In [144]:
delivery_df['crr'] = delivery_df['current_score']*6 / (120 - delivery_df['balls_left'])
delivery_df['rrr'] = delivery_df['runs_left']*6 / delivery_df['balls_left']

In [145]:
delivery_df['result'] = (delivery_df['batting_team'] == delivery_df['winner']).astype(int)

In [146]:
delivery_df = delivery_df[delivery_df['ball'] == 6]

In [147]:
final_df = delivery_df[
    ['batting_team','bowling_team',
     'runs_left','balls_left','wickets',
     'target','crr','rrr','result']
].copy()

final_df.replace([np.inf,-np.inf],np.nan,inplace=True)
final_df.dropna(inplace=True)

In [148]:
from sklearn.model_selection import train_test_split

X = final_df.drop('result',axis=1)
y = final_df['result']

X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.2,random_state=42
)

In [149]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [150]:
trf = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'),
        ['batting_team','bowling_team']),
    ('num','passthrough',
        ['runs_left','balls_left','wickets','target','crr','rrr'])
])

In [153]:
pipe = Pipeline([
    ('trf', trf),
    ('model', RandomForestClassifier(
        n_estimators=120,   # was 300
        max_depth=10,
        n_jobs=-1,          # ⭐ uses all CPU cores (IMPORTANT)
        random_state=42
    ))
])

In [154]:
pipe.fit(X_train,y_train)

from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, pipe.predict(X_test)))

Accuracy: 0.8598047914818101


In [156]:
import pickle

with open("pipe.pkl", "wb") as f:
    pickle.dump(pipe, f)

print("Model saved successfully ✅")

Model saved successfully ✅


In [157]:
import os
print(os.listdir())

['.ipynb_checkpoints', 'deliveries.csv', 'matches.csv', 'most_runs_average_strikerate.csv', 'pipe.pkl', 'Players.xlsx', 'teams.csv', 'teamwise_home_and_away.csv', 'Untitled.ipynb']


In [158]:
import pickle
pickle.dump(pipe, open('pipe.pkl','wb'))

In [161]:
# ===============================
# PERFECT SAFE COLUMN MATCH
# ===============================

expected_cols = pipe.feature_names_in_

row = {
    'batting_team': batting,
    'bowling_team': bowling,
    'city': city,
    'runs_left': runs_left,
    'balls_left': balls_left,
    'wickets': wickets_left,
    'total_runs_x': target,   # keep name same as training
    'crr': crr,
    'rrr': rrr
}

df = pd.DataFrame([row])

# force exact same structure as training
df = df.reindex(columns=expected_cols)

NameError: name 'batting' is not defined

In [160]:
import pickle

pipe = pickle.load(open("pipe.pkl","rb"))

print(pipe.feature_names_in_)

['batting_team' 'bowling_team' 'runs_left' 'balls_left' 'wickets' 'target'
 'crr' 'rrr']


In [163]:
batting = "Mumbai Indians"
bowling = "Chennai Super Kings"
city = "Mumbai"

runs_left = 50
balls_left = 30
wickets_left = 6
target = 180
crr = 7.5
rrr = 10.0

In [164]:
expected_cols = pipe.feature_names_in_

row = {
    'batting_team': batting,
    'bowling_team': bowling,
    'city': city,
    'runs_left': runs_left,
    'balls_left': balls_left,
    'wickets': wickets_left,
    'total_runs_x': target,
    'crr': crr,
    'rrr': rrr
}

df = pd.DataFrame([row])
df = df.reindex(columns=expected_cols)
df = df.fillna(0)   # ⭐ prevents ALL NaN crashes

In [None]:
# =============================
# SAFE CALCULATIONS
# =============================

runs_left = max(target - score, 0)
balls_left = max(120 - int(overs * 6), 1)   # never 0
wickets_left = max(10 - wickets, 0)

crr = score / overs if overs > 0 else 0
rrr = (runs_left * 6) / balls_left if balls_left > 0 else 0


# =============================
# SAFE DATAFRAME CREATION
# =============================

expected_cols = pipe.feature_names_in_

row = {
    'batting_team': batting,
    'bowling_team': bowling,
    'city': city,
    'runs_left': float(runs_left),
    'balls_left': float(balls_left),
    'wickets': float(wickets_left),
    'total_runs_x': float(target),
    'crr': float(crr),
    'rrr': float(rrr)
}

df = pd.DataFrame([row])

# match training structure
df = df.reindex(columns=expected_cols)

# ⭐ CRITICAL (removes NaN + inf)
df = df.fillna(0)
df = df.replace([float('inf'), -float('inf')], 0)