In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupShuffleSplit

In [2]:
matches = pd.read_csv("matches.csv")
deliveries = pd.read_csv("deliveries.csv")

matches.shape, deliveries.shape

((756, 18), (179078, 21))

In [6]:
total_score = (
    deliveries
    .groupby(['match_id','inning'])['total_runs']
    .sum()
    .reset_index()
)

total_score = total_score[total_score['inning'] == 1]

total_score.head()

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
2,2,1,184
4,3,1,183
6,4,1,163
8,5,1,157


In [7]:
match_df = matches.merge(
    total_score[['match_id','total_runs']],
    left_on='id',
    right_on='match_id'
)

match_df = match_df[['match_id','city','winner','total_runs']]

match_df.head()

Unnamed: 0,match_id,city,winner,total_runs
0,1,Hyderabad,Sunrisers Hyderabad,207
1,2,Pune,Rising Pune Supergiant,184
2,3,Rajkot,Kolkata Knight Riders,183
3,4,Indore,Kings XI Punjab,163
4,5,Bangalore,Royal Challengers Bangalore,157


In [8]:
delivery_df = match_df.merge(deliveries, on='match_id')
delivery_df = delivery_df[delivery_df['inning'] == 2].copy()

delivery_df.shape

(86240, 24)

In [9]:
delivery_df['current_score'] = delivery_df.groupby('match_id')['total_runs_y'].cumsum()

delivery_df['runs_left'] = delivery_df['total_runs_x'] - delivery_df['current_score']

delivery_df['balls_left'] = 120 - (delivery_df['over']*6 + delivery_df['ball'])

delivery_df['player_dismissed'] = delivery_df['player_dismissed'].notna().astype(int)

wickets = delivery_df.groupby('match_id')['player_dismissed'].cumsum()
delivery_df['wickets'] = 10 - wickets

delivery_df['crr'] = delivery_df['current_score']*6 / np.maximum(120-delivery_df['balls_left'],1)
delivery_df['rrr'] = delivery_df['runs_left']*6 / np.maximum(delivery_df['balls_left'],1)

# smart features
delivery_df['pressure'] = delivery_df['rrr'] - delivery_df['crr']
delivery_df['runs_per_wicket'] = delivery_df['runs_left'] / np.maximum(delivery_df['wickets'],1)

delivery_df['result'] = (delivery_df['batting_team']==delivery_df['winner']).astype(int)

delivery_df.head()

Unnamed: 0,match_id,city,winner,total_runs_x,inning,batting_team,bowling_team,over,ball,batsman,...,fielder,current_score,runs_left,balls_left,wickets,crr,rrr,pressure,runs_per_wicket,result
125,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,,1,206,113,10,0.857143,10.938053,10.08091,20.6,0
126,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,,1,206,112,10,0.75,11.035714,10.285714,20.6,0
127,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,,1,206,111,10,0.666667,11.135135,10.468468,20.6,0
128,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,,3,204,110,10,1.8,11.127273,9.327273,20.4,0
129,1,Hyderabad,Sunrisers Hyderabad,207,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,,7,200,109,10,3.818182,11.009174,7.190992,20.0,0


In [10]:
final_df = delivery_df[
    ['match_id','batting_team','bowling_team','city',
     'runs_left','balls_left','wickets','total_runs_x',
     'crr','rrr','pressure','runs_per_wicket','result']
].copy()

final_df = final_df.replace([np.inf,-np.inf],np.nan)
final_df.dropna(inplace=True)

final_df = final_df[(final_df['balls_left']>0) & (final_df['runs_left']>0)]

final_df.shape

(82146, 13)

In [16]:
X = final_df.drop('result',axis=1)
y = final_df['result']

groups = final_df['match_id']

gss = GroupShuffleSplit(test_size=0.2,n_splits=1,random_state=42)

train_idx, test_idx = next(gss.split(X,y,groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

X_train = X_train.drop('match_id',axis=1)
X_test  = X_test.drop('match_id',axis=1)

In [17]:
categorical = ['batting_team','bowling_team','city']
numeric = ['runs_left','balls_left','wickets','total_runs_x','crr','rrr','pressure','runs_per_wicket']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', SimpleImputer(strategy='constant', fill_value=0), numeric)
])

pipe = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=60,
        max_depth=10,
        n_jobs=-1,
        random_state=42
    ))
])

pipe.fit(X_train, y_train)

In [19]:
from sklearn.metrics import accuracy_score

pred = pipe.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))

Accuracy: 0.796987478071502


In [20]:
import pickle

pickle.dump(pipe, open("pipe.pkl","wb"))