In [2]:
# Import dependancies
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',None)


In [3]:
deliveries_df = pd.read_csv('./deliveries.csv')
matches_df = pd.read_csv('./matches.csv')

In [4]:
deliveries_df.head(3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,4,0,4,,,


In [5]:
matches_df.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [6]:
# taking the total score of the first inning
total_runs_x = (deliveries_df.groupby(['match_id','inning'])['total_runs']
                .sum()
                .reset_index())

total_runs_x = total_runs_x.query('inning==1')

In [7]:
# Merge the data with match data
matches_df = matches_df.merge(total_runs_x,left_on='id',right_on='match_id')

In [8]:
# take only the data where dl is not applied
matches = matches_df[matches_df['dl_applied'] == 0]

In [9]:
# Extracting necassary columns from the matches data
matches = matches[['match_id',
          'city',
          'team1',
          'team2',
          'winner',
          'total_runs']]

In [10]:
# Keep only teems that are currently playing in the ipl
matches['team1'].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Chennai Super Kings', 'Rajasthan Royals', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants',
       'Delhi Capitals'], dtype=object)

In [11]:
current_teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals'
]

In [12]:
# replace both column team1 and team2 in matches,the value delhi capitals into delhi dare devils
matches['team1'] = matches['team1'].replace('Delhi Daredevils', 'Delhi Capitals')
matches['team2'] = matches['team2'].replace('Delhi Daredevils', 'Delhi Capitals')

# Same for Deccan Charges into Sunrise Hydrabad
matches['team1'] = matches['team1'].replace('Deccan Chargers', 'Sunrisers Hyderabad')
matches['team2'] = matches['team2'].replace('Deccan Chargers', 'Sunrisers Hyderabad')

In [13]:
matches = matches[matches['team1'].isin(current_teams)]
matches = matches[matches['team2'].isin(current_teams)]

In [14]:
# check shape of the mathches data
matches.shape


(626, 6)

In [15]:
# Since we already have batting and bowling team in deliveries, drop it from matches
matches = matches.drop(columns=['team2', 'team1'])

In [16]:
# merge this data with deliveries using match_id
deliveries = deliveries_df.merge(matches, on='match_id')

In [17]:
# taking only the second inning ,because we have to chase the total score made in first inning
deliveries = deliveries.query('inning==2')

In [18]:
# check the shape deliveries data
print(deliveries.shape)

(72413, 24)


In [19]:
deliveries['current_score'] = deliveries.groupby('match_id')['total_runs_x'].cumsum()

In [20]:
# runs_left colums
deliveries['runs_left'] = deliveries['total_runs_y'] - deliveries['current_score']

In [21]:
deliveries['ball_left'] = 126 - (deliveries['over']*6 +deliveries['ball'])

In [22]:
deliveries.head(2)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs_x,player_dismissed,dismissal_kind,fielder,city,winner,total_runs_y,current_score,runs_left,ball_left
125,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,Mandeep Singh,A Nehra,0,0,0,0,0,0,1,0,1,,,,Hyderabad,Sunrisers Hyderabad,207,1,206,119
126,1,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,CH Gayle,A Nehra,0,0,0,0,0,0,0,0,0,,,,Hyderabad,Sunrisers Hyderabad,207,1,206,118


In [23]:
deliveries['is_wicket'] = (~(pd.isna(deliveries['player_dismissed'])))

In [24]:
deliveries['is_wicket'] = deliveries['is_wicket'].astype(int)

In [25]:
deliveries['current_total_wickets'] = deliveries.groupby('match_id')['is_wicket'].cumsum()

In [26]:
deliveries['wickets_left'] = 10 - deliveries['current_total_wickets']

In [27]:
deliveries.apply(lambda x:1 if x.batting_team==x.winner else 0,axis=1)

125       0
126       0
127       0
128       0
129       0
         ..
149573    0
149574    0
149575    0
149576    0
149577    0
Length: 72413, dtype: int64

In [28]:
deliveries['crr'] = deliveries['current_score']*6/(120 - deliveries['ball_left'])

In [29]:
deliveries['rrr'] = deliveries['runs_left']*6/deliveries['ball_left']

In [30]:
# removing unneccassary columns
deliveries = deliveries[['batting_team', 'bowling_team','over','city', 'winner', 'total_runs_y',
       'current_score', 'runs_left', 'ball_left','wickets_left', 'crr', 'rrr']]

In [35]:
deliveries['winner'] = (deliveries['batting_team'] == deliveries['winner']).astype(int)

In [38]:
final_df = deliveries.sample(deliveries.shape[0])

In [40]:
0/100

0.0

In [44]:
final_df = final_df.query('ball_left!=0')

In [46]:
final_df.describe()

Unnamed: 0,over,winner,total_runs_y,current_score,runs_left,ball_left,wickets_left,crr,rrr
count,72172.0,72172.0,72172.0,72172.0,72172.0,72172.0,72172.0,72172.0,72172.0
mean,9.922186,0.526991,165.542648,73.040902,92.501746,62.866305,7.551377,7.43789,10.349743
std,5.54274,0.499274,29.286095,46.703971,49.916667,33.263185,2.123458,2.278411,13.659793
min,1.0,0.0,65.0,0.0,-16.0,-2.0,0.0,0.0,-510.0
25%,5.0,0.0,146.0,34.0,53.0,35.0,6.0,6.25,7.135135
50%,10.0,1.0,165.0,69.0,92.0,64.0,8.0,7.478261,8.869565
75%,15.0,1.0,185.0,108.0,130.0,92.0,9.0,8.682353,10.879341
max,20.0,1.0,250.0,222.0,249.0,119.0,10.0,42.0,678.0


In [47]:
final_df

Unnamed: 0,batting_team,bowling_team,over,city,winner,total_runs_y,current_score,runs_left,ball_left,wickets_left,crr,rrr
72190,Kolkata Knight Riders,Chennai Super Kings,7,Chennai,1,190,58,132,81,9,8.923077,9.777778
43769,Royal Challengers Bangalore,Kolkata Knight Riders,16,Bangalore,1,160,137,23,25,7,8.652632,5.520000
3297,Delhi Daredevils,Mumbai Indians,5,Mumbai,0,142,21,121,90,6,4.200000,8.066667
12962,Kings XI Punjab,Royal Challengers Bangalore,18,Bangalore,1,126,119,7,14,6,6.735849,3.000000
101361,Kings XI Punjab,Rajasthan Royals,3,Pune,0,162,28,134,104,9,10.500000,7.730769
...,...,...,...,...,...,...,...,...,...,...,...,...
43247,Kings XI Punjab,Mumbai Indians,11,Chandigarh,1,154,89,65,57,8,8.476190,6.842105
12687,Rajasthan Royals,Chennai Super Kings,8,Jaipur,1,109,66,43,73,10,8.425532,3.534247
136602,Kings XI Punjab,Kolkata Knight Riders,5,Kolkata,0,223,39,184,95,9,9.360000,11.621053
127968,Kolkata Knight Riders,Royal Challengers Bangalore,9,Bengaluru,1,185,73,112,68,9,8.423077,9.882353


In [67]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [50]:
X = final_df.drop(columns='winner')
y = final_df['winner']

In [51]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,stratify=final_df['winner'])

In [65]:
clf = ColumnTransformer([
    ('onehot',OneHotEncoder(drop='first',sparse_output=False),[0,1,3])
    ],remainder=StandardScaler()).set_output(transform='pandas')

In [52]:
X_train.shape,y_train.shape

((57737, 11), (57737,))

In [73]:
model = BaggingClassifier(LogisticRegression(),n_estimators=5,max_samples=.5)

In [74]:
pipe = Pipeline([
    ('trf',clf),
    ('model',model)
    ])

In [75]:
cross_val_score(pipe,X_train,y_train,cv=5)

array([0.80602702, 0.80161067, 0.80271932, 0.80436477, 0.80271932])