In [338]:
#libraries
import pickle
import pandas as pd
import numpy as np

In [339]:
#loading the file
df=pickle.load(open('output.pkl','rb'))

In [340]:
df.isnull().sum()

match_id               0
batting_team           0
bowling_team           0
ball                   0
runs                   0
player_dismissed       0
city                7046
venue                  0
dtype: int64

In [341]:
#each null in city column has value in respective venue column
df[df['city'].isnull()]['venue'].value_counts()

venue
Dubai International Cricket Stadium        3295
Melbourne Cricket Ground                   1203
Sydney Cricket Ground                       749
Pallekele International Cricket Stadium     741
Adelaide Oval                               373
Harare Sports Club                          372
Sharjah Cricket Stadium                     249
Carrara Oval                                 64
Name: count, dtype: int64

In [342]:
cities=np.where(df['city'].isnull(),df['venue'].str.split().apply(lambda x:x[0]),df['city'])

In [343]:
df['city']=cities

In [344]:
df.isnull().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [345]:
df.drop(columns=['venue'],inplace=True)

In [346]:
#filtering only city where max 5 matches were played
eligible_cities=df['city'].value_counts()[df['city'].value_counts()>600].index.to_list()

In [347]:
df=df[df['city'].isin(eligible_cities)]

In [348]:
#finding the current score
df['Current_score']=df.groupby('match_id')['runs'].cumsum()

In [349]:
#generating ball and over feature
df['ball_no']=df['ball'].apply(lambda x:str(x).split('.')[1])
df['over']=df['ball'].apply(lambda x:str(x).split('.')[0])

In [350]:
df['balls_bowled']=(df['over'].astype('int')*6)+(df['ball_no'].astype('int'))

In [351]:
df['balls_left']=120-df['balls_bowled']
df['balls_left']=df['balls_left'].apply(lambda x:0 if x<0 else x)

In [353]:
# wicket left
df['player_dismissed']=df['player_dismissed'].apply(lambda x:0 if x=='0' else 1)
df['player_dismissed']=df['player_dismissed'].astype('int')
df['player_dismissed']=df.groupby('match_id')['player_dismissed'].cumsum()
df['wickets_left']=10-df['player_dismissed']

In [354]:
#current run rate
df['curr']=(df['Current_score']*6)/df['balls_bowled']

In [355]:
#last 5 over run
groups=df.groupby('match_id')

match_ids=df['match_id'].unique()
last_five=[]
for id in match_ids:
    last_five.extend(groups.get_group(id).rolling(window=30)['runs'].sum().values.tolist())

In [356]:
df['last_five']=last_five

In [357]:
#final score
final_df=df.groupby('match_id')['runs'].sum().reset_index().merge(df,on='match_id')

In [359]:
final_df.head()

Unnamed: 0,match_id,runs_x,batting_team,bowling_team,ball,runs_y,player_dismissed,city,Current_score,ball_no,over,balls_bowled,balls_left,wickets_left,curr,last_five
0,22,195,New Zealand,Bangladesh,0.1,0,1,Mount Maunganui,0,1,0,1,119,9,0.0,
1,22,195,New Zealand,Bangladesh,0.2,3,1,Mount Maunganui,3,2,0,2,118,9,9.0,
2,22,195,New Zealand,Bangladesh,0.3,1,1,Mount Maunganui,4,3,0,3,117,9,8.0,
3,22,195,New Zealand,Bangladesh,0.4,0,1,Mount Maunganui,4,4,0,4,116,9,6.0,
4,22,195,New Zealand,Bangladesh,0.5,0,1,Mount Maunganui,4,5,0,5,115,9,4.8,


In [360]:
#keeping the right feature
final_df=final_df[['batting_team','bowling_team','city','Current_score','balls_left','wickets_left','curr','last_five','runs_x']]

In [361]:
final_df.isnull().sum()

batting_team        0
bowling_team        0
city                0
Current_score       0
balls_left          0
wickets_left        0
curr                0
last_five        9414
runs_x              0
dtype: int64

In [362]:
final_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [363]:
final_df.isnull().sum()

batting_team     0
bowling_team     0
city             0
Current_score    0
balls_left       0
wickets_left     0
curr             0
last_five        0
runs_x           0
dtype: int64

In [364]:
#shuffling the data to avoid any bias in the data
final_df=final_df.sample(final_df.shape[0])

# Model Building

In [365]:
#libraries
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import  RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error

In [366]:
X=final_df.drop(columns=['runs_x'])
y=final_df['runs_x']

In [367]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=.2,random_state=42)

In [368]:
xtrain.shape

(23984, 8)

In [378]:
xtest

Unnamed: 0,batting_team,bowling_team,city,Current_score,balls_left,wickets_left,curr,last_five
26657,Pakistan,New Zealand,Pallekele,91,61,9,9.254237,50.0
9955,Pakistan,England,Manchester,37,90,8,7.400000,37.0
14853,New Zealand,England,Manchester,79,33,5,5.448276,27.0
7884,Australia,South Africa,Cape Town,154,20,7,9.240000,34.0
18107,Pakistan,Ireland,London,156,2,5,7.932203,42.0
...,...,...,...,...,...,...,...,...
26797,India,England,Colombo,91,48,8,7.583333,33.0
4285,India,South Africa,Johannesburg,156,30,6,10.400000,46.0
22077,Pakistan,England,Cardiff,49,82,9,7.736842,36.0
16484,Australia,West Indies,London,100,38,6,7.317073,46.0


In [383]:
trf=ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output=False,drop='first'),['batting_team','bowling_team','city'])
],remainder='passthrough')


In [384]:
pipe=Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',XGBRegressor(n_estimators=1000,learning_rate=.2,max_depth=12,random_state=1))
])

In [385]:
pipe.fit(xtrain,ytrain)
y_pred=pipe.predict(xtest)
print('r2_score:',r2_score(ytest,y_pred))
print('MAE:',mean_absolute_error(ytest,y_pred))


r2_score: 0.9875422306038281
MAE: 1.613898906545558


In [386]:
pickle.dump(pipe,open('pipe.pkl','wb'))