Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the ball-by-ball dataset. The context manager closes the file handle as
# soon as the frame has been read instead of leaving it open for the session.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('dataset2.pkl','rb') as dataset_file:
    df = pickle.load(dataset_file)

In [3]:
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground


In [4]:
df.isnull().sum()

match_id                0
batting_team            0
bowling_team            0
ball                    0
runs                    0
player_dismissed        0
city                10738
venue                   0
dtype: int64

Filling null values for city column

In [5]:
# For rows with no city recorded, count how many rows each venue accounts for.
df.loc[df['city'].isnull(), 'venue'].value_counts()

Dubai International Cricket Stadium        4664
Pallekele International Cricket Stadium    2066
Melbourne Cricket Ground                   1453
Sydney Cricket Ground                       749
Sharjah Cricket Stadium                     622
Adelaide Oval                               498
Harare Sports Club                          372
Sylhet International Cricket Stadium        128
Sylhet Stadium                              122
Carrara Oval                                 64
Name: venue, dtype: int64

In [6]:
# Use the first word of the venue name as a stand-in city wherever city is missing;
# rows that already have a city keep it.
first_word_of_venue = df['venue'].str.split().apply(lambda words: words[0])
cities = np.where(df['city'].isnull(), first_word_of_venue, df['city'])

In [7]:
# Replace the city column with the back-filled values held in `cities`.
df['city'] = cities

In [8]:
df.isnull().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [9]:
# venue is no longer needed once city has been filled in.
df = df.drop(columns=['venue'])

In [10]:
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne


Keeping only the data for cities where more than 600 balls were bowled.

In [11]:
# Count deliveries per city once, then keep the cities with more than 600 of them.
deliveries_per_city = df['city'].value_counts()
eligible_cities = deliveries_per_city[deliveries_per_city > 600].index.tolist()

In [12]:
# Keep only deliveries bowled in an eligible city. The explicit copy makes the
# result an independent frame, so the columns added to it in later cells are not
# written onto a view of the unfiltered data.
df = df[df['city'].isin(eligible_cities)].copy()

Computing the cumulative score after each delivery.

In [13]:
# Running total of runs within each match. Selecting 'runs' before cumsum()
# avoids accumulating every other column (including the text ones), which is
# wasteful and raises on pandas versions that no longer drop non-numeric columns.
df['current_score'] = df.groupby('match_id')['runs'].cumsum()

In [14]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3
...,...,...,...,...,...,...,...,...
121,967,Sri Lanka,Australia,19.3,1,0,Colombo,125
122,967,Sri Lanka,Australia,19.4,0,0,Colombo,125
123,967,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125
124,967,Sri Lanka,Australia,19.6,2,0,Colombo,127


Extracting balls left

In [15]:
# Split the "over.ball" value (e.g. 19.3) into its two textual halves once,
# then store the part before the dot as the over and the part after it as the ball.
ball_parts = df['ball'].map(lambda delivery: str(delivery).split("."))
df['over'] = ball_parts.map(lambda parts: parts[0])
df['ball_no'] = ball_parts.map(lambda parts: parts[1])
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5


In [16]:
# Deliveries bowled so far = six per completed over plus the ball number in the current over.
completed_overs = df['over'].astype('int')
ball_in_over = df['ball_no'].astype('int')
df['balls_bowled'] = completed_overs * 6 + ball_in_over

In [17]:
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5


In [18]:
# Deliveries remaining out of the 120 in the innings (rsub computes 120 - balls_bowled).
df['balls_left'] = df['balls_bowled'].rsub(120)
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115


In [19]:
# A negative remainder is floored at zero.
df['balls_left'] = df['balls_left'].clip(lower=0)
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115


In [20]:
# Turn the column into a 0/1 wicket flag: '0' means no dismissal, anything else
# (a player name) means a wicket fell on that ball. Comparing the string form
# also recognises an integer 0, so re-running this cell before the cumulative
# step does not flag every ball as a wicket.
df['player_dismissed'] = df['player_dismissed'].astype(str).ne('0').astype(int)

Extracting wickets left

In [21]:
# Convert the per-ball wicket flag into wickets fallen so far in the match, then
# into wickets remaining out of ten. The column is selected before cumsum() so
# that only it is accumulated; accumulating the whole frame touches the text
# columns and raises on newer pandas.
df['player_dismissed'] = df['player_dismissed'].astype('int')
df['player_dismissed'] = df.groupby('match_id')['player_dismissed'].cumsum()
df['wickets_left'] = 10 - df['player_dismissed']

In [22]:
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10


Extracting current run rate

In [23]:
# Current run rate: runs per over, i.e. score scaled by six and divided by balls bowled.
score_times_six = df['current_score'] * 6
df['crr'] = score_times_six / df['balls_bowled']

In [24]:
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,crr
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10,0.0
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10,0.0
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10,2.0
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10,4.5
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10,3.6


Creating a column with the runs scored over the last 30 deliveries (five overs); it is empty until a match has 30 deliveries.

In [25]:
groups = df.groupby('match_id')

# Runs scored over the trailing 30 deliveries of each match (NaN until a match
# has 30 deliveries). transform() hands the values back in the same row order as
# df, so the list lines up with the frame even if a match's rows are not stored
# contiguously. Only the 'runs' column is rolled, rather than every column.
last_five = groups['runs'].transform(
    lambda match_runs: match_runs.rolling(window=30).sum()
).tolist()

In [26]:
# Attach the rolling 30-delivery run totals; the list is assigned positionally, one value per row.
df['last_five'] = last_five

In [27]:
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,crr,last_five
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10,0.0,
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10,0.0,
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10,2.0,
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10,4.5,
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10,3.6,


In [28]:
# Attach each match's total runs to every one of its delivery rows. After the
# merge the total is 'runs_x' and the per-ball runs are 'runs_y'. Only 'runs' is
# summed; summing the whole frame would also try to add up the text columns.
final_df = df.groupby('match_id')['runs'].sum().reset_index().merge(df,on='match_id')

In [29]:
# Columns kept for modelling: the features followed by the target (runs_x, the match total).
model_columns = [
    'batting_team', 'bowling_team', 'city',
    'current_score', 'balls_left', 'wickets_left',
    'crr', 'last_five',
    'runs_x',
]
final_df = final_df[model_columns]

In [30]:
# Drop rows with missing values (the deliveries that have no rolling last_five yet).
# Reassigning instead of inplace=True avoids mutating a column-sliced frame.
final_df = final_df.dropna()

In [31]:
final_df.isnull().sum()

batting_team     0
bowling_team     0
city             0
current_score    0
balls_left       0
wickets_left     0
crr              0
last_five        0
runs_x           0
dtype: int64

In [32]:
# Shuffle all rows. A fixed seed keeps the saved dataset, and everything derived
# from it, identical from one run of the notebook to the next.
final_df = final_df.sample(frac=1, random_state=101)

In [33]:
final_df.sample(5)

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
53669,Papua New Guinea,Afghanistan,Dublin,22,82,8,3.473684,16.0,127
39347,West Indies,England,Pallekele,128,29,7,8.43956,33.0,179
13340,New Zealand,India,Auckland,56,81,9,8.615385,42.0,132
52339,England,South Africa,Johannesburg,86,55,7,7.938462,39.0,171
50376,South Africa,Pakistan,Johannesburg,149,2,3,7.576271,42.0,153


Computing the average first-innings total for each batting team (averaged over delivery rows, so longer innings carry more weight).

In [34]:
# Mean match total per batting team, taken over that team's delivery rows.
# Selecting 'runs_x' before mean() avoids averaging the text columns, which
# raises on pandas versions that no longer drop non-numeric columns silently.
batting_avg = final_df.groupby('batting_team')['runs_x'].mean()

In [35]:
batting_avg

batting_team
Afghanistan         149.678419
Australia           169.904298
Bangladesh          144.620940
England             162.773330
India               168.429323
Ireland             152.543655
Namibia             159.983784
Netherlands         142.260381
New Zealand         164.003926
Oman                146.365759
Pakistan            155.250306
Papua New Guinea    151.314879
Scotland            163.427095
South Africa        165.112497
Sri Lanka           158.062036
West Indies         152.268195
Name: runs_x, dtype: float64

In [36]:
# One-column frame of batting averages, indexed by batting_team. The team name
# stays in the index here; the reset_index() below is only displayed, not stored.
team_average = pd.DataFrame(batting_avg).rename(columns={'runs_x': 'batting_average'})
team_average.reset_index()

Unnamed: 0,batting_team,batting_average
0,Afghanistan,149.678419
1,Australia,169.904298
2,Bangladesh,144.62094
3,England,162.77333
4,India,168.429323
5,Ireland,152.543655
6,Namibia,159.983784
7,Netherlands,142.260381
8,New Zealand,164.003926
9,Oman,146.365759


In [37]:
# Look up each row's batting team in batting_avg and store that team's average as a feature.
final_df['batting_team_avg'] = final_df['batting_team'].map(batting_avg)

In [38]:
final_df.head()

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x,batting_team_avg
44271,West Indies,Bangladesh,Mirpur,82,57,8,7.809524,39.0,197,152.268195
52961,Pakistan,Bangladesh,Mirpur,74,49,8,6.253521,39.0,141,155.250306
45292,West Indies,England,Barbados,121,26,7,7.723404,36.0,170,152.268195
47313,New Zealand,Bangladesh,Mirpur,153,26,8,9.765957,59.0,204,164.003926
28590,Sri Lanka,India,Colombo,65,73,9,8.297872,45.0,171,158.062036


Extracting the average runs conceded by bowling team in 1st inning

In [39]:
# Mean match total conceded per bowling team, taken over delivery rows.
# 'runs_x' is selected before mean() so only the target column is averaged;
# averaging the whole frame fails on the text columns in newer pandas.
bowling_avg = final_df.groupby('bowling_team')['runs_x'].mean()

In [40]:
bowling_avg

bowling_team
Afghanistan         152.238532
Australia           154.535440
Bangladesh          165.322311
England             166.816068
India               156.053497
Ireland             173.334354
Namibia             158.209790
Netherlands         153.750733
New Zealand         163.112506
Oman                154.879908
Pakistan            149.291915
Papua New Guinea    135.893617
Scotland            166.027641
South Africa        158.242265
Sri Lanka           157.975761
West Indies         166.808527
Name: runs_x, dtype: float64

In [41]:
# One-column frame of runs conceded on average, indexed by bowling_team. The
# reset_index() below is only displayed; bowling_average keeps the team as its index.
bowling_average = pd.DataFrame(bowling_avg).rename(columns={'runs_x': 'Average_runs'})
bowling_average.reset_index()

Unnamed: 0,bowling_team,Average_runs
0,Afghanistan,152.238532
1,Australia,154.53544
2,Bangladesh,165.322311
3,England,166.816068
4,India,156.053497
5,Ireland,173.334354
6,Namibia,158.20979
7,Netherlands,153.750733
8,New Zealand,163.112506
9,Oman,154.879908


In [42]:
# Add the bowling averages next to the batting averages (rows are matched on the
# team-name index), then move the team name out of the index into a column.
team_average = (
    team_average
    .assign(bowling_average=bowling_average['Average_runs'])
    .reset_index()
)

In [43]:
team_average

Unnamed: 0,batting_team,batting_average,bowling_average
0,Afghanistan,149.678419,152.238532
1,Australia,169.904298,154.53544
2,Bangladesh,144.62094,165.322311
3,England,162.77333,166.816068
4,India,168.429323,156.053497
5,Ireland,152.543655,173.334354
6,Namibia,159.983784,158.20979
7,Netherlands,142.260381,153.750733
8,New Zealand,164.003926,163.112506
9,Oman,146.365759,154.879908


In [44]:
# Look up each row's bowling team in bowling_avg and store that team's average as a feature.
final_df['bowling_team_avg'] = final_df['bowling_team'].map(bowling_avg)

In [45]:
final_df.head()

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x,batting_team_avg,bowling_team_avg
44271,West Indies,Bangladesh,Mirpur,82,57,8,7.809524,39.0,197,152.268195,165.322311
52961,Pakistan,Bangladesh,Mirpur,74,49,8,6.253521,39.0,141,155.250306,165.322311
45292,West Indies,England,Barbados,121,26,7,7.723404,36.0,170,152.268195,166.816068
47313,New Zealand,Bangladesh,Mirpur,153,26,8,9.765957,59.0,204,164.003926,165.322311
28590,Sri Lanka,India,Colombo,65,73,9,8.297872,45.0,171,158.062036,156.053497


Extracting the average runs scored at particular city in 1st inning

In [46]:
# Mean match total per city, taken over delivery rows. Selecting 'runs_x'
# first keeps mean() away from the text columns, which newer pandas refuses to average.
city_avg = final_df.groupby('city')['runs_x'].mean()

In [47]:
city_avg

city
Abu Dhabi          142.595858
Adelaide           182.060669
Al Amarat          152.890561
Auckland           171.453148
Bangalore          152.227368
Barbados           155.755299
Cape Town          154.403132
Cardiff            150.910072
Centurion          187.890547
Chandigarh         178.157676
Chittagong         152.019309
Christchurch       181.255864
Colombo            157.188577
Delhi              156.141593
Dubai              145.754681
Dublin             173.776256
Durban             165.254443
Edinburgh          186.486373
Greater Noida      180.482394
Hamilton           175.440258
Johannesburg       170.568219
Kolkata            143.491420
Lahore             158.369338
Lauderhill         161.362869
London             160.360038
Manchester         157.770492
Melbourne          140.666667
Mirpur             151.213574
Mount Maunganui    195.978261
Mumbai             190.973788
Nagpur             153.436170
Nottingham         155.045809
Pallekele          171.951685
Sharj

In [48]:
# Two-column frame (city, Average_runs) built from the per-city means.
city_average = (
    pd.DataFrame(city_avg)
    .rename(columns={'runs_x': 'Average_runs'})
    .reset_index()
)

In [49]:
# Persist the lookup tables so the averages can be read back when making predictions.
team_average.to_csv('team_average.csv',index=False)
city_average.to_csv('city_average.csv',index=False)

In [50]:
# Look up each row's city in city_avg and store that city's average as a feature.
final_df['city_avg'] = final_df['city'].map(city_avg)

In [51]:
final_df.head()

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x,batting_team_avg,bowling_team_avg,city_avg
44271,West Indies,Bangladesh,Mirpur,82,57,8,7.809524,39.0,197,152.268195,165.322311,151.213574
52961,Pakistan,Bangladesh,Mirpur,74,49,8,6.253521,39.0,141,155.250306,165.322311,151.213574
45292,West Indies,England,Barbados,121,26,7,7.723404,36.0,170,152.268195,166.816068,155.755299
47313,New Zealand,Bangladesh,Mirpur,153,26,8,9.765957,59.0,204,164.003926,165.322311,151.213574
28590,Sri Lanka,India,Colombo,65,73,9,8.297872,45.0,171,158.062036,156.053497,157.188577


In [52]:
# Save the modelling table; the (shuffled) index is not written out.
final_df.to_csv('FinalDataset.csv',index=False)

In [53]:
# Reload the saved table; rows now carry a fresh 0..n-1 index.
final_df = pd.read_csv('FinalDataset.csv')

In [54]:
# Separate the target (runs_x, the match total) from the feature columns.
target_column = 'runs_x'
y = final_df[target_column]
X = final_df.drop(columns=[target_column])

Splitting data

In [55]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): every row is a single delivery and the target is that match's
# total, so deliveries from one match can land in both the train and test sets,
# and the *_avg features were computed from the target before this split. The
# test metrics are therefore likely optimistic — consider splitting by match.
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=101)

In [56]:
x_train.head()

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,batting_team_avg,bowling_team_avg,city_avg
2131,Sri Lanka,Pakistan,Abu Dhabi,42,86,10,7.411765,41.0,158.062036,149.291915,142.595858
9791,Oman,Scotland,Dubai,27,89,8,5.225806,26.0,146.365759,166.027641,145.754681
23976,Netherlands,New Zealand,Chittagong,48,69,8,5.647059,27.0,142.260381,163.112506,152.019309
9693,West Indies,India,Lauderhill,111,28,3,7.23913,33.0,152.268195,156.053497,161.362869
5025,South Africa,Sri Lanka,Johannesburg,102,44,9,8.052632,52.0,165.112497,157.975761,170.568219


In [57]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

Model Training

In [58]:
# One-hot encode the three categorical columns (dropping the first level of each)
# and pass the numeric columns through untouched.
# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output` and
# later removed the old name, so try the new spelling first and fall back to the
# old one on installs that predate it.
try:
    encoder = OneHotEncoder(sparse_output=False,drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False,drop='first')

trf = ColumnTransformer([
    ('trf',encoder,['batting_team','bowling_team','city'])
]
,remainder='passthrough')

In [59]:
# Encode -> scale -> decision tree. The tree is seeded because scikit-learn
# breaks ties between equally good splits at random, so an unseeded tree can
# differ from one fit to the next even on identical data.
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',DecisionTreeRegressor(random_state=101))
])

In [60]:
# Fit on the training rows, then report R^2 followed by mean absolute error on the held-out rows.
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
test_r2 = r2_score(y_test,y_pred)
test_mae = mean_absolute_error(y_test,y_pred)
print(test_r2)
print(test_mae)

0.9720582783221907
0.8997181266261925


In [61]:
# Save the fitted pipeline. The context manager flushes and closes the file, so
# the pickle is complete on disk before anything reads it back.
with open('pipe.pkl','wb') as model_file:
    pickle.dump(pipe,model_file)

In [62]:
# Read the saved team averages back so their column names can be adjusted.
team_avg = pd.read_csv('team_average.csv')

In [63]:
# The same table holds batting and bowling averages, so label the key column simply 'team'.
team_avg = team_avg.rename(columns={'batting_team':'team'})

In [64]:
# Overwrite the CSV so the file on disk carries the renamed 'team' column.
team_avg.to_csv('team_average.csv',index=False)

In [65]:
# Reload the saved pipeline; the context manager closes the file once it is read.
# NOTE: unpickling can execute arbitrary code; only load model files you created or trust.
with open('pipe.pkl','rb') as model_file:
    model = pickle.load(model_file)

In [66]:
# Load the lookup tables used to build features for a single prediction.
# Note that city_avg is rebound here: from this point it is the frame read from
# city_average.csv, not the per-city Series computed earlier in the notebook.
team_avg = pd.read_csv('team_average.csv')
city_avg = pd.read_csv('city_average.csv')

In [67]:
# Parallel lists: position i in `team` matches position i in bat_avg / bowl_avg,
# and position j in `cities` matches position j in c_avg.
team = team_avg['team'].tolist()
bat_avg = team_avg['batting_average'].tolist()
bowl_avg = team_avg['bowling_average'].tolist()
cities = city_avg['city'].tolist()
c_avg = city_avg['Average_runs'].tolist()

In [68]:
model.predict(x_test[:5])

array([152., 177., 118., 206., 194.])

In [69]:
y_test[:5]

35331    152
21756    177
34164    118
22969    206
21763    194
Name: runs_x, dtype: int64

In [70]:
x_test[:5]

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,batting_team_avg,bowling_team_avg,city_avg
35331,Sri Lanka,India,Colombo,109,53,7,9.761194,54.0,158.062036,156.053497,157.188577
21756,New Zealand,South Africa,Centurion,55,82,9,8.684211,48.0,164.003926,158.242265,187.890547
34164,New Zealand,Australia,Wellington,66,43,6,5.142857,30.0,164.003926,154.53544,156.109602
22969,Sri Lanka,India,Chandigarh,192,4,3,9.931034,37.0,158.062036,156.053497,178.157676
21763,Australia,India,Sydney,78,69,8,9.176471,36.0,169.904298,156.053497,164.888519


In [71]:
def _overs_to_balls(overs):
    """Convert an overs string such as "9" or "9.3" into the number of balls bowled.

    The part before the dot counts completed overs (six balls each); the
    optional part after the dot counts balls bowled in the current over.
    Raises ValueError if either part is not a whole number.
    """
    completed, _, extra = str(overs).partition(".")
    return int(completed) * 6 + (int(extra) if extra else 0)


# --- match situation to score -------------------------------------------------
batting_team = 'Afghanistan'
bowling_team = 'Bangladesh'
city = 'Dublin'
current_score = 59
overs = "9"
wickets = 2
# Scalar inputs use their own names so they do not overwrite the notebook-level
# `last_five` list and `city_avg` frame that other cells read.
runs_last_five = 25

balls = _overs_to_balls(overs)
if balls <= 0:
    # The run rate divides by balls bowled, so it is undefined before the first delivery.
    raise ValueError("overs must describe at least one ball bowled")
# Floor at zero, matching how balls_left was prepared for training.
balls_left = max(120 - balls, 0)
wickets_left = 10 - wickets
crr = current_score*6/balls

# --- look up the historical averages used as features --------------------------
batting_team_avg = bat_avg[team.index(batting_team)]
bowling_team_avg = bowl_avg[team.index(bowling_team)]
venue_avg = c_avg[cities.index(city)]

data = pd.DataFrame({'batting_team':[batting_team], 'bowling_team':[bowling_team], 'city':[city],
                    'current_score':[current_score], 'balls_left':[balls_left],'wickets_left':[wickets_left],
                     'crr':[crr],'last_five':[runs_last_five], 'batting_team_avg':[batting_team_avg],
                     'bowling_team_avg':[bowling_team_avg], 'city_avg':[venue_avg]})

# predict() returns a one-element array; index it explicitly instead of relying
# on the deprecated implicit array-to-scalar conversion.
predicted = int(model.predict(data)[0])

# Simple projections: current score plus the remaining overs at a given run rate.
runs_c = current_score+int(crr*balls_left/6)
runs_6 = current_score+int(6*balls_left/6)
runs_8 = current_score+int(8*balls_left/6)
runs_10 = current_score+int(10*balls_left/6)
runs_12 = current_score+int(12*balls_left/6)

print(f"Projected Score:-{predicted}\nBy CRR:-{runs_c}\n6 RPO:-{runs_6}\n8 RPO:-{runs_8}\n10 RPO:-{runs_10}\n12 RPO:-{runs_12}")


Projected Score:-123
By CRR:-131
6 RPO:-125
8 RPO:-147
10 RPO:-169
12 RPO:-191
