# Feature Extraction

The goal of the following code will be to use the data extracted to compute the values needed to serve as decision factors for our model.

## 1. 1st Innings

In [1]:
# Start by importing the relevant libraries.
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [4]:
# Load the delivery-level DataFrame exported by the data_extraction step.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
# The context manager closes the file handle as soon as the load finishes.
with open('dataset_level2_first_innings.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

From here on, we will work on modifying this DataFrame to contain data for the 8 features we want to use for the development of our model. These features are as follows: batting team, bowling team, current score, wickets left, current run rate, city, balls left, last five overs (runs scored).

In [5]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,1,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,1,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,1,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,1,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,1,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


2/8 features have been extracted.

In [6]:
# First feature to clean: city.
# How many deliveries have no city recorded?
df['city'].isna().sum()

12360

In [7]:
# Which venues do the deliveries without a city belong to?
missing_city_mask = df['city'].isnull()
df.loc[missing_city_mask, 'venue'].value_counts()

venue
Dubai International Cricket Stadium        3425
Harare Sports Club                         2731
Pallekele International Cricket Stadium    2066
Melbourne Cricket Ground                   1453
Sharjah Cricket Stadium                     757
Sydney Cricket Ground                       749
Adelaide Oval                               498
Rawalpindi Cricket Stadium                  368
Sylhet International Cricket Stadium        128
Sylhet Stadium                              121
Carrara Oval                                 64
Name: count, dtype: int64

In [8]:
# The venue counts above show that, for rows with a missing city, the first word
# of the venue name is a usable city (e.g. "Melbourne Cricket Ground" -> "Melbourne").
# .str[0] picks the first token and yields NaN for a missing/empty venue instead of
# raising, unlike indexing each split list by hand.
venue_first_word = df['venue'].str.split().str[0]
# np.where keeps the existing city where present and falls back to the venue token otherwise.
cities = np.where(df['city'].isnull(), venue_first_word, df['city'])
df['city'] = cities

In [9]:
# Per-column count of missing values; every column should report 0 at this point.
df.isna().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [10]:
# venue is not a model feature and is now redundant with city, so drop it.
df = df.drop(columns=['venue'])

In [11]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo


In [12]:
# Keep only cities with enough data to train the model on.
# The threshold is expressed in deliveries (rows), not matches: more than 600
# deliveries is roughly 5 first innings of 120 balls each.
MIN_DELIVERIES_PER_CITY = 600
city_counts = df['city'].value_counts()  # computed once and reused for the mask
eligible_cities = city_counts[city_counts > MIN_DELIVERIES_PER_CITY].index.tolist()
# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments do not write into a slice of the original data.
df = df[df['city'].isin(eligible_cities)].copy()

In [13]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo


3/8 features have been extracted.

In [14]:
# Next feature: current score. The frame only holds the runs scored on each
# individual delivery, so take a running total of runs within every match.
# groupby splits the rows by match_id; cumsum then accumulates inside each group.
runs_by_match = df.groupby('match_id')['runs']
df.loc[:, 'current_score'] = runs_by_match.cumsum()

In [15]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,0
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,0
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,1
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,3
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,3
...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo,125
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo,125
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo,127


4/8 features have been extracted.

In [16]:
# Next feature: balls left.
# 'ball' is written as <over>.<ball in over>; split its text form on the dot.
# NOTE(review): if 'ball' is stored as a float, a 10th delivery in an over (x.10)
# would read the same as x.1 - confirm the upstream format.
ball_parts = df['ball'].map(str).str.split(".")
overs = ball_parts.str[0]
ball_no = ball_parts.str[1]
# Deliveries bowled so far: 6 per completed over plus the ball number in the current over.
balls_bowled = overs.astype('int') * 6 + ball_no.astype('int')  # No need to account for extras.
balls_left = 120 - balls_bowled
# Negative values are floored at 0 before being stored.
df.loc[:, 'balls_left'] = balls_left.clip(lower=0)
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,balls_left
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,0,119
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,0,118
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,1,117
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,3,116
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,3,115
...,...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo,125,3
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo,125,2
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,1
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo,127,0


5/8 features have been extracted.

In [17]:
# Next feature: wickets left.
# Flag each delivery with 1 when a player was dismissed ('0' marks no dismissal),
# accumulate the flags within each match, and subtract from the 10 available wickets.
dismissal_flag = (df['player_dismissed'] != '0').astype('int')
wickets_fallen = dismissal_flag.groupby(df['match_id']).cumsum()
df.loc[:, 'wickets_left'] = 10 - wickets_fallen
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,balls_left,wickets_left
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,0,119,10
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,0,118,10
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,1,117,10
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,3,116,10
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,3,115,10
...,...,...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo,125,3,2
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo,125,2,2
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,1,1
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo,127,0,1


6/8 features extracted.

In [18]:
# Next feature: current run rate, i.e. runs per over so far
# (current_score * 6 / balls_bowled, with balls_bowled from the balls-left cell).
df.loc[:, 'crr'] = df['current_score'].mul(6).div(balls_bowled)
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,balls_left,wickets_left,crr
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,0,119,10,0.000000
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,0,118,10,0.000000
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,1,117,10,2.000000
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,3,116,10,4.500000
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,3,115,10,3.600000
...,...,...,...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo,125,3,2,6.410256
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo,125,2,2,6.355932
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,1,1,6.302521
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo,127,0,1,6.350000


7/8 features extracted.

In [19]:
# Last feature: runs scored over the previous 5 overs (a 30-delivery rolling window).
# The rolling sum is computed inside each match so a window never spans two matches;
# deliveries without a full window behind them (the first 29 of each match) come out as NaN.
LAST_FIVE_WINDOW = 30  # 5 overs x 6 deliveries
# transform() returns one value per row in the DataFrame's own row order, so the
# resulting list lines up with df row-for-row even if a match's deliveries are not
# stored contiguously. It also avoids a Python loop that shadowed the builtin `id`.
last_five = (
    df.groupby('match_id')['runs']
      .transform(lambda match_runs: match_runs.rolling(window=LAST_FIVE_WINDOW).sum())
      .tolist()
)

In [20]:
# Store the rolling 5-over totals as a new column.
df['last_five'] = last_five

In [21]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,balls_left,wickets_left,crr,last_five
0,1,Australia,Sri Lanka,0.1,0,0,Melbourne,0,119,10,0.000000,
1,1,Australia,Sri Lanka,0.2,0,0,Melbourne,0,118,10,0.000000,
2,1,Australia,Sri Lanka,0.3,1,0,Melbourne,1,117,10,2.000000,
3,1,Australia,Sri Lanka,0.4,2,0,Melbourne,3,116,10,4.500000,
4,1,Australia,Sri Lanka,0.5,0,0,Melbourne,3,115,10,3.600000,
...,...,...,...,...,...,...,...,...,...,...,...,...
115320,963,Sri Lanka,Australia,19.3,1,0,Colombo,125,3,2,6.410256,32.0
115321,963,Sri Lanka,Australia,19.4,0,0,Colombo,125,2,2,6.355932,32.0
115322,963,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,1,1,6.302521,32.0
115323,963,Sri Lanka,Australia,19.6,2,0,Colombo,127,0,1,6.350000,33.0


8/8 features have been extracted.

In [22]:
# Model output: the final innings score.
# Sum the runs per match, then attach that total to every delivery of the match.
# Both frames carry a 'runs' column, so after the merge the match total is named
# 'runs_x' and the per-delivery runs 'runs_y'.
match_totals = df.groupby('match_id')['runs'].sum().reset_index()
final_df = match_totals.merge(df, on='match_id')

In [23]:
final_df

Unnamed: 0,match_id,runs_x,batting_team,bowling_team,ball,runs_y,player_dismissed,city,current_score,balls_left,wickets_left,crr,last_five
0,1,168,Australia,Sri Lanka,0.1,0,0,Melbourne,0,119,10,0.000000,
1,1,168,Australia,Sri Lanka,0.2,0,0,Melbourne,0,118,10,0.000000,
2,1,168,Australia,Sri Lanka,0.3,1,0,Melbourne,1,117,10,2.000000,
3,1,168,Australia,Sri Lanka,0.4,2,0,Melbourne,3,116,10,4.500000,
4,1,168,Australia,Sri Lanka,0.5,0,0,Melbourne,3,115,10,3.600000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60013,963,128,Sri Lanka,Australia,19.3,1,0,Colombo,125,3,2,6.410256,32.0
60014,963,128,Sri Lanka,Australia,19.4,0,0,Colombo,125,2,2,6.355932,32.0
60015,963,128,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,1,1,6.302521,32.0
60016,963,128,Sri Lanka,Australia,19.6,2,0,Colombo,127,0,1,6.350000,33.0


In [24]:
# Keep only the 8 model features plus the target (runs_x = final innings total).
model_columns = [
    'batting_team', 'bowling_team', 'city',
    'current_score', 'balls_left', 'wickets_left',
    'crr', 'last_five',
    'runs_x',
]
final_df = final_df[model_columns]

In [25]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
0,Australia,Sri Lanka,Melbourne,0,119,10,0.000000,,168
1,Australia,Sri Lanka,Melbourne,0,118,10,0.000000,,168
2,Australia,Sri Lanka,Melbourne,1,117,10,2.000000,,168
3,Australia,Sri Lanka,Melbourne,3,116,10,4.500000,,168
4,Australia,Sri Lanka,Melbourne,3,115,10,3.600000,,168
...,...,...,...,...,...,...,...,...,...
60013,Sri Lanka,Australia,Colombo,125,3,2,6.410256,32.0,128
60014,Sri Lanka,Australia,Colombo,125,2,2,6.355932,32.0,128
60015,Sri Lanka,Australia,Colombo,125,1,1,6.302521,32.0,128
60016,Sri Lanka,Australia,Colombo,127,0,1,6.350000,33.0,128


In [26]:
# Drop every row that has a NaN in any column. These are expected to be the
# deliveries whose last_five window is incomplete (the start of each match).
# Rebinding instead of inplace=True avoids mutating a frame that was derived by
# column selection from another DataFrame.
final_df = final_df.dropna()

In [27]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
29,Australia,Sri Lanka,Melbourne,43,90,10,8.600000,43.0,168
30,Australia,Sri Lanka,Melbourne,44,89,10,8.516129,44.0,168
31,Australia,Sri Lanka,Melbourne,45,88,10,8.437500,45.0,168
32,Australia,Sri Lanka,Melbourne,45,87,10,8.181818,44.0,168
33,Australia,Sri Lanka,Melbourne,45,86,10,7.941176,42.0,168
...,...,...,...,...,...,...,...,...,...
60013,Sri Lanka,Australia,Colombo,125,3,2,6.410256,32.0,128
60014,Sri Lanka,Australia,Colombo,125,2,2,6.355932,32.0,128
60015,Sri Lanka,Australia,Colombo,125,1,1,6.302521,32.0,128
60016,Sri Lanka,Australia,Colombo,127,0,1,6.350000,33.0,128


In [28]:
# Shuffle the rows so that deliveries of the same match are no longer adjacent,
# reducing the chance of ordering bias. A fixed seed makes the shuffle (and hence
# the exported dataset) reproducible across Restart & Run All.
RANDOM_STATE = 42
final_df = final_df.sample(frac=1, random_state=RANDOM_STATE)

In [29]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
17575,England,Australia,Southampton,117,33,5,8.068966,28.0,179
19251,England,Zimbabwe,Cape Town,51,89,8,9.870968,49.0,188
14747,Australia,England,Southampton,109,25,5,6.884211,40.0,157
25818,South Africa,India,Nottingham,44,91,9,9.103448,44.0,130
54916,Sri Lanka,West Indies,Pallekele,136,28,8,8.869565,40.0,215
...,...,...,...,...,...,...,...,...,...
23629,India,Bangladesh,Nottingham,67,68,9,7.730769,35.0,180
57548,Australia,Pakistan,Chandigarh,106,41,7,8.050633,42.0,193
24728,Sri Lanka,Pakistan,London,73,78,10,10.428571,47.0,150
10773,New Zealand,Sri Lanka,Auckland,128,21,4,7.757576,52.0,179


In [30]:
# Persist the final first-innings DataFrame for the modelling step.
# pickle is already imported in the first cell; the context manager guarantees the
# file is flushed and closed once the dump completes.
with open('dataset_level3_first_innings.pkl', 'wb') as pkl_file:
    pickle.dump(final_df, pkl_file)

## 2. 2nd Innings

We will curate the same features as for the 1st Innings. The main differences are that the data comes from the 2nd Innings and that the output is the match outcome from the batting team's perspective, instead of the total runs computed for the 1st Innings.

In [2]:
# Load the 2nd-innings DataFrame exported by the data extraction step.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
# The context manager closes the file handle as soon as the load finishes.
with open('dataset_level2_second_innings.pkl', 'rb') as pkl_file:
    df2 = pickle.load(pkl_file)

In [3]:
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,winner
0,1,Sri Lanka,Australia,0.1,1,0,,Melbourne Cricket Ground,Sri Lanka
1,1,Sri Lanka,Australia,0.2,1,0,,Melbourne Cricket Ground,Sri Lanka
2,1,Sri Lanka,Australia,0.3,0,0,,Melbourne Cricket Ground,Sri Lanka
3,1,Sri Lanka,Australia,0.4,0,0,,Melbourne Cricket Ground,Sri Lanka
4,1,Sri Lanka,Australia,0.5,3,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,R Premadasa Stadium,Australia
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,R Premadasa Stadium,Australia
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,R Premadasa Stadium,Australia
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,R Premadasa Stadium,Australia


We start with curating the features in the same manner as 1st innings.
2/8 features extracted so far.

In [4]:
# First feature to clean: city.
# How many deliveries have no city recorded?
df2['city'].isna().sum()

11265

In [5]:
# Which venues do the deliveries without a city belong to?
missing_city_mask = df2['city'].isnull()
df2.loc[missing_city_mask, 'venue'].value_counts()

venue
Dubai International Cricket Stadium        3250
Harare Sports Club                         2494
Pallekele International Cricket Stadium    1926
Melbourne Cricket Ground                   1153
Sharjah Cricket Stadium                     704
Sydney Cricket Ground                       643
Adelaide Oval                               480
Rawalpindi Cricket Stadium                  307
Sylhet Stadium                              123
Sylhet International Cricket Stadium        121
Carrara Oval                                 64
Name: count, dtype: int64

In [6]:
# The venue counts above show that, for rows with a missing city, the first word
# of the venue name is a usable city (e.g. "Melbourne Cricket Ground" -> "Melbourne").
# .str[0] picks the first token and yields NaN for a missing/empty venue instead of
# raising, unlike indexing each split list by hand.
venue_first_word = df2['venue'].str.split().str[0]
# np.where keeps the existing city where present and falls back to the venue token otherwise.
cities = np.where(df2['city'].isnull(), venue_first_word, df2['city'])
df2['city'] = cities

In [7]:
# Confirm that no delivery is left without a city.
df2['city'].isna().sum()

0

In [8]:
# venue is not a model feature and is now redundant with city, so drop it.
df2 = df2.drop(columns=['venue'])

In [9]:
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,winner
0,1,Sri Lanka,Australia,0.1,1,0,Melbourne,Sri Lanka
1,1,Sri Lanka,Australia,0.2,1,0,Melbourne,Sri Lanka
2,1,Sri Lanka,Australia,0.3,0,0,Melbourne,Sri Lanka
3,1,Sri Lanka,Australia,0.4,0,0,Melbourne,Sri Lanka
4,1,Sri Lanka,Australia,0.5,3,0,Melbourne,Sri Lanka
...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,Australia
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,Australia
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,Australia
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,Australia


3/8 features have been extracted.

In [10]:
# Next feature: current score. The frame only holds the runs scored on each
# individual delivery, so take a running total of runs within every match.
# groupby splits the rows by match_id; cumsum then accumulates inside each group.
runs_by_match = df2.groupby('match_id')['runs']
df2.loc[:, 'current_score'] = runs_by_match.cumsum()

In [11]:
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,winner,current_score
0,1,Sri Lanka,Australia,0.1,1,0,Melbourne,Sri Lanka,1
1,1,Sri Lanka,Australia,0.2,1,0,Melbourne,Sri Lanka,2
2,1,Sri Lanka,Australia,0.3,0,0,Melbourne,Sri Lanka,2
3,1,Sri Lanka,Australia,0.4,0,0,Melbourne,Sri Lanka,2
4,1,Sri Lanka,Australia,0.5,3,0,Melbourne,Sri Lanka,5
...,...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,Australia,121
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,Australia,124
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,Australia,124
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,Australia,124


4/8 features have been extracted.

In [12]:
# Next feature: balls left.
# 'ball' is written as <over>.<ball in over>; split its text form on the dot.
# NOTE(review): if 'ball' is stored as a float, a 10th delivery in an over (x.10)
# would read the same as x.1 - confirm the upstream format.
ball_parts = df2['ball'].map(str).str.split(".")
overs = ball_parts.str[0]
ball_no = ball_parts.str[1]
# Deliveries bowled so far: 6 per completed over plus the ball number in the current over.
balls_bowled = overs.astype('int') * 6 + ball_no.astype('int')  # No need to account for extras.
balls_left = 120 - balls_bowled
# Negative values are floored at 0 before being stored.
df2.loc[:, 'balls_left'] = balls_left.clip(lower=0)
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,winner,current_score,balls_left
0,1,Sri Lanka,Australia,0.1,1,0,Melbourne,Sri Lanka,1,119
1,1,Sri Lanka,Australia,0.2,1,0,Melbourne,Sri Lanka,2,118
2,1,Sri Lanka,Australia,0.3,0,0,Melbourne,Sri Lanka,2,117
3,1,Sri Lanka,Australia,0.4,0,0,Melbourne,Sri Lanka,2,116
4,1,Sri Lanka,Australia,0.5,3,0,Melbourne,Sri Lanka,5,115
...,...,...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,Australia,121,17
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,Australia,124,16
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,Australia,124,15
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,Australia,124,14


5/8 features have been extracted.

In [13]:
# Next feature: wickets left.
# Flag each delivery with 1 when a player was dismissed ('0' marks no dismissal),
# accumulate the flags within each match, and subtract from the 10 available wickets.
dismissal_flag = (df2['player_dismissed'] != '0').astype('int')
wickets_fallen = dismissal_flag.groupby(df2['match_id']).cumsum()
df2.loc[:, 'wickets_left'] = 10 - wickets_fallen
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,winner,current_score,balls_left,wickets_left
0,1,Sri Lanka,Australia,0.1,1,0,Melbourne,Sri Lanka,1,119,10
1,1,Sri Lanka,Australia,0.2,1,0,Melbourne,Sri Lanka,2,118,10
2,1,Sri Lanka,Australia,0.3,0,0,Melbourne,Sri Lanka,2,117,10
3,1,Sri Lanka,Australia,0.4,0,0,Melbourne,Sri Lanka,2,116,10
4,1,Sri Lanka,Australia,0.5,3,0,Melbourne,Sri Lanka,5,115,10
...,...,...,...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,Australia,121,17,4
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,Australia,124,16,4
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,Australia,124,15,4
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,Australia,124,14,4


6/8 features have been extracted.

In [14]:
# Next feature: current run rate, i.e. runs per over so far
# (current_score * 6 / balls_bowled, with balls_bowled from the balls-left cell).
df2.loc[:, 'crr'] = df2['current_score'].mul(6).div(balls_bowled)
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,winner,current_score,balls_left,wickets_left,crr
0,1,Sri Lanka,Australia,0.1,1,0,Melbourne,Sri Lanka,1,119,10,6.000000
1,1,Sri Lanka,Australia,0.2,1,0,Melbourne,Sri Lanka,2,118,10,6.000000
2,1,Sri Lanka,Australia,0.3,0,0,Melbourne,Sri Lanka,2,117,10,4.000000
3,1,Sri Lanka,Australia,0.4,0,0,Melbourne,Sri Lanka,2,116,10,3.000000
4,1,Sri Lanka,Australia,0.5,3,0,Melbourne,Sri Lanka,5,115,10,6.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,Australia,121,17,4,7.048544
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,Australia,124,16,4,7.153846
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,Australia,124,15,4,7.085714
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,Australia,124,14,4,7.018868


7/8 features have been extracted.

In [16]:
# Last feature: runs scored over the previous 5 overs (a 30-delivery rolling window).
# The rolling sum is computed inside each match so a window never spans two matches;
# deliveries without a full window behind them (the first 29 of each match) come out as NaN.
LAST_FIVE_WINDOW = 30  # 5 overs x 6 deliveries
# transform() returns one value per row in the DataFrame's own row order, so the
# resulting list lines up with df2 row-for-row even if a match's deliveries are not
# stored contiguously. It also avoids a Python loop that shadowed the builtin `id`.
last_five = (
    df2.groupby('match_id')['runs']
       .transform(lambda match_runs: match_runs.rolling(window=LAST_FIVE_WINDOW).sum())
       .tolist()
)

In [17]:
# Store the rolling 5-over totals as a new column.
df2['last_five'] = last_five

In [18]:
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,winner,current_score,balls_left,wickets_left,crr,last_five
0,1,Sri Lanka,Australia,0.1,1,0,Melbourne,Sri Lanka,1,119,10,6.000000,
1,1,Sri Lanka,Australia,0.2,1,0,Melbourne,Sri Lanka,2,118,10,6.000000,
2,1,Sri Lanka,Australia,0.3,0,0,Melbourne,Sri Lanka,2,117,10,4.000000,
3,1,Sri Lanka,Australia,0.4,0,0,Melbourne,Sri Lanka,2,116,10,3.000000,
4,1,Sri Lanka,Australia,0.5,3,0,Melbourne,Sri Lanka,5,115,10,6.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,Australia,121,17,4,7.048544,18.0
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,Australia,124,16,4,7.153846,21.0
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,Australia,124,15,4,7.085714,21.0
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,Australia,124,14,4,7.018868,21.0


8/8 features have been extracted.

In [22]:
# Model output: the winner of the match.
# To simplify processing for the model, 'winner' will become 1 or 0
# (1 when the batting team is the winner).

# Before encoding, drop every row with a NaN in any column. This removes matches
# that have no winner as well as rows whose last_five value is NaN.
df2 = df2.dropna()

In [34]:
# Encode the target: 1 if the batting team won the match, 0 otherwise.
# Guard on dtype so that re-running this cell is harmless: once 'winner' already
# holds 0/1 integers, comparing it with team names again would set every row to 0.
if not pd.api.types.is_integer_dtype(df2['winner']):
    winner_values = np.where(df2['batting_team'] == df2['winner'], 1, 0)
    df2['winner'] = winner_values

In [35]:
df2

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,winner,current_score,balls_left,wickets_left,crr,last_five
29,1,Sri Lanka,Australia,4.3,0,0,Melbourne,1,50,93,9,11.111111,50.0
30,1,Sri Lanka,Australia,4.4,0,0,Melbourne,1,50,92,9,10.714286,49.0
31,1,Sri Lanka,Australia,4.5,1,0,Melbourne,1,51,91,9,10.551724,49.0
32,1,Sri Lanka,Australia,4.6,0,0,Melbourne,1,51,90,9,10.200000,49.0
33,1,Sri Lanka,Australia,5.1,4,0,Melbourne,1,55,89,9,10.645161,53.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
104302,963,Australia,Sri Lanka,17.1,1,0,Colombo,1,121,17,4,7.048544,18.0
104303,963,Australia,Sri Lanka,17.2,3,0,Colombo,1,124,16,4,7.153846,21.0
104304,963,Australia,Sri Lanka,17.3,0,0,Colombo,1,124,15,4,7.085714,21.0
104305,963,Australia,Sri Lanka,17.4,0,0,Colombo,1,124,14,4,7.018868,21.0


In [38]:
# Keep only the 8 model features plus the target ('winner'), as an independent copy.
# Column selection alone returns a frame derived from df2; the explicit .copy()
# makes final_df2 safe to modify without touching df2.
final_df2 = df2[['batting_team', 'bowling_team', 'city', 'current_score', 'balls_left', 'wickets_left', 'crr', 'last_five', 'winner']].copy()

In [39]:
final_df2

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,winner
29,Sri Lanka,Australia,Melbourne,50,93,9,11.111111,50.0,1
30,Sri Lanka,Australia,Melbourne,50,92,9,10.714286,49.0,1
31,Sri Lanka,Australia,Melbourne,51,91,9,10.551724,49.0,1
32,Sri Lanka,Australia,Melbourne,51,90,9,10.200000,49.0,1
33,Sri Lanka,Australia,Melbourne,55,89,9,10.645161,53.0,1
...,...,...,...,...,...,...,...,...,...
104302,Australia,Sri Lanka,Colombo,121,17,4,7.048544,18.0,1
104303,Australia,Sri Lanka,Colombo,124,16,4,7.153846,21.0,1
104304,Australia,Sri Lanka,Colombo,124,15,4,7.085714,21.0,1
104305,Australia,Sri Lanka,Colombo,124,14,4,7.018868,21.0,1


In [40]:
# Shuffle the rows with a fixed seed so the exported dataset is reproducible,
# then persist it for the modelling step.
RANDOM_STATE = 42
final_df2 = final_df2.sample(frac=1, random_state=RANDOM_STATE)
# The context manager guarantees the file is flushed and closed after the dump.
with open('dataset_level3_second_innings.pkl', 'wb') as pkl_file:
    pickle.dump(final_df2, pkl_file)