## Import Required Libraries

In [1]:
import pandas as pd
import numpy as np

import warnings
# Suppress every warning category so cell outputs stay uncluttered.
warnings.filterwarnings(action='ignore')

# Disable pandas' display truncation: show all columns and all rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Loading the combined T20Is CSV

In [2]:
# Read the combined T20I data from the Material folder.
df = pd.read_csv(filepath_or_buffer='../Material/T20Is.csv')

## Feature Engineering (Adding features that will help in Predictions)

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1217744,2019/20,2020-03-04,Terdthai Cricket Ground,1,0.1,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,,,,
1,1217744,2019/20,2020-03-04,Terdthai Cricket Ground,1,0.2,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,,,,
2,1217744,2019/20,2020-03-04,Terdthai Cricket Ground,1,0.3,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,,,,
3,1217744,2019/20,2020-03-04,Terdthai Cricket Ground,1,0.4,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,,,,
4,1217744,2019/20,2020-03-04,Terdthai Cricket Ground,1,0.5,Thailand,Nepal,DF Jacobs,N Pathan,S Lamichhane,0,0,,,,,,,,,


In [4]:
# Preview the last five rows of the raw data.
df.tail(n=5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
398612,1142914,2018,2018-07-02,Harare Sports Club,2,10.1,Australia,Pakistan,TM Head,AJ Finch,Shadab Khan,1,0,,,,,,,,,
398613,1142914,2018,2018-07-02,Harare Sports Club,2,10.2,Australia,Pakistan,AJ Finch,TM Head,Shadab Khan,1,0,,,,,,,,,
398614,1142914,2018,2018-07-02,Harare Sports Club,2,10.3,Australia,Pakistan,TM Head,AJ Finch,Shadab Khan,1,0,,,,,,,,,
398615,1142914,2018,2018-07-02,Harare Sports Club,2,10.4,Australia,Pakistan,AJ Finch,TM Head,Shadab Khan,6,0,,,,,,,,,
398616,1142914,2018,2018-07-02,Harare Sports Club,2,10.5,Australia,Pakistan,AJ Finch,TM Head,Shadab Khan,4,0,,,,,,,,,


In [5]:
# Distinct batting teams present in the raw data.
df['batting_team'].unique()

array(['Thailand', 'Nepal', 'Malta', 'Belgium', 'Namibia', 'Uganda',
       'Japan', 'South Korea', 'Oman', 'Hong Kong', 'Australia',
       'Pakistan', 'Ireland', 'Afghanistan', 'West Indies', 'England',
       'Portugal', 'Spain', 'India', 'South Africa', 'Zimbabwe',
       'Botswana', 'Ghana', 'Bahrain', 'Philippines',
       'United Arab Emirates', 'Sri Lanka', 'Bangladesh', 'New Zealand',
       'Canada', 'Luxembourg', 'Hungary', 'Netherlands', 'Nigeria',
       'Sierra Leone', 'Tanzania', 'Mozambique', 'Belize',
       'United States of America', 'Czech Republic', 'Bulgaria', 'Norway',
       'Guernsey', 'Bermuda', 'Jersey', 'Panama', 'Cameroon', 'Scotland',
       'Romania', 'Papua New Guinea', 'Germany', 'Kenya', 'Singapore',
       'Gibraltar', 'Indonesia', 'Malaysia', 'Italy', 'Qatar', 'Finland',
       'Sweden', 'Bahamas', 'Argentina', 'Maldives', 'Saudi Arabia',
       'Denmark', 'Malawi', 'Vanuatu', 'Serbia', 'Greece', 'ICC World XI',
       'Bhutan', 'Rwanda', 'Seychelles

In [6]:
# Teams to keep; deliveries involving any other side are filtered out below.
major_teams = [
    'Australia',
    'Bangladesh',
    'England',
    'India',
    'New Zealand',
    'Pakistan',
    'South Africa',
    'Sri Lanka',
    'West Indies',
]

In [7]:
# Keep only deliveries where BOTH the batting and the bowling side are major
# teams. The two membership tests are combined into one boolean mask, and
# .copy() makes the result an independent frame so that the column
# assignments in later cells do not write into a slice of the loaded data
# (which would otherwise raise pandas' SettingWithCopyWarning).
is_major_fixture = df['batting_team'].isin(major_teams) & df['bowling_team'].isin(major_teams)
df = df[is_major_fixture].copy()

In [8]:
# Batting teams remaining after the filter.
df.loc[:, 'batting_team'].unique()

array(['Australia', 'Pakistan', 'West Indies', 'England', 'India',
       'South Africa', 'Sri Lanka', 'Bangladesh', 'New Zealand'],
      dtype=object)

In [9]:
# Bowling teams remaining after the filter.
df.loc[:, 'bowling_team'].unique()

array(['Pakistan', 'Australia', 'England', 'West Indies', 'India',
       'South Africa', 'Bangladesh', 'Sri Lanka', 'New Zealand'],
      dtype=object)

In [10]:
# Inspect the column labels of the filtered frame.
df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

### Creating the required feature data using the current data

In [11]:
# Order rows by match, then innings (ascending is the default for both keys),
# so that each innings forms one contiguous run of rows.
df = df.sort_values(by=['match_id', 'innings'])
df.head(n=5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
63598,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,GO Jones,B Lee,0,0,,,,,,,,,
63599,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,GO Jones,B Lee,1,0,,,,,,,,,
63600,211028,2005,2005-06-13,The Rose Bowl,1,0.3,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,
63601,211028,2005,2005-06-13,The Rose Bowl,1,0.4,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,
63602,211028,2005,2005-06-13,The Rose Bowl,1,0.5,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,


In [12]:
# Duplicate the raw ball value (e.g. 0.1) into two new columns: 'over' is
# reduced to the integer over number in the next cell, while 'overs' keeps
# the original over.ball value.
df = df.assign(over=df['ball'], overs=df['ball'])

In [13]:
# Truncate the over.ball value to its integer part, i.e. the over number.
df['over'] = df['over'].astype(np.int64)

In [14]:
# The last character of the value's string form is taken as the delivery
# number within the over (e.g. 0.3 -> '0.3' -> 3).
# NOTE(review): if the source file encodes a tenth delivery as 0.10, a float
# parse would make it indistinguishable from 0.1 -- confirm against the data.
df['ball'] = (
    df['ball']
    .astype(str)
    .str[-1:]
    .astype(np.int64)
)

In [15]:
# Check the derived 'over', 'ball' and 'overs' columns.
df.head(n=5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,over,overs
63598,211028,2005,2005-06-13,The Rose Bowl,1,1,England,Australia,ME Trescothick,GO Jones,B Lee,0,0,,,,,,,,,,0,0.1
63599,211028,2005,2005-06-13,The Rose Bowl,1,2,England,Australia,ME Trescothick,GO Jones,B Lee,1,0,,,,,,,,,,0,0.2
63600,211028,2005,2005-06-13,The Rose Bowl,1,3,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,,0,0.3
63601,211028,2005,2005-06-13,The Rose Bowl,1,4,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,,0,0.4
63602,211028,2005,2005-06-13,The Rose Bowl,1,5,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,,0,0.5


In [16]:
# Runs credited on each delivery: runs off the bat plus extras.
df['total_runs'] = df['runs_off_bat'].add(df['extras'])

In [17]:
# Check the new 'total_runs' column.
df.head(n=5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,over,overs,total_runs
63598,211028,2005,2005-06-13,The Rose Bowl,1,1,England,Australia,ME Trescothick,GO Jones,B Lee,0,0,,,,,,,,,,0,0.1,0
63599,211028,2005,2005-06-13,The Rose Bowl,1,2,England,Australia,ME Trescothick,GO Jones,B Lee,1,0,,,,,,,,,,0,0.2,1
63600,211028,2005,2005-06-13,The Rose Bowl,1,3,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,,0,0.3,0
63601,211028,2005,2005-06-13,The Rose Bowl,1,4,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,,0,0.4,0
63602,211028,2005,2005-06-13,The Rose Bowl,1,5,England,Australia,GO Jones,ME Trescothick,B Lee,0,0,,,,,,,,,,0,0.5,0


In [18]:
# Expose match_id under the shorter name 'id', then keep only the columns
# used to build the features below.
df = df.rename(columns={'match_id': 'id'})[
    [
        'id',
        'innings',
        'batting_team',
        'bowling_team',
        'overs',
        'over',
        'ball',
        'total_runs',
        'player_dismissed',
    ]
]

In [19]:
# Replace every missing value with 0 (in 'player_dismissed' this marks
# deliveries on which no dismissal was recorded).
df = df.fillna(0)

### Total runs scored by the team in an innings

In [20]:
# Broadcast the sum of total_runs for each (match, innings) group onto every
# row of that group.
df['total'] = (
    df.groupby(by=['id', 'innings'])['total_runs']
    .transform('sum')
)
df.head(n=5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total
63598,211028,1,England,Australia,0.1,0,1,0,0,179
63599,211028,1,England,Australia,0.2,0,2,1,0,179
63600,211028,1,England,Australia,0.3,0,3,0,0,179
63601,211028,1,England,Australia,0.4,0,4,0,0,179
63602,211028,1,England,Australia,0.5,0,5,0,0,179


### Runs scored by the team up to the current ball

In [21]:
# Running total of runs within each (match, innings) group, in row order.
# GroupBy.cumsum() returns a Series indexed exactly like df, so the
# assignment aligns row-for-row. The earlier apply(lambda x: x.cumsum()) form
# is pandas-version dependent: newer releases prepend the group keys to the
# result's index, which can no longer be aligned with df on assignment.
df['total_score'] = df.groupby(['id', 'innings'])['total_runs'].cumsum()
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1


### Runs scored by a team in the previous 30 balls

In [22]:
# Per-innings rolling sum of total_runs over a 30-row window that ends at
# (and includes) the current row; min_periods=1 yields partial sums while
# fewer than 30 rows are available.
tmp = (
    df.groupby(by=['id', 'innings'])['total_runs']
    .rolling(window=30, min_periods=1)
    .sum()
    .reset_index()
)
tmp[['total_runs']].head(n=5)

Unnamed: 0,total_runs
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0


In [23]:
# Copy the rolling sums into df by position (the index of tmp is ignored).
# NOTE(review): this relies on tmp's rows being in the same order as df's
# rows -- presumably guaranteed by the earlier sort on match and innings;
# confirm.
df['prev_30_runs'] = tmp['total_runs'].to_numpy()
df.head(n=10)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0
63603,211028,1,England,Australia,0.6,0,6,1,0,179,2,2.0
63604,211028,1,England,Australia,0.7,0,7,2,0,179,4,4.0
63605,211028,1,England,Australia,1.1,1,1,0,0,179,4,4.0
63606,211028,1,England,Australia,1.2,1,2,0,0,179,4,4.0
63607,211028,1,England,Australia,1.3,1,3,1,0,179,5,5.0


### Wickets fallen in the previous 30 balls

In [24]:
# Distinct values of 'player_dismissed': 0 where nothing was recorded,
# otherwise the dismissed player's name.
df.loc[:, 'player_dismissed'].unique()

array([0, 'GO Jones', 'A Flintoff', 'KP Pietersen', 'MP Vaughan',
       'ME Trescothick', 'AJ Strauss', 'VS Solanki', 'PD Collingwood',
       'AC Gilchrist', 'ML Hayden', 'MJ Clarke', 'A Symonds',
       'MEK Hussey', 'RT Ponting', 'DR Martyn', 'JN Gillespie', 'B Lee',
       'GD McGrath', 'SM Katich', 'SP Fleming', 'MS Sinclair',
       'BB McCullum', 'CD McMillan', 'CL Cairns', 'HJH Marshall',
       'AR Adams', 'SB Styris', 'JW Wilson', 'KD Mills', 'GC Smith',
       'JH Kallis', 'HH Gibbs', 'JM Kemp', 'SM Pollock', 'MV Boucher',
       'AG Prince', 'CK Langeveldt', 'JA Morkel', 'M Ntini', 'NJ Astle',
       'IR Bell', 'JWM Dalrymple', 'CMW Read', 'Shoaib Malik',
       'Younis Khan', 'Shahid Afridi', 'Mohammad Yousuf',
       'Mohammad Hafeez', 'WU Tharanga', 'DPMD Jayawardene',
       'ST Jayasuriya', 'TM Dilshan', 'RP Arnold', 'KC Sangakkara',
       'MF Maharoof', 'CK Kapugedera', 'SL Malinga', 'CRD Fernando',
       'JR Hopes', 'HH Dippenaar', 'AJ Hall', 'J Botha', 'GJP Kruge

In [25]:
# Collapse the column to a wicket indicator: 1 when a dismissed player is
# recorded on the delivery, 0 otherwise.
df['player_dismissed'] = np.where(df['player_dismissed'] != 0, 1, 0)

In [26]:
# Per-innings rolling count of wickets over a 30-row window that ends at
# (and includes) the current row.
tmp = (
    df.groupby(by=['id', 'innings'])['player_dismissed']
    .rolling(window=30, min_periods=1)
    .sum()
    .reset_index()
)

In [27]:
# Copy the rolling wicket counts into df by position (the index of tmp is
# ignored).
# NOTE(review): relies on tmp and df sharing the same row order -- confirm.
df['prev_30_wickets'] = tmp['player_dismissed'].to_numpy()
df.head(n=5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0


### Wickets fallen up to the current ball

In [28]:
# Running count of wickets within each (match, innings) group, in row order.
# GroupBy.cumsum() keeps df's own index, so the assignment aligns
# row-for-row; the earlier apply(lambda x: x.cumsum()) form can return a
# group-key-prefixed index on newer pandas releases, which cannot be aligned
# with df on assignment.
df['total_wickets'] = df.groupby(['id', 'innings'])['player_dismissed'].cumsum()
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0,0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0,0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0,0
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0,0
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0,0


### Dot balls in the previous 30 balls

In [29]:
# Step 1: per-delivery indicator -- 1 when no runs at all were added on the
# delivery, 0 otherwise.
df['prev_30_dot_balls'] = np.where(df['total_runs'] == 0, 1, 0)

# Step 2: per-innings rolling count of that indicator over a 30-row window
# ending at the current row, written back by position.
tmp = (
    df.groupby(by=['id', 'innings'])['prev_30_dot_balls']
    .rolling(window=30, min_periods=1)
    .sum()
    .reset_index()
)
df['prev_30_dot_balls'] = tmp['prev_30_dot_balls'].to_numpy()
df.head(n=5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0,0,1.0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0,0,1.0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0,0,2.0
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0,0,3.0
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0,0,4.0


### Boundaries (4s + 6s) in the previous 30 balls

In [30]:
# Step 1: per-delivery indicator -- 1 when more than 3 runs were added on the
# delivery, 0 otherwise.
# NOTE(review): the threshold is applied to total_runs (bat + extras), so a
# delivery reaching 4+ through extras would presumably be counted as well --
# confirm this is intended.
df['prev_30_boundaries'] = np.where(df['total_runs'] > 3, 1, 0)

# Step 2: per-innings rolling count of that indicator over a 30-row window
# ending at the current row, written back by position.
tmp = (
    df.groupby(by=['id', 'innings'])['prev_30_boundaries']
    .rolling(window=30, min_periods=1)
    .sum()
    .reset_index()
)
df['prev_30_boundaries'] = tmp['prev_30_boundaries'].to_numpy()
df.head(n=5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0,0,1.0,0.0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0,0,1.0,0.0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0,0,2.0,0.0
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0,0,3.0,0.0
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0,0,4.0,0.0


### Run Rate till current ball

In [31]:
# Ball count derived from the over number and the delivery number within the
# over, followed by runs per six balls so far.
df['total_balls'] = df['over'].mul(6).add(df['ball'])
df['run_rate'] = df['total_score'].mul(6).div(df['total_balls'])
df.head(n=5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0,0,1.0,0.0,1,0.0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0,0,1.0,0.0,2,3.0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0,0,2.0,0.0,3,2.0
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0,0,3.0,0.0,4,1.5
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0,0,4.0,0.0,5,1.2


### Run Rate in previous 30 balls

In [32]:
# Runs per six balls over the recent window. While total_balls is below 31
# the rate is taken over the innings so far; afterwards it is the 30-row
# rolling run total scaled to a per-over figure.
df['prev_30_run_rate'] = np.where(
    df['total_balls'].lt(31),
    df['total_score'].mul(6).div(df['total_balls']),
    df['prev_30_runs'].mul(6).div(30),
)
df.head(n=100)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0,0,1.0,0.0,1,0.0,0.0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0,0,1.0,0.0,2,3.0,3.0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0,0,2.0,0.0,3,2.0,2.0
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0,0,3.0,0.0,4,1.5,1.5
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0,0,4.0,0.0,5,1.2,1.2
63603,211028,1,England,Australia,0.6,0,6,1,0,179,2,2.0,0.0,0,4.0,0.0,6,2.0,2.0
63604,211028,1,England,Australia,0.7,0,7,2,0,179,4,4.0,0.0,0,4.0,0.0,7,3.428571,3.428571
63605,211028,1,England,Australia,1.1,1,1,0,0,179,4,4.0,0.0,0,5.0,0.0,7,3.428571,3.428571
63606,211028,1,England,Australia,1.2,1,2,0,0,179,4,4.0,0.0,0,6.0,0.0,8,3.0,3.0
63607,211028,1,England,Australia,1.3,1,3,1,0,179,5,5.0,0.0,0,6.0,0.0,9,3.333333,3.333333


### Strike Rate in previous 30 balls

In [33]:
# Runs per hundred balls over the recent window, using the same split as the
# run-rate feature: innings-so-far while total_balls is below 31, the 30-row
# rolling run total afterwards.
df['prev_30_strike_rate'] = np.where(
    df['total_balls'].lt(31),
    df['total_score'].mul(100).div(df['total_balls']),
    df['prev_30_runs'].mul(100).div(30),
)
df.head(n=100)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate,prev_30_strike_rate
63598,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0,0,1.0,0.0,1,0.0,0.0,0.0
63599,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0,0,1.0,0.0,2,3.0,3.0,50.0
63600,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0,0,2.0,0.0,3,2.0,2.0,33.333333
63601,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0,0,3.0,0.0,4,1.5,1.5,25.0
63602,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0,0,4.0,0.0,5,1.2,1.2,20.0
63603,211028,1,England,Australia,0.6,0,6,1,0,179,2,2.0,0.0,0,4.0,0.0,6,2.0,2.0,33.333333
63604,211028,1,England,Australia,0.7,0,7,2,0,179,4,4.0,0.0,0,4.0,0.0,7,3.428571,3.428571,57.142857
63605,211028,1,England,Australia,1.1,1,1,0,0,179,4,4.0,0.0,0,5.0,0.0,7,3.428571,3.428571,57.142857
63606,211028,1,England,Australia,1.2,1,2,0,0,179,4,4.0,0.0,0,6.0,0.0,8,3.0,3.0,50.0
63607,211028,1,England,Australia,1.3,1,3,1,0,179,5,5.0,0.0,0,6.0,0.0,9,3.333333,3.333333,55.555556


In [34]:
# List the columns present after the feature-engineering steps above.
df.columns

Index(['id', 'innings', 'batting_team', 'bowling_team', 'overs', 'over',
       'ball', 'total_runs', 'player_dismissed', 'total', 'total_score',
       'prev_30_runs', 'prev_30_wickets', 'total_wickets', 'prev_30_dot_balls',
       'prev_30_boundaries', 'total_balls', 'run_rate', 'prev_30_run_rate',
       'prev_30_strike_rate'],
      dtype='object')

In [35]:
# Cast the rolling-window features to int. The rolling sums are whole numbers
# stored as floats, so the cast is lossless for them.
# NOTE(review): the two rate columns are genuinely fractional, so casting
# them to int discards the fractional part -- confirm this is intended.
convert_dict = dict.fromkeys(
    [
        'prev_30_runs',
        'prev_30_wickets',
        'prev_30_dot_balls',
        'prev_30_boundaries',
        'prev_30_run_rate',
        'prev_30_strike_rate',
    ],
    int,
)
df = df.astype(convert_dict)

In [36]:
# Write the engineered features next to the source data, without the index.
df.to_csv('../Material/T20Is_Featured.csv', index=False)