# Data exploration:

In [12]:
import csv
import pandas as pd

In [13]:
input = '/content/drive/MyDrive/Inframind/Customer engagement/original_dataset/original_dataset.csv'
df = pd.read_csv(input)
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,overs_done,delivery,ball,batsman,non_striker,bowler,wide_runs,bye_runs,legbye_runs,noball_runs,batsman_runs,extra_runs,total,player_dismissed,dismissal_kind,fielder
0,1,1,Australia,Pakistan,1,0,1,0.1,DA Warner,TM Head,Mohammad Amir,0,0,0,0,0,0,0,,,
1,1,1,Australia,Pakistan,1,0,2,0.2,DA Warner,TM Head,Mohammad Amir,0,0,0,0,0,0,0,,,
2,1,1,Australia,Pakistan,1,0,3,0.3,DA Warner,TM Head,Mohammad Amir,0,0,0,0,0,0,0,,,
3,1,1,Australia,Pakistan,1,0,4,0.4,DA Warner,TM Head,Mohammad Amir,0,0,0,0,0,0,0,,,
4,1,1,Australia,Pakistan,1,0,5,0.5,DA Warner,TM Head,Mohammad Amir,1,0,0,0,0,1,1,,,


In [15]:
# get unique values in countries 
df.batting_team.unique() 

array(['Australia', 'Pakistan', 'New Zealand', 'Afghanistan', 'Scotland',
       'Hong Kong', 'Zimbabwe', 'India', 'Bangladesh', 'South Africa',
       'England', 'Sri Lanka', 'Papua New Guinea', 'West Indies',
       'Ireland', 'United Arab Emirates', 'Nepal',
       'United States of America', 'Namibia', 'Oman', 'Netherlands',
       'Bermuda', 'Canada', 'Kenya', 'Asia XI', 'Africa XI'], dtype=object)

# Data Filtration:

In [14]:
# drop columns 
df.drop(['over', 'overs_done', 'delivery' , 'non_striker','wide_runs','bye_runs','legbye_runs','noball_runs','extra_runs','total','fielder','dismissal_kind'], axis = 1, inplace=True) 

In [16]:
# if player dismessed is not NaN, it means a it's a wicket. 
# update the column as 1 if wicket is fallen

df.loc[df['player_dismissed'].isnull(),'is_wicket'] = int(0)
df.loc[df['player_dismissed'].notnull(), 'is_wicket'] = int(1)

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0.0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0.0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0.0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0.0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0.0


In [17]:
df['is_wicket'] = df['is_wicket'].astype(int) 

In [18]:
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0


# Feature Engineering:
## Generate new intermediate features
We use the existing features to generate new intermediate features that better respresent the current state and momentum of the match. 
### Intermediate features:
1. wickets_so_far
2. last_3_overs_runs
3. last_3_overs_wickets
4. last_6_balls_runs
5. last_6_balls_wickets
6. balls_in_partnership
7. runs_so_far
8. run_rate_last_5_overs
9. balls_since_last_boundary

In [19]:
# adding column for wickets so far
w_so_far = []
wsf = 0
for i in df.index:
  if df['ball'][i]==0.1:
    w_so_far.append(0)
    wsf = 0
  else:
    wsf += df['is_wicket'][i-1]
    w_so_far.append(wsf)

df['wickets_so_far'] = w_so_far

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0


In [20]:
# adding column for last_3_overs_runs
last_3_overs_runs = []
for i in df.index:
  if df['ball'][i]>=0.1 and df['ball'][i]< 3.1:
    last_3_overs_runs.append(14)
  else:
    # last_3_overs_runs.append(df['batsman_runs'][i].rolling(min_periods=1, window=12).sum())
    start = i - 19
    total_runs = 0
    while (start <= i-1):
      total_runs += df['batsman_runs'][start]
      start +=1
    last_3_overs_runs.append(total_runs)

df['last_3_overs_runs'] = last_3_overs_runs

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14


In [21]:
# adding column for last_6_balls_runs
last_6_balls_runs = []
for i in df.index:
  if df['ball'][i]>=0.1 and df['ball'][i]< 1.1:
    last_6_balls_runs.append(3)
  else:
    start = i - 7
    total_runs = 0
    while (start <= i-1):
      total_runs += df['batsman_runs'][start]
      start +=1
    last_6_balls_runs.append(total_runs)

df['last_6_balls_runs'] = last_6_balls_runs

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3


In [22]:
# adding column for last_3_overs_wickets
# adding column for wickets so far
last_3_overs_wickets = []
for i in df.index:
  if df['ball'][i]>=0.1 and df['ball'][i]< 3.1:
    last_3_overs_wickets.append(0)
  else:
    # last_3_overs_runs.append(df['batsman_runs'][i].rolling(min_periods=1, window=12).sum())
    start = i - 19
    total_wickets = 0
    while (start <= i-1):
      total_wickets += df['is_wicket'][start]
      start +=1
    last_3_overs_wickets.append(total_wickets)

df['last_3_overs_wickets'] = last_3_overs_wickets

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0


In [23]:
# adding column for last_6_balls_wickets
last_6_balls_wickets = []
for i in df.index:
  if df['ball'][i]>=0.1 and df['ball'][i]< 1.1:
    last_6_balls_wickets.append(0)
  else:
    start = i - 7
    total_wickets = 0
    while (start <= i-1):
      total_wickets += df['is_wicket'][start]
      start +=1
    last_6_balls_wickets.append(total_wickets)

df['last_6_balls_wickets'] = last_6_balls_wickets

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0


In [24]:
# add column balls_in_partnership
balls_in_partnership = []
balls = 0
for i in df.index:
  if df['ball'][i]==0.1:
    balls = 0
    balls_in_partnership.append(0)
  else:
    if df['is_wicket'][i-1] == 1:
      balls = 0
      balls_in_partnership.append(0)
    else:
      balls += 1
      balls_in_partnership.append(balls)
 
df['balls_in_partnership'] = balls_in_partnership

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4


In [25]:
# add column runs so far
runs_so_far = []
rsf = 0
for i in df.index:
  if df['ball'][i]==0.1:
    rsf = 0
    runs_so_far.append(0)
  else:
    rsf += df['batsman_runs'][i-1]
    runs_so_far.append(rsf)

df['runs_so_far'] = runs_so_far

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0


In [26]:
# add column run_rate_last_5_overs
run_rate_last_5_overs = []

for i in df.index:
  if df['ball'][i] < 5.1:
    run_rate_last_5_overs.append(df['runs_so_far'][i]/df['ball'][i])
  else:
    run_rate_last_5_overs.append((df['runs_so_far'][i]-df['runs_so_far'][i-30])/5)

df['run_rate_last_5_overs'] = run_rate_last_5_overs

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0


In [27]:
# add column balls_since_last_boundary

balls_since_last_boundary = []

bslb = 0
for i in df.index:
  if df['ball'][i]==0.1:
    bslb = 0
    balls_since_last_boundary.append(0)
  # check if prev was boundary
  elif df['batsman_runs'][i-1] >= 4:
    bslb = 0
    balls_since_last_boundary.append(0)
  else:
    bslb += 1
    balls_since_last_boundary.append(bslb)

df['balls_since_last_boundary'] = balls_since_last_boundary

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4


## Generate Predictive Features
These are the features that can be predicted using the model. Further data analysis will help decide which predictive feature to proceed with.
### Predictive features
1. wicket_next_6_balls
2. wicket_next_12_balls
3. boundary_next_6_balls
4. wicket_next_6_balls
5. boundary_this_over
6. wicket_this_over

In [28]:
# add column that is to be predicted
# wicket in next 12 balls
wicket_next_6_balls = []
for i in df.index:
    start = i 
    wicket = False
    while (start <= i+6):
      # check if the innings is the same 
      # also check if i is not out of bounds
      if (start in df.index and df['inning'][start]==df['inning'][i] and df['is_wicket'][start] == 1):
        wicket = True
      start +=1

    if(wicket == True):
      wicket_next_6_balls.append(1)
    else:
      wicket_next_6_balls.append(0)
    
df['wicket_next_6_balls'] = wicket_next_6_balls

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4,0


In [29]:
# add column that is to be predicted
# wicket in next 12 balls
wicket_next_12_balls = []
for i in df.index:
    start = i 
    wicket = False
    while (start <= i+12):
      # check if the innings is the same 
      # also check if i is not out of bounds
      if (start in df.index and df['inning'][start]==df['inning'][i] and df['is_wicket'][start] == 1):
        wicket = True
      start += 1

    if(wicket == True):
      wicket_next_12_balls.append(1)
    else:
      wicket_next_12_balls.append(0)
    
df['wicket_next_12_balls'] = wicket_next_12_balls

df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls,wicket_next_12_balls
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4,0,0


In [30]:
# add column that is to be predicted
# boundry in next 12 balls

boundary_next_6_balls = []
for i in df.index:
    start = i 
    runs_scored = 0
    while (start <= i+6):
      # check if the innings is the same 
      # also check if i is not out of bounds
      if (start in df.index and df['inning'][start]==df['inning'][i]):
        # runs_scored_by_boundaries += df['batsman_runs'][start]
        runs_scored += df['batsman_runs'][start]

      start +=1

    if(runs_scored >= 10):
      boundary_next_6_balls.append(1)
    else:
      boundary_next_6_balls.append(0)
    
df['boundary_next_6_balls'] = boundary_next_6_balls
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls,wicket_next_12_balls,boundary_next_6_balls
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1,0,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2,0,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3,0,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4,0,0,0


In [31]:
# add column that is to be predicted
# boundry in next 12 balls

boundary_next_12_balls = []
for i in df.index:
    start = i 
    runs_scored = 0
    while (start <= i+12):
      # check if the innings is the same 
      # also check if i is not out of bounds
      if (start in df.index and df['inning'][start]==df['inning'][i]):
        # runs_scored_by_boundaries += df['batsman_runs'][start]
        runs_scored += df['batsman_runs'][start]

      start +=1

    if(runs_scored >= 17):
      boundary_next_12_balls.append(1)
    else:
      boundary_next_12_balls.append(0)
    
df['boundary_next_12_balls'] = boundary_next_12_balls
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls,wicket_next_12_balls,boundary_next_6_balls,boundary_next_12_balls
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0,0,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1,0,0,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2,0,0,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3,0,0,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4,0,0,0,0


In [32]:
# add predictive column
# boundary in this over
# two 4s or one 6 in this over

boundary_this_over = []
for i in df.index:
    start = i 
    runs_scored_by_boundaries = 0
    while ((start in df.index) and int(df['ball'][start]) == int(df['ball'][i])):
      # check if the innings is the same 
      # also check if i is not out of bounds
      if ((start in df.index) and df['inning'][start]==df['inning'][i] and df['batsman_runs'][start] >=4):
        runs_scored_by_boundaries += df['batsman_runs'][start]
      start +=1

    if (runs_scored_by_boundaries >= 6):
      boundary_this_over.append(1)
    else:
      boundary_this_over.append(0)
    
df['boundary_this_over'] = boundary_this_over
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls,wicket_next_12_balls,boundary_next_6_balls,boundary_next_12_balls,boundary_this_over
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0,0,0,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1,0,0,0,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2,0,0,0,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3,0,0,0,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4,0,0,0,0,0


In [33]:
# df1 = df.iloc[929534:929544]
# # add predictive column
# # boundary in this over
# # two 4s or one 6 in this over

# boundary_this_over = []
# for i in df1.index:
#     start = i 
#     runs_scored_by_boundaries = 0
#     while ((start in df1.index) and int(df1['ball'][start]) == int(df1['ball'][i])):
#       # check if the innings is the same 
#       # also check if i is not out of bounds
#       if ((start in df1.index) and df1['inning'][start]==df1['inning'][i] and df1['batsman_runs'][start] >=4):
#         runs_scored_by_boundaries += df1['batsman_runs'][start]
#       start +=1

#     if (runs_scored_by_boundaries >= 6):
#       boundary_this_over.append(1)
#     else:
#       boundary_this_over.append(0)
    
# df1['boundary_this_over'] = boundary_this_over
# df1.head(10)

In [34]:
# add predictive column
# wicket in this over
# two 4s or one 6 in this over

wicket_this_over = []
for i in df.index:
    start = i 
    wicket = False
    while (start in df.index and int(df['ball'][start]) == int(df['ball'][i])):
      # check if the innings is the same 
      # also check if i is not out of bounds
      if (start in df.index and df['inning'][start]==df['inning'][i] and df['is_wicket'][start] == 1):
        wicket = True
      start +=1

    if (wicket == True):
      wicket_this_over.append(1)
    else:
      wicket_this_over.append(0)
    
df['wicket_this_over'] = wicket_this_over
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls,wicket_next_12_balls,boundary_next_6_balls,boundary_next_12_balls,boundary_this_over,wicket_this_over
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0,0,0,0,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1,0,0,0,0,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2,0,0,0,0,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3,0,0,0,0,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4,0,0,0,0,0,0


# Data cleaning:

In [35]:
# remove certain countries
countries_to_remove = [ 'Scotland', 'Hong Kong',
       'Zimbabwe', 'Papua New Guinea', 
       'Ireland', 'United Arab Emirates', 'Nepal',
       'United States of America', 'Namibia', 'Oman', 'Netherlands',
       'Bermuda', 'Canada', 'Kenya', 'Asia XI', 'Africa XI']
df = df[~df['batting_team'].isin(countries_to_remove)]

df = df[~df['bowling_team'].isin(countries_to_remove)]

# df.to_csv('/content/drive/MyDrive/Inframind/Customer engagement/original_dataset/processed_dataset.csv',index=False)
#imp reset index as we will access it later
# df.reset_index()
df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls,wicket_next_12_balls,boundary_next_6_balls,boundary_next_12_balls,boundary_this_over,wicket_this_over
0,1,1,Australia,Pakistan,0.1,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,0,0,0.0,0,0,0,0,0,0,0
1,1,1,Australia,Pakistan,0.2,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,1,0,0.0,1,0,0,0,0,0,0
2,1,1,Australia,Pakistan,0.3,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,2,0,0.0,2,0,0,0,0,0,0
3,1,1,Australia,Pakistan,0.4,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,3,0,0.0,3,0,0,0,0,0,0
4,1,1,Australia,Pakistan,0.5,DA Warner,Mohammad Amir,0,,0,0,14,3,0,0,4,0,0.0,4,0,0,0,0,0,0


In [39]:
df.tail(10)

Unnamed: 0,match_id,inning,batting_team,bowling_team,ball,batsman,bowler,batsman_runs,player_dismissed,is_wicket,wickets_so_far,last_3_overs_runs,last_6_balls_runs,last_3_overs_wickets,last_6_balls_wickets,balls_in_partnership,runs_so_far,run_rate_last_5_overs,balls_since_last_boundary,wicket_next_6_balls,wicket_next_12_balls,boundary_next_6_balls,boundary_next_12_balls,boundary_this_over,wicket_this_over
929553,1596,2,South Africa,England,45.5,KJ Abbott,SCJ Broad,0,,0,8,38,16,0,0,26,244,9.4,1,0,1,0,0,0,0
929554,1596,2,South Africa,England,45.6,KJ Abbott,SCJ Broad,0,,0,8,38,16,0,0,27,244,9.4,2,0,1,0,0,0,0
929555,1596,2,South Africa,England,46.1,CH Morris,BA Stokes,0,,0,8,36,15,0,0,28,244,9.0,3,1,1,0,0,1,0
929556,1596,2,South Africa,England,46.2,CH Morris,BA Stokes,0,,0,8,36,15,0,0,29,244,8.8,4,1,1,1,0,1,0
929557,1596,2,South Africa,England,46.3,CH Morris,BA Stokes,6,,0,8,35,9,0,0,30,244,8.8,5,1,1,1,0,1,0
929558,1596,2,South Africa,England,46.4,CH Morris,BA Stokes,0,,0,8,41,11,0,0,31,250,9.8,0,1,1,0,0,0,0
929559,1596,2,South Africa,England,46.5,CH Morris,BA Stokes,1,,0,8,35,7,0,0,32,250,9.4,1,1,1,0,0,0,0
929560,1596,2,South Africa,England,46.6,KJ Abbott,BA Stokes,0,,0,8,35,7,0,0,33,251,9.6,2,1,1,0,0,0,0
929561,1596,2,South Africa,England,47.1,CH Morris,AU Rashid,0,CH Morris,1,8,35,7,0,0,34,251,9.6,3,1,1,0,0,0,1
929562,1596,2,South Africa,England,47.2,Imran Tahir,AU Rashid,4,,0,9,31,7,1,1,0,251,9.6,4,0,0,0,0,0,0


In [36]:
df.boundary_this_over.value_counts()

0    663458
1     51256
Name: boundary_this_over, dtype: int64

# Export data:

In [37]:
df.to_csv('/content/drive/MyDrive/Inframind/Customer engagement/original_dataset/pre_processed_dataset.csv',index=False)
