# Data Preparation
---

In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd

## Reading Data

In [2]:
# path
# Relative location of the CSV for a single match.
# The match number is parsed out of this file name further down, so keep the
# "Match NN ..." naming when pointing this at another file.
path = 'dataset/raw/Match 01 (MI vs RCB) 2021-04-09.csv'

In [3]:
# reading data
# Load the CSV at `path` into a DataFrame with default read_csv settings,
# then preview the first five rows.
match = pd.read_csv(path)
match.head()

Unnamed: 0,inning,batting_team,bowling_team,ball,non_striker,batsman,bowler,extra_runs,batsman_run,total_runs,...,elimination_kind,fielders_caught,match_name,umpires_1,umpires_2,player_of_match,winner,city,venue,dates
0,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.1,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
1,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.2,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
2,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.3,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
3,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.4,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
4,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.5,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09


In [4]:
# analysing data
# Column dtypes and non-null counts of the raw frame. memory_usage='deep' makes
# pandas measure the actual size of object columns instead of estimating it.
match.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   inning            250 non-null    object 
 1   batting_team      250 non-null    object 
 2   bowling_team      250 non-null    object 
 3   ball              250 non-null    float64
 4   non_striker       250 non-null    object 
 5   batsman           250 non-null    object 
 6   bowler            250 non-null    object 
 7   extra_runs        250 non-null    int64  
 8   batsman_run       250 non-null    int64  
 9   total_runs        250 non-null    int64  
 10  extras            12 non-null     object 
 11  player_out        17 non-null     object 
 12  elimination_kind  17 non-null     object 
 13  fielders_caught   7 non-null      object 
 14  match_name        250 non-null    object 
 15  umpires_1         250 non-null    object 
 16  umpires_2         250 non-null    object 
 1

## Cleaning Data

In [5]:
# full franchise name -> short code, used to fill the *_sh fields of match_data
teams = dict([
    ('Chennai Super Kings', 'CSK'),
    ('Delhi Capitals', 'DC'),
    ('Kolkata Knight Riders', 'KKR'),
    ('Mumbai Indians', 'MI'),
    ('Punjab Kings', 'PBKS'),
    ('Royal Challengers Bangalore', 'RCB'),
    ('Rajasthan Royals', 'RR'),
    ('Sunrisers Hyderabad', 'SRH'),
])

In [6]:
# storing single use data in variable
# Every match-level field is read from the first row of the frame, by position
# (iloc) so the lookup does not depend on the index labels.
first_row = match.iloc[0]

# Parse only the one date value that is needed. The previous unit-less
# astype('datetime64') cast is rejected by pandas 2.x and converted the
# whole column twice.
match_date = pd.to_datetime(first_row['dates'])

match_data = dict(
    # anchor on the "Match NN" token rather than on whitespace position in the path
    match_no = int(re.search(r'Match\s+(\d+)', path).group(1)),
    date = match_date.strftime('%d %B %Y'),
    day = match_date.strftime('%A'),
    team_1 = first_row['batting_team'],
    team_1_sh = teams[first_row['batting_team']],
    team_2 = first_row['bowling_team'],
    team_2_sh = teams[first_row['bowling_team']],
    umpire_1 = first_row['umpires_1'],
    umpire_2 = first_row['umpires_2'],
    player_of_match = first_row['player_of_match'],
    winner = first_row['winner'],
    city = first_row['city'],
    venue = first_row['venue'])

match_data

{'match_no': 1,
 'date': '09 April 2021',
 'day': 'Friday',
 'team_1': 'Mumbai Indians',
 'team_1_sh': 'MI',
 'team_2': 'Royal Challengers Bangalore',
 'team_2_sh': 'RCB',
 'umpire_1': 'KN Ananthapadmanabhan',
 'umpire_2': 'Nitin Menon',
 'player_of_match': 'HV Patel',
 'winner': 'Royal Challengers Bangalore',
 'city': 'Chennai',
 'venue': 'MA Chidambaram Stadium, Chepauk, Chennai'}

In [7]:
# dropping single data column
# The first-row values of these columns were read in the cell above (all but
# `match_name` feed `match_data`), so they are removed from the per-ball frame.
# Rebinding `match` instead of using inplace=True avoids mutating the frame
# in place and keeps the step chainable.
match = match.drop(
    columns=['match_name', 'umpires_1', 'umpires_2', 'player_of_match', 'winner', 'city', 'venue', 'dates'])

In [8]:
# replacing 'N/A' with '0'
# NOTE(review): this step is disabled. Presumably it stays off because later cells
# call .dropna() on 'player_out' and need missing values left as NaN — confirm,
# and delete the cell if it is no longer wanted.
# match.fillna(0, inplace=True)

In [9]:
# changing datatypes
# Downcast the numeric columns to pandas' nullable extension dtypes:
# 'ball' becomes Float32 and the three run-count columns become Int8.
run_columns = ('extra_runs', 'batsman_run', 'total_runs')
match = match.astype({'ball': 'Float32', **dict.fromkeys(run_columns, 'Int8')})

In [10]:
# preview the frame after the match-level columns were dropped and dtypes changed
match.head()

Unnamed: 0,inning,batting_team,bowling_team,ball,non_striker,batsman,bowler,extra_runs,batsman_run,total_runs,extras,player_out,elimination_kind,fielders_caught
0,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.1,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,,,,
1,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.2,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,,,,
2,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.3,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,,,,
3,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.4,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,,,,
4,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.5,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,,,,


In [11]:
# analysing data
# Same inspection as on the raw frame, now after the column drop and dtype change;
# memory_usage='deep' measures the actual size of the object columns.
match.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   inning            250 non-null    object 
 1   batting_team      250 non-null    object 
 2   bowling_team      250 non-null    object 
 3   ball              250 non-null    Float32
 4   non_striker       250 non-null    object 
 5   batsman           250 non-null    object 
 6   bowler            250 non-null    object 
 7   extra_runs        250 non-null    Int8   
 8   batsman_run       250 non-null    Int8   
 9   total_runs        250 non-null    Int8   
 10  extras            12 non-null     object 
 11  player_out        17 non-null     object 
 12  elimination_kind  17 non-null     object 
 13  fielders_caught   7 non-null      object 
dtypes: Float32(1), Int8(3), object(10)
memory usage: 139.1 KB


---

In [12]:
# NOTE(review): same preview as the earlier head() cell — `match` is not modified in between
match.head()

Unnamed: 0,inning,batting_team,bowling_team,ball,non_striker,batsman,bowler,extra_runs,batsman_run,total_runs,extras,player_out,elimination_kind,fielders_caught
0,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.1,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,,,,
1,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.2,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,,,,
2,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.3,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,,,,
3,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.4,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,,,,
4,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.5,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,,,,


In [13]:
# sum of 'batsman_run' for each (inning, batting_team) pair
innings_groups = match.groupby(['inning', 'batting_team'])
innings_groups[['batsman_run']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,batsman_run
inning,batting_team,Unnamed: 2_level_1
1st innings,Mumbai Indians,155
2nd innings,Royal Challengers Bangalore,148


In [14]:
# sum of 'extra_runs' for each (inning, batting_team) pair
innings_groups = match.groupby(['inning', 'batting_team'])
innings_groups[['extra_runs']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,extra_runs
inning,batting_team,Unnamed: 2_level_1
1st innings,Mumbai Indians,4
2nd innings,Royal Challengers Bangalore,12


In [15]:
# sum of 'total_runs' for each (inning, batting_team) pair
innings_groups = match.groupby(['inning', 'batting_team'])
innings_groups[['total_runs']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs
inning,batting_team,Unnamed: 2_level_1
1st innings,Mumbai Indians,159
2nd innings,Royal Challengers Bangalore,160


In [16]:
# number of rows per value of 'extras'; rows where it is missing are left out
# (value_counts drops NaN by default)
match['extras'].value_counts()

wides      7
noballs    3
legbyes    2
Name: extras, dtype: int64

In [28]:
# non-null 'player_out' entries of the first innings, in row order
is_first_inning = match['inning'] == '1st innings'
first_inning_wicket = match.loc[is_first_inning, 'player_out'].dropna().tolist()
first_inning_wicket

['RG Sharma',
 'SA Yadav',
 'CA Lynn',
 'HH Pandya',
 'Ishan Kishan',
 'KH Pandya',
 'KA Pollard',
 'M Jansen',
 'RD Chahar']

In [30]:
# non-null 'player_out' entries of the second innings, in row order
is_second_inning = match['inning'] == '2nd innings'
second_inning_wicket = match.loc[is_second_inning, 'player_out'].dropna().tolist()
second_inning_wicket

['Washington Sundar',
 'RM Patidar',
 'V Kohli',
 'GJ Maxwell',
 'Shahbaz Ahmed',
 'DT Christian',
 'KA Jamieson',
 'AB de Villiers']

In [41]:
# second-innings batsmen who never appear in 'player_out' (checked across both innings)
second_inning_batsmen = set(match.loc[match['inning'] == '2nd innings', 'batsman'])
dismissed_players = set(match['player_out'].dropna())
second_inning_batsmen - dismissed_players

{'HV Patel', 'Mohammed Siraj'}