# Data Preparation
---

In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd

## Reading Data

In [2]:
# relative location of the raw ball-by-ball CSV for the match being prepared
path = "../dataset/raw/Match 01 (MI vs RCB) 2021-04-09.csv"

In [3]:
# load the raw CSV into a DataFrame and preview its first rows
match = pd.read_csv(filepath_or_buffer=path)
match.head(n=5)

Unnamed: 0,inning,batting_team,bowling_team,ball,non_striker,batsman,bowler,extra_runs,batsman_run,total_runs,...,elimination_kind,fielders_caught,match_name,umpires_1,umpires_2,player_of_match,winner,city,venue,dates
0,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.1,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
1,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.2,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
2,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.3,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
3,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.4,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09
4,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.5,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,...,,,MI vs RCB 2021-04-09,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai",2021-04-09


In [4]:
# column dtypes, non-null counts and true ("deep") memory footprint of the raw frame
match.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   inning            250 non-null    object 
 1   batting_team      250 non-null    object 
 2   bowling_team      250 non-null    object 
 3   ball              250 non-null    float64
 4   non_striker       250 non-null    object 
 5   batsman           250 non-null    object 
 6   bowler            250 non-null    object 
 7   extra_runs        250 non-null    int64  
 8   batsman_run       250 non-null    int64  
 9   total_runs        250 non-null    int64  
 10  extras            12 non-null     object 
 11  player_out        17 non-null     object 
 12  elimination_kind  17 non-null     object 
 13  fielders_caught   7 non-null      object 
 14  match_name        250 non-null    object 
 15  umpires_1         250 non-null    object 
 16  umpires_2         250 non-null    object 
 1

## Cleaning Data

In [5]:
# Match-level metadata. These values are taken once, from the file name and
# from the first delivery of the frame, and stored in a plain dict.

# Parse "Match <no> (<team_1> vs <team_2>)" out of the path with a pattern
# instead of fixed character offsets / split positions, so the cell keeps
# working when the directory prefix or the number of digits in the match
# number changes.
fixture = re.search(r'Match (\d+) \((\w+) vs (\w+)\)', path)
if fixture is None:
    raise ValueError(f'cannot parse match number and teams from path: {path!r}')

# Positional access to the first delivery; does not rely on an index label 0.
first_ball = match.iloc[0]

# pd.to_datetime instead of .astype('datetime64'): pandas 2.x rejects the
# unit-less 'datetime64' dtype. Parsing the single value also avoids
# converting the whole column twice.
match_date = pd.to_datetime(first_ball['dates'])

match_data = dict(
    match_no = int(fixture.group(1)),
    date = match_date.strftime('%d %B %Y'),
    day = match_date.strftime('%A'),
    team_1 = fixture.group(2),
    team_2 = fixture.group(3),
    umpire_1 = first_ball['umpires_1'],
    umpire_2 = first_ball['umpires_2'],
    player_of_match = first_ball['player_of_match'],
    winner = first_ball['winner'],
    city = first_ball['city'],
    venue = first_ball['venue'])

match_data

{'match_no': 1,
 'date': '09 April 2021',
 'day': 'Friday',
 'team_1': 'MI',
 'team_2': 'RCB',
 'umpire_1': 'KN Ananthapadmanabhan',
 'umpire_2': 'Nitin Menon',
 'player_of_match': 'HV Patel',
 'winner': 'Royal Challengers Bangalore',
 'city': 'Chennai',
 'venue': 'MA Chidambaram Stadium, Chepauk, Chennai'}

In [6]:
# remove the match-level columns from the ball-by-ball frame
match_level_columns = [
    'match_name',
    'umpires_1',
    'umpires_2',
    'player_of_match',
    'winner',
    'city',
    'venue',
    'dates',
]
match = match.drop(columns=match_level_columns)

In [7]:
# fill every missing value (NaN) in the frame with the integer 0
match = match.fillna(value=0)

In [8]:
# downcast to compact dtypes: repeated text labels become categoricals,
# the ball number becomes float32 and the run counters become int8
label_columns = ['inning', 'batting_team', 'bowling_team', 'non_striker', 'batsman', 'bowler']
run_columns = ['extra_runs', 'batsman_run', 'total_runs']

compact_dtypes = {
    **dict.fromkeys(label_columns, 'category'),
    'ball': 'float32',
    **dict.fromkeys(run_columns, 'int8'),
}
match = match.astype(compact_dtypes)

In [9]:
# preview the first rows of the cleaned frame
match.head(n=5)

Unnamed: 0,inning,batting_team,bowling_team,ball,non_striker,batsman,bowler,extra_runs,batsman_run,total_runs,extras,player_out,elimination_kind,fielders_caught
0,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.1,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,0,0,0,0
1,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.2,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,0,0,0,0
2,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.3,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,0,0,0,0
3,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.4,CA Lynn,RG Sharma,Mohammed Siraj,0,2,2,0,0,0,0
4,1st innings,Mumbai Indians,Royal Challengers Bangalore,0.5,CA Lynn,RG Sharma,Mohammed Siraj,0,0,0,0,0,0,0


In [10]:
# re-inspect dtypes and true ("deep") memory footprint after cleaning
match.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   inning            250 non-null    category
 1   batting_team      250 non-null    category
 2   bowling_team      250 non-null    category
 3   ball              250 non-null    float32 
 4   non_striker       250 non-null    category
 5   batsman           250 non-null    category
 6   bowler            250 non-null    category
 7   extra_runs        250 non-null    int8    
 8   batsman_run       250 non-null    int8    
 9   total_runs        250 non-null    int8    
 10  extras            250 non-null    object  
 11  player_out        250 non-null    object  
 12  elimination_kind  250 non-null    object  
 13  fielders_caught   250 non-null    object  
dtypes: category(6), float32(1), int8(3), object(4)
memory usage: 41.7 KB


---