# Data Cleaning
---

In [1]:
import re
from os import listdir, makedirs, mkdir
from os.path import join, exists
from datetime import datetime

import numpy as np
import pandas as pd

In [2]:
# Raw per-match CSVs are read from `reading_path`; cleaned CSVs are written
# to `writing_path`.
reading_path = join('dataset', 'raw')
writing_path = join('dataset', 'csv')

# Create the output directory (and any missing parents). `exist_ok=True`
# makes the cell safe to re-run and avoids the check-then-create race of
# `if not exists(...): mkdir(...)`.
makedirs(writing_path, exist_ok=True)

In [3]:
# os.listdir returns entries in arbitrary order and includes every entry in
# the directory, so keep only the CSV files and sort them by name to get a
# reproducible processing order.
files = sorted(
    entry for entry in listdir(reading_path)
    if entry.lower().endswith('.csv'))
files[:5]

['Match 01 (MI vs RCB) 2021-04-09.csv',
 'Match 02 (CSK vs DC) 2021-04-10.csv',
 'Match 03 (SRH vs KKR) 2021-04-11.csv',
 'Match 04 (RR vs PBKS) 2021-04-12.csv',
 'Match 05 (KKR vs MI) 2021-04-13.csv']

In [4]:
# Full team name (as it appears in the 'batting_team' / 'bowling_team'
# columns) -> short code stored in the team_*_sh summary fields.
teams = {
    'Chennai Super Kings': 'CSK',
    'Delhi Capitals': 'DC',
    'Kolkata Knight Riders': 'KKR',
    'Mumbai Indians': 'MI',
    'Punjab Kings': 'PBKS',
    'Royal Challengers Bangalore': 'RCB',
    'Rajasthan Royals': 'RR',
    'Sunrisers Hyderabad': 'SRH',
}

In [5]:
def clean_data(file_name):
    """Split one raw match CSV into a match-level summary and a slimmer CSV.

    Reads ``file_name`` from ``reading_path``, collects the match-level
    fields from the first row into a dict, drops those columns, and writes
    the remaining frame to ``writing_path`` under the same file name.

    Parameters
    ----------
    file_name : str
        Name of a CSV file inside ``reading_path``. Its second
        whitespace-separated token must be the match number, e.g.
        ``'Match 01 (MI vs RCB) 2021-04-09.csv'``.

    Returns
    -------
    tuple of (str, dict)
        The match label (the first five space/dot-separated tokens of
        ``file_name`` joined by spaces, e.g. ``'Match 01 (MI vs RCB)'``)
        and the summary dict.

    Raises
    ------
    KeyError
        If an expected column is missing from the CSV, or a team name is
        not a key of ``teams``.
    IndexError
        If the CSV has no data rows.
    """
    # Print the argument itself rather than a notebook-level loop variable,
    # so the function also works when called outside the driver loop.
    print(file_name)
    match = pd.read_csv(join(reading_path, file_name))

    def first(column):
        # Match-level fields are taken from the first row, by position.
        return match[column].iloc[0]

    # Parse the date column once. pd.to_datetime replaces the unit-less
    # astype('datetime64') cast, which pandas 2.x rejects.
    dates = pd.to_datetime(match['dates'])
    batting_team = first('batting_team')
    bowling_team = first('bowling_team')

    match_data = dict(
        match_no=int(file_name.split()[1]),
        team_1=batting_team,
        team_1_sh=teams[batting_team],
        team_2=bowling_team,
        team_2_sh=teams[bowling_team],
        date=dates.dt.strftime('%d %B %Y').iloc[0],
        day=dates.dt.strftime('%A').iloc[0],
        umpire_1=first('umpires_1'),
        umpire_2=first('umpires_2'),
        player_of_match=first('player_of_match'),
        winner=first('winner'),
        city=first('city'),
        venue=first('venue'))

    # These columns are now captured in match_data (or, for 'match_name',
    # simply not kept), so drop them before writing the file.
    match = match.drop(
        columns=['match_name', 'umpires_1', 'umpires_2', 'player_of_match',
                 'winner', 'city', 'venue', 'dates'])

    match.to_csv(join(writing_path, file_name))

    return ' '.join(re.split(r'[ .]', file_name)[:5]), match_data

In [6]:
# Clean every raw file and collect its match-level summary, keyed by the
# match label that clean_data returns.
all_match_data = {}

for file in files:
    label, summary = clean_data(file)
    all_match_data[label] = summary

Match 01 (MI vs RCB) 2021-04-09.csv
Match 02 (CSK vs DC) 2021-04-10.csv
Match 03 (SRH vs KKR) 2021-04-11.csv
Match 04 (RR vs PBKS) 2021-04-12.csv
Match 05 (KKR vs MI) 2021-04-13.csv
Match 06 (SRH vs RCB) 2021-04-14.csv
Match 07 (RR vs DC) 2021-04-15.csv
Match 08 (PBKS vs CSK) 2021-04-16.csv
Match 09 (MI vs SRH) 2021-04-17.csv
Match 10 (RCB vs KKR) 2021-04-18.csv
Match 11 (DC vs PBKS) 2021-04-18.csv
Match 12 (CSK vs RR) 2021-04-19.csv
Match 13 (DC vs MI) 2021-04-20.csv
Match 14 (PBKS vs SRH) 2021-04-21.csv
Match 15 (KKR vs CSK) 2021-04-21.csv
Match 16 (RCB vs RR) 2021-04-22.csv
Match 17 (PBKS vs MI) 2021-04-23.csv
Match 18 (RR vs KKR) 2021-04-24.csv
Match 19 (CSK vs RCB) 2021-04-25.csv
Match 20 (SRH vs DC) 2021-04-25.csv
Match 21 (PBKS vs KKR) 2021-04-26.csv
Match 22 (DC vs RCB) 2021-04-27.csv
Match 23 (CSK vs SRH) 2021-04-28.csv
Match 24 (MI vs RR) 2021-04-29.csv
Match 25 (DC vs KKR) 2021-04-29.csv
Match 26 (PBKS vs RCB) 2021-04-30.csv
Match 27 (MI vs CSK) 2021-05-01.csv
Match 28 (RR v

In [7]:
# One row per match: the summary dicts become the rows and the match labels
# become the index.
match_labels = list(all_match_data.keys())
match_records = list(all_match_data.values())

all_match_df = pd.DataFrame(match_records, index=match_labels)
all_match_df.head()

Unnamed: 0,match_no,team_1,team_1_sh,team_2,team_2_sh,date,day,umpire_1,umpire_2,player_of_match,winner,city,venue
Match 01 (MI vs RCB),1,Mumbai Indians,MI,Royal Challengers Bangalore,RCB,09 April 2021,Friday,KN Ananthapadmanabhan,Nitin Menon,HV Patel,Royal Challengers Bangalore,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai"
Match 02 (CSK vs DC),2,Chennai Super Kings,CSK,Delhi Capitals,DC,10 April 2021,Saturday,AK Chaudhary,VK Sharma,S Dhawan,Delhi Capitals,Mumbai,"Wankhede Stadium, Mumbai"
Match 03 (SRH vs KKR),3,Kolkata Knight Riders,KKR,Sunrisers Hyderabad,SRH,11 April 2021,Sunday,KN Ananthapadmanabhan,Nitin Menon,N Rana,Kolkata Knight Riders,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai"
Match 04 (RR vs PBKS),4,Punjab Kings,PBKS,Rajasthan Royals,RR,12 April 2021,Monday,AK Chaudhary,S Ravi,SV Samson,Punjab Kings,Mumbai,"Wankhede Stadium, Mumbai"
Match 05 (KKR vs MI),5,Mumbai Indians,MI,Kolkata Knight Riders,KKR,13 April 2021,Tuesday,C Shamshuddin,CB Gaffaney,RD Chahar,Mumbai Indians,Chennai,"MA Chidambaram Stadium, Chepauk, Chennai"


In [8]:
# Summarise dtypes and non-null counts; memory_usage='deep' asks pandas to
# measure the actual memory held by the object (string) columns.
all_match_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 60 entries, Match 01 (MI vs RCB) to Match 60 (CSK vs KKR)
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   match_no         60 non-null     int64 
 1   team_1           60 non-null     object
 2   team_1_sh        60 non-null     object
 3   team_2           60 non-null     object
 4   team_2_sh        60 non-null     object
 5   date             60 non-null     object
 6   day              60 non-null     object
 7   umpire_1         60 non-null     object
 8   umpire_2         60 non-null     object
 9   player_of_match  60 non-null     object
 10  winner           60 non-null     object
 11  city             60 non-null     object
 12  venue            60 non-null     object
dtypes: int64(1), object(12)
memory usage: 53.9 KB


In [9]:
# Save the match-level summary table in the same directory as the cleaned
# per-match CSVs; the match-label index is written as the first column.
# NOTE(review): 'Matche' looks like a typo, but other code may read this
# exact file name — confirm before renaming.
all_match_df.to_csv(join(writing_path, 'Matche Details.csv'))

---