In [58]:
import numpy as np
import pandas as pd
from yaml import safe_load # to convert data from yaml to pandas # safe load limits the function to create simple python object to avoid any malicious data
import os
from tqdm import tqdm # to show progress bar

In [179]:
filenames = []
for file in os.listdir('data'):
    filenames.append(os.path.join('data',file))

In [180]:
filenames[0:5]

['data\\1001349.yaml',
 'data\\1001351.yaml',
 'data\\1001353.yaml',
 'data\\1004729.yaml',
 'data\\1007655.yaml']

In [181]:
final_df = pd.DataFrame()
row_no = 1
for file in tqdm(filenames):
    with open(file,'r') as f:
        df = pd.json_normalize(safe_load(f))
        df['match_id'] = row_no
        final_df = final_df.append(df)
        row_no += 1
        

100%|██████████████████████████████████████████████████████████████████████████████| 1432/1432 [08:33<00:00,  2.79it/s]


In [182]:
final_df.head(5)

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,info.overs,...,info.outcome.by.runs,info.match_type_number,info.neutral_venue,info.outcome.method,info.outcome.result,info.outcome.eliminator,info.supersubs.New Zealand,info.supersubs.South Africa,info.bowl_out,info.outcome.bowl_out
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-18,2,[2017-02-17],male,T20,5.0,Sri Lanka,20,...,,,,,,,,,,
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-19,2,[2017-02-19],male,T20,2.0,Sri Lanka,20,...,,,,,,,,,,
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-23,1,[2017-02-22],male,T20,,Australia,20,...,41.0,,,,,,,,,
0,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.9,2016-09-12,1,[2016-09-05],male,T20,,Hong Kong,20,...,40.0,,,,,,,,,
0,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.9,2016-06-19,1,[2016-06-18],male,T20,,Zimbabwe,20,...,2.0,,,,,,,,,


In [183]:
# removing unnecessary columns

final_df.drop(columns=[
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.outcome.bowl_out',
    'info.bowl_out',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
    'info.outcome.by.runs',
    'info.outcome.by.wickets'
],inplace=True)

In [184]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1432 entries, 0 to 0
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   innings               1432 non-null   object
 1   info.dates            1432 non-null   object
 2   info.gender           1432 non-null   object
 3   info.match_type       1432 non-null   object
 4   info.outcome.winner   1386 non-null   object
 5   info.overs            1432 non-null   int64 
 6   info.player_of_match  1269 non-null   object
 7   info.teams            1432 non-null   object
 8   info.toss.decision    1432 non-null   object
 9   info.toss.winner      1432 non-null   object
 10  info.umpires          1410 non-null   object
 11  info.venue            1432 non-null   object
 12  match_id              1432 non-null   int64 
 13  info.city             1240 non-null   object
dtypes: int64(2), object(12)
memory usage: 167.8+ KB


In [185]:
final_df['info.gender'].value_counts()

male      966
female    466
Name: info.gender, dtype: int64

In [186]:
# dropping female mathces
final_df = final_df[final_df['info.gender']=='male']
final_df.drop(columns=['info.gender'],inplace=True)

In [187]:
final_df['info.match_type'].value_counts()

T20    966
Name: info.match_type, dtype: int64

In [188]:
# Since all are T20 matches so dropping column match_type
final_df.drop(columns=['info.match_type'],inplace=True)

In [189]:
final_df['info.overs'].value_counts()

20    963
50      3
Name: info.overs, dtype: int64

In [190]:
# removing 50 overs and dropping column 
final_df = final_df[final_df['info.overs']==20]
final_df.drop(columns=['info.overs'],inplace=True)

In [193]:
backup1 = final_df

In [196]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 963 entries, 0 to 0
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   innings               963 non-null    object
 1   info.dates            963 non-null    object
 2   info.outcome.winner   927 non-null    object
 3   info.player_of_match  865 non-null    object
 4   info.teams            963 non-null    object
 5   info.toss.decision    963 non-null    object
 6   info.toss.winner      963 non-null    object
 7   info.umpires          953 non-null    object
 8   info.venue            963 non-null    object
 9   match_id              963 non-null    int64 
 10  info.city             825 non-null    object
dtypes: int64(1), object(10)
memory usage: 90.3+ KB


In [197]:
# taking backup
backup = final_df

In [240]:
final_df = backup

In [241]:
# fetching match id with incomplete 1st innings
incomplete_list = []
for index,row in final_df.iterrows():
    try:
        row['innings'][1]['2nd innings']
    except:
        incomplete_list.append(row['match_id'])

In [243]:
# dropping row with incomplete 1st inning 
final_df['temp'] = final_df['match_id'].isin(incomplete_list)

In [244]:
final_df['temp'].value_counts()

False    952
True      11
Name: temp, dtype: int64

In [247]:
final_df = final_df[final_df['temp']==False]

In [248]:
final_df.drop(columns=['temp'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### we have data for both the innings so we need to extract data only for 1st inning
### Data for first innings 
{0.1: {'non_striker': 'BA King',
   'bowler': 'SL Malinga',
   'runs': {'extras': 0, 'total': 0, 'batsman': 0},
   'batsman': 'LMP Simmons'}

In [299]:
count = 0
delivery_df = pd.DataFrame()

for index, row in final_df.iterrows():
    
    count+=1
    
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    match_id = []
    city = []
    venue = []
    
    
    for ball in row['innings'][0]['1st innings']['deliveries']:
        for key in ball.keys():
            match_id.append(count)
            batting_team.append(row['innings'][0]['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball[key]['batsman'])
            bowler.append(ball[key]['bowler'])
            runs.append(ball[key]['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            try:
                player_of_dismissed.append(ball[key]['wicket']['player_out'])
            except:
                player_of_dismissed.append('0')
                
    loop_df = pd.DataFrame({
            'match_id':match_id,
            'teams':teams,
            'batting_team':batting_team,
            'ball':ball_of_match,
            'batsman':batsman,
            'bowler':bowler,
            'runs':runs,
            'player_dismissed':player_of_dismissed,
            'city':city,
            'venue':venue
        })
    delivery_df = delivery_df.append(loop_df)

In [300]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,1,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
1,1,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
2,1,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground
3,1,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground
4,1,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...,...,...
121,952,"[Sri Lanka, Australia]",Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium
122,952,"[Sri Lanka, Australia]",Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium
123,952,"[Sri Lanka, Australia]",Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium
124,952,"[Sri Lanka, Australia]",Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium


In [301]:
# adding bowling team column and dropping teams
delivery_df['bowling_team']=delivery_df['teams'].apply(lambda x:x[1])

In [302]:
delivery_df.drop(columns=['teams'],inplace=True)

In [303]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,1,Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,1,Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,1,Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,1,Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...
121,952,Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
122,952,Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
123,952,Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
124,952,Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [304]:
# taking only top 12 teams 
df=pd.DataFrame(delivery_df['batting_team'].value_counts()[:12])
team_list = df.index.tolist()

In [305]:
team_list

['Pakistan',
 'South Africa',
 'India',
 'New Zealand',
 'Sri Lanka',
 'West Indies',
 'England',
 'Australia',
 'Afghanistan',
 'Bangladesh',
 'Ireland',
 'Zimbabwe']

In [307]:
delivery_df = delivery_df[delivery_df['batting_team'].isin(team_list) & delivery_df['bowling_team'].isin(team_list)]

In [308]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,1,Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,1,Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,1,Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,1,Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,1,Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...
121,952,Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
122,952,Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
123,952,Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
124,952,Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [309]:
output = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [311]:
import pickle

pickle.dump(output,open('final_data.pkl','wb'))