The purpose of the code in this notebook is to take the data stored in YAML files from our Kaggle dataset and convert it to a format appropriate for our model development.

In [22]:
# Start by importing the necessary libraries.
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

# safe_load parses a YAML document and converts it into the corresponding Python object.
# YAML is a data serialization standard commonly used to exchange data between different languages.
# tqdm is used to display progress bars for loops.

We will be using data from the following Kaggle dataset: https://www.kaggle.com/datasets/veeralakrishna/cricsheet-a-retrosheet-for-cricket. From this dataset, we will use the T20 data, which covers 1,433 matches.

In [23]:
# Extract the path of all the YAML files in the data.
# There are 1433 YAML files with each file corresponding to a T20 match.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the file order (and any match ID assigned from it) reproducible across runs.
# Entries without a YAML extension (e.g. checkpoint folders or README files) are
# skipped so that they are never opened and parsed as match data.
filenames = [
    os.path.join('data', file)
    for file in sorted(os.listdir('data'))
    if file.lower().endswith(('.yaml', '.yml'))
]

In [None]:
# Transfer the contents from each YAML file to a pandas DataFrame.
# Each file is normalized into its own frame; the frames are collected in a list
# and concatenated once at the end, because calling pd.concat inside the loop
# re-copies every previously loaded row on each iteration.
match_frames = []
# Declare a variable to assign a unique match ID to each file's data.
matchIdx = 1
# Iterate over all the files.
for file in tqdm(filenames):
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding (assumes the files are UTF-8 encoded - TODO confirm).
    with open(file, 'r', encoding='utf-8') as f:
        match_data = safe_load(f)
    # Flatten the nested YAML structure into columns and tag it with our
    # generated match id.
    df = pd.json_normalize(match_data)
    df['match_id'] = matchIdx
    match_frames.append(df)
    matchIdx += 1

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found. The row index is not reset, matching the previous behaviour.
main_df = pd.concat(match_frames) if match_frames else pd.DataFrame()

 32%|█████████████████▊                                      | 457/1433 [01:16<02:41,  6.03it/s]

In [None]:
# Keep a deep copy of the loaded DataFrame as a backup.
backup = main_df.copy(deep=True)

In [None]:
# Discard the data features that are not required for our model development.
unused_columns = [
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.outcome.bowl_out',
    'info.bowl_out',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
    'info.outcome.by.runs',
    'info.outcome.by.wickets',
]
# errors='ignore' skips any listed column that is not present, so the cell does
# not raise KeyError when it is re-run or when a column is missing from the
# loaded files. Reassigning (instead of inplace=True) leaves the frame object
# that was loaded earlier untouched.
main_df = main_df.drop(columns=unused_columns, errors='ignore')

In [None]:
# Display the (rows, columns) shape of the DataFrame at this point.
main_df.shape

In [None]:
# Count the rows for each value of info.gender. dropna=False also lists rows
# where the gender is missing, which the default would silently leave out.
main_df['info.gender'].value_counts(dropna=False)

In [None]:
# Filter and segregate the data pertaining to men's T20 cricket matches, then
# remove the gender column, since it holds the same value for all remaining entries.
# The filter and the drop are chained into a single assignment instead of calling
# drop(inplace=True) on the filtered selection, which pandas may flag with a
# SettingWithCopyWarning.
main_df = (
    main_df
    .loc[main_df['info.gender'] == 'male']
    .drop(columns=['info.gender'])
)
main_df.shape

In [None]:
# Check to ensure all the data entries pertain to T20 matches.
# dropna=False makes rows with a missing match type show up in the counts
# instead of being silently excluded from the check.
main_df['info.match_type'].value_counts(dropna=False)

In [None]:
# Check to ensure all the data entries pertain to 20 over matches.
# dropna=False makes rows with a missing overs value show up in the counts
# instead of being silently excluded from the check.
main_df['info.overs'].value_counts(dropna=False)

In [None]:
# Filter the data to only include data from 20 over matches, then remove the
# overs and match type columns, since their value is the same for all remaining
# entries.
# The filter and the drop are chained into a single assignment instead of calling
# drop(inplace=True) on the filtered selection, which pandas may flag with a
# SettingWithCopyWarning.
main_df = (
    main_df
    .loc[main_df['info.overs'] == 20]
    .drop(columns=['info.overs', 'info.match_type'])
)
main_df.shape