Identified dataset : cricket data set

In [2]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import math



In [3]:
%matplotlib qt 

The cricket dataset has many .csv files.
We will be exploring those files using pandas library to read and parse csv files


In [4]:
# loading dataframes into a dictonary
def load_dataframes(dirPath):
    df_list = { } # a dictionary

    for i in os.listdir(dirPath):
        if os.path.splitext(i)[1] == ".csv": # read only csv files from the dataset
            df = pd.read_csv(f"../cricket_dataset/{i}", delimiter=',')
            df_list[i] = df

    return df_list

df_list = load_dataframes("../cricket_dataset")


In [5]:
# columns in all the dataset
def data_in_dataset(df_list):
    for key in df_list:
        # tmp = df_list[key].columns
        # print(f"\nThe columns in {key} are = \n", tmp)

        dataType = df_list[key].dtypes

        print(f"\nThe data type of the columns of '{key}' is")
        print(dataType)

data_in_dataset(df_list)
    


The data type of the columns of 'odb.csv' is
SN                 int64
Player            object
Span              object
Matches            int64
Innings            int64
Not Outs           int64
Runs               int64
High Score        object
Average Score    float64
Balls Faced        int64
Strike Rate      float64
100                int64
50                 int64
0                  int64
4s                object
6s                object
dtype: object

The data type of the columns of 'odbo.csv' is
Unnamed: 0      int64
Player         object
Span           object
Mat             int64
Inns            int64
Balls           int64
Runs            int64
Wkts            int64
BBI            object
Ave           float64
Econ          float64
SR            float64
4               int64
5               int64
dtype: object

The data type of the columns of 'odt.csv' is
Unnamed: 0             int64
Series/Tournament     object
Season                object
Unnamed: 2           float64
Winner   

In [6]:
# comparision task 1
# use a bar chart to compare average score of the first 10 batsman in odb.csv
def barChartComparingAvgScore(df_list):
    n = 10
    df = df_list.get("odb.csv") # get the dataframe

    playerNames = []
    avgScore = []

    def valueOfBar(ax, arr):
        for i, v in enumerate(arr):
            ax.text(v + 3, i, str(v), color = 'blue', fontweight='bold')

    for i in range(n): 
        playerNames.append(df.iloc[i]["Player"])
        avgScore.append(df.iloc[i]["Average Score"])

    print(playerNames)
    print(avgScore)

    fig, ax = plt.subplots()

    ax.barh(playerNames, avgScore, align='center')

    valueOfBar(ax, avgScore)

    plt.xlabel("Name of player", fontsize=14)
    plt.ylabel("Average score of player", fontsize=14)
    plt.title("Comparision of average score of first 10 batsman", fontsize=18, loc='left')
    plt.show()

barChartComparingAvgScore(df_list)



['SR Tendulkar (INDIA)', 'V Kohli (INDIA)', 'RT Ponting (AUS/ICC)', 'RG Sharma (INDIA)', 'ST Jayasuriya (Asia/SL)', 'HM Amla (SA)', 'AB de Villiers (Afr/SA)', 'CH Gayle (ICC/WI)', 'KC Sangakkara (Asia/ICC/SL)', 'SC Ganguly (Asia/INDIA)']
[44.83, 59.07, 42.03, 48.96, 32.36, 49.46, 53.5, 37.83, 41.98, 41.02]


In [7]:
# composition task 1
# pie chart of total number of run of teams from different countries / areas

# here the country is given in the name of player in brackets
# so we can use regex to match the first string inside the bracket


def removeBracketAndSlashes(str):
    start, end = 0, len(str)

    if(str[0] == '('):
        start = 1
    
    if(str[-1] == '/'):
        end -= 1
    
    return str[start : end]

def getCountryNames(df):
    data = {} # dictonary
    total_runs = 0
    regstr = r"(\([a-zA-Z]+\/?)"

    for i in range(len(df)):
        m = re.search(regstr, df.iloc[i]["Player"])
        countryName = removeBracketAndSlashes(m.group(0) )

        if(countryName in data):
            total_runs += df.iloc[i]["Runs"]
            data[countryName] += df.iloc[i]["Runs"]
        else:
            total_runs += df.iloc[i]["Runs"]
            data[countryName] = df.iloc[i]["Runs"]     

    return total_runs, data

def getExplodeRadius(runsByarea, total_runs):
    explode_rad = []

    for key, val in runsByarea.items():
        tmp = (val * 100) / total_runs

        if (tmp < 1):
            explode_rad.append(.5)
        else:
            explode_rad.append(0)
    return explode_rad 

def PieChartOfRunByAreas(df_list):
    total_runs, runsByArea = getCountryNames(df_list.get("odb.csv"))
    explode_rad = getExplodeRadius(runsByArea, total_runs)

    print(total_runs, runsByArea)

    fig, ax = plt.subplots()

    ax.pie(runsByArea.values(),labels=runsByArea.keys(), explode = explode_rad,
            autopct= lambda p : "{:.2f}% ({:d})".format(p, int(p * sum(runsByArea.values()) // 100) ) )

    plt.title("Pie Char of number of run of teams from different countries / areas", fontsize=18)
    plt.show()
    

PieChartOfRunByAreas(df_list)


750520 {'INDIA': 79755, 'AUS': 103267, 'Asia': 126787, 'SA': 37234, 'Afr': 28145, 'ICC': 28922, 'SL': 31174, 'NZ': 37678, 'PAK': 81428, 'WI': 64301, 'ENG': 66098, 'BAN': 20847, 'IRE': 11761, 'ZIM': 24098, 'SCOT': 2290, 'AFG': 5194, 'NED': 1541}


In [8]:
# comparative task 2
# radar chart of balls faced, Strike rate, 100s, 50s, 4s, 6s, and Average score of random batsmen
import random as rd

sz = 5

def alNumToAlpha(str):
    num = ""

    if not str.isdigit():
        for i in str:
            if i.isdigit():
                num = num + i
    
    return num


# have to parse as 4s and 6s are not int64 objects so we have to
# parse them 
def parse4and6(df):

    for i in range(len(df)):
        df.iloc[i]["4s"] = alNumToAlpha(df.iloc[i]["4s"])
        df.iloc[i]["6s"] = alNumToAlpha(df.iloc[i]["6s"])
        df["4s"] = df["4s"].astype(int)
        

df = df_list.get("odbo.csv")
year_cnt = {}

def get_data(df):
    indexes = rd.sample( range(len(df)), sz)
    return df.iloc[indexes][["Player", "100", "50", "4s", "6s"]]

data = get_data(df_list["odb.csv"])
parse4and6(data)
print(data)
# ans = alNumToAlpha("2001+3")
# print(ans)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


ValueError: invalid literal for int() with base 10: '407+'