In [None]:
#Import relevant libraries
import csv 
import pandas as pd
import numpy as np
import os
import plotly  as py
import plotly.express as px
import plotly.graph_objs as go
import kaleido

#Set folder where your code is, and where you have downloaded the raw csvs (see readme)
#Both locations can be overridden without editing the notebook by setting the
#BLOG_POSTS_FOLDER / BLOG_POSTS_DATA environment variables before starting the kernel;
#the defaults are the original hard-coded locations.
#Later cells build file names with plain string concatenation (folder + "x.csv"),
#so both values are forced to end with a path separator.
folder = os.environ.get("BLOG_POSTS_FOLDER", "C:/Users/user/Documents/GitHub/Martin Shine Blog Posts/")
if not folder.endswith(("/", "\\")):
    folder += "/"
data = os.environ.get("BLOG_POSTS_DATA", folder + "raw_csv/")
if not data.endswith(("/", "\\")):
    data += "/"

In [None]:
def extract_scorecard(file_name, data_dir=None):
    """Build a batting scorecard (one row per batsman per innings) from one ball-by-ball csv.

    Parameters
    ----------
    file_name : str
        Name of the csv file to read.
    data_dir : str, optional
        Folder containing the file. When omitted, the notebook-level `data` path is used,
        which is what the original one-argument call did.

    Returns
    -------
    pandas.DataFrame
        Columns: match_id, start_date, venue, batting_team, bowling_team, innings, position,
        batsman, runs, balls_faced, bowler, how_out. Sorted by innings, then batting position.
        bowler / how_out are NaN for a batsman with no dismissal row in that innings.

    Raises
    ------
    ValueError
        If the csv contains no rows.
    """
    if data_dir is None:
        #Fall back to the raw-csv folder set in the config cell
        data_dir = data
    #Make csv into dataframe
    balls = pd.read_csv(os.path.join(data_dir, file_name))
    if balls.empty:
        #Name the file so a bad download is easy to find when looping over a whole folder
        raise ValueError("No ball-by-ball rows found in " + file_name)

    keys = ['batsman', 'innings']

    #Runs scored by each batsman in each innings (summed over the balls where they were striker)
    #NOTE(review): only strikers appear here, so a batsman dismissed without ever being on strike
    #gets no scorecard row - confirm whether that matters for the analysis
    runs = (balls.groupby(['striker', 'innings'])['runs_off_bat'].sum()
                 .reset_index()
                 .rename(columns={'striker': 'batsman'}))

    #Did they get out? and if so who and how?
    #Keep at most one dismissal row per batsman per innings (the last one in file order);
    #otherwise the left merge below would repeat that batsman's row
    is_dismissal = balls['player_dismissed'].notnull()
    dismissals = (balls.loc[is_dismissal, ['innings', 'player_dismissed', 'bowler', 'wicket_type']]
                       .rename(columns={'player_dismissed': 'batsman'})
                       .drop_duplicates(subset=keys, keep='last'))
    scorecard = pd.merge(runs, dismissals, how='left', on=keys)

    #How many balls did they face? (number of rows with them as striker)
    #NOTE(review): every row counts, including any re-bowled deliveries - confirm that is intended
    balls_faced = (balls.groupby(['striker', 'innings'])['runs_off_bat'].count()
                        .reset_index()
                        .rename(columns={'runs_off_bat': 'balls_faced', 'striker': 'batsman'}))
    scorecard = pd.merge(scorecard, balls_faced, how='left', on=keys)

    #What position did they bat
    #One row per (ball, role) so strikers and non-strikers can be treated alike
    bat_order = pd.melt(balls[['innings', 'ball', 'striker', 'non_striker']], id_vars=['innings', 'ball'])
    #This step is needed for the openers, want the person facing the first ball to be batting #1
    is_non_striker = bat_order['variable'] == 'non_striker'
    bat_order.loc[is_non_striker, 'ball'] = bat_order.loc[is_non_striker, 'ball'] + 0.01
    #Get lowest ball in an innings that the batsman came in
    first_ball = (bat_order.groupby(['value', 'innings'])['ball'].min()
                           .reset_index()
                           .sort_values(['innings', 'ball'])
                           .reset_index(drop=True)
                           .rename(columns={'value': 'batsman'}))
    #Assign position
    first_ball['position'] = first_ball.groupby('innings').cumcount() + 1
    scorecard = pd.merge(scorecard, first_ball[['batsman', 'innings', 'position']], how='left', on=keys)

    #Get the match info (taken from the first row of the file) and add that in too
    first_row = balls.iloc[0]
    for col in ['match_id', 'start_date', 'venue']:
        scorecard[col] = first_row[col]

    #Get who batted in what innings
    #Exactly one row per innings: filtering on ball == 0.1 can return several rows when the
    #first delivery is repeated, which would duplicate every scorecard row in the merge
    innings_teams = balls[['innings', 'batting_team', 'bowling_team']].drop_duplicates(subset='innings', keep='first')
    scorecard = pd.merge(scorecard, innings_teams, how='left', on=['innings'])

    #Get columns in nicer order
    column_order = ['match_id', 'start_date', 'venue', 'batting_team', 'bowling_team', 'innings',
                    'position', 'batsman', 'runs_off_bat', 'balls_faced', 'bowler', 'wicket_type']
    scorecard = scorecard[column_order].rename(columns={'runs_off_bat': 'runs', 'wicket_type': 'how_out'})
    #Sort rows
    return scorecard.sort_values(['innings', 'position'])

In [None]:
#Get a list of all files where the csvs are stored (only csv files though!)
#endswith() so that names which merely contain ".csv" (e.g. "x.csv.bak") are skipped;
#sorted so the combined file has the same row order on every run
arr = sorted(x for x in os.listdir(data) if x.endswith(".csv"))
if not arr:
    raise FileNotFoundError("No .csv files found in " + str(data))
#Then import every one and make it one large dataframe
#Scorecards are collected in a list and concatenated once: DataFrame.append was removed in
#pandas 2.0 and re-copies the whole frame on every iteration
scorecards = []
for file in arr:
    #Might be necessary if one of the imports errors out, uncomment and run again if you need to see which file is causing problems
    #print(file)
    scorecard = extract_scorecard(file)
    scorecards.append(scorecard)
all_scores = pd.concat(scorecards)
#Write to csv so we can just import that rather than re-do the whole processing if the notebook goes down
all_scores.to_csv(folder + "all_scores.csv",index=False)

In [None]:
#Shortcut: reload the combined scorecards written by the previous cell instead of
#rebuilding them from every raw csv
all_scores_path = folder + "all_scores.csv"
all_scores = pd.read_csv(all_scores_path)

In [None]:
#"Not outs" (rows with no how_out value) are left out, as they're not really scores
#(use all_scores instead of this filter if you change your mind)
was_dismissed = all_scores['how_out'].notna()
all_scores_excl = all_scores[was_dismissed]
#Total number of innings that remain after the filter
total_inns = all_scores_excl.shape[0]

def get_freq_count(df_in):
    """Count how many innings ended on each score.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain a 'runs' column; each row is treated as one innings.

    Returns
    -------
    pandas.Series
        Index is the score, values are the number of rows with that score,
        ordered by ascending score.

    Also prints the number of rows in df_in.
    """
    #Get a count of each run
    df_out = df_in['runs'].value_counts()
    #Print no. of innings and sort
    print('There are ' + str(len(df_in)) + ' innings in this dataset.')
    #sort_index returns a new Series, so the result has to be kept
    df_out = df_out.sort_index()
    return df_out
#Score frequencies over the innings kept in all_scores_excl, stored as run_dist
run_dist = get_freq_count(df_in=all_scores_excl)

In [None]:
#Plot all scores
#Turn the run_dist series into a dataframe for plotting: 'Runs' comes from the series index
#(the score) and 'Count' from its values (number of innings with that score)
df_run_dist = pd.DataFrame({'Runs':run_dist.index, 'Count':run_dist.values})
df_run_dist['Percentage'] = (df_run_dist['Count']/total_inns)*100

#Make graph
#The trace list is deliberately not called `data`: that name holds the raw csv folder path
#used by extract_scorecard and the import cell, and overwriting it breaks re-running them
traces = [go.Bar(
x=df_run_dist['Runs'],
y=df_run_dist['Percentage'],
marker=dict(color='#EAC113')
)]
layout = go.Layout(plot_bgcolor="#383838",
                title='Frequency of each score in International Cricket',
                xaxis_title="Batsman Score",
                yaxis_title="Percentage of All Scores")
fig = go.Figure(data=traces, layout=layout)
fig.update_yaxes(range=[0,11.5])
fig.update_xaxes(range=[-0.5,100.5])
fig.show()
fig.write_image(folder + "graph1.png",scale=15,engine="kaleido")

In [None]:
#Look at 100 mark
#Decided, it would be a bit more decipherable if split into 10 run buckets
bins = np.arange(0,410,10)

#Make labels ("0-9", "10-19", ...), one per bucket, from each bucket's lower edge
labels = [str(lower) + "-" + str(lower + 9) for lower in bins[:-1]]

#Cut into 10 run bins
df_run_dist['Run_bin'] = pd.cut(df_run_dist['Runs'],bins=bins,right=False,include_lowest=True,labels=labels)
#observed=False keeps every bucket in the result (empty ones sum to 0), so the positional
#slice below always lands on the same buckets whatever the pandas default is
df_run_bins = df_run_dist[['Run_bin','Percentage']].groupby(['Run_bin'], observed=False).sum().reset_index()
#Buckets 60-69 up to 140-149
graph_df = df_run_bins.iloc[6:15]

#Make graph
#The trace list is deliberately not called `data`: that name holds the raw csv folder path
#used by extract_scorecard and the import cell, and overwriting it breaks re-running them
traces = [go.Bar(
x=graph_df['Run_bin'],
y=graph_df['Percentage'],
marker=dict(color='#EAC113')
)]
layout = go.Layout(plot_bgcolor="#383838",
                title='Frequency of scores around the 100 runs mark',
                xaxis_title="Batsman Score (Runs in groups of 10)",
                yaxis_title="Percentage of All Scores")
fig = go.Figure(data=traces, layout=layout)
fig.show()
fig.write_image(folder + "graph2.png",scale=15,engine="kaleido")

In [None]:
#Work out batsmans total runs, times out and times not out
#All three tables use the same key (batsman AND batting_team). Counting dismissals per batsman
#name only, while summing runs per batsman-and-team, mixes figures for any name that appears
#under more than one team and gives a wrong average.
batsman_keys = ['batsman','batting_team']
totals = all_scores[['runs'] + batsman_keys].groupby(batsman_keys).sum().reset_index()
#count() ignores missing how_out values, so this is the number of dismissals
times_out = all_scores[['how_out'] + batsman_keys].groupby(batsman_keys).count().reset_index().rename(columns = {'how_out':'n_dismissals'})
#Innings with no how_out value are the not outs
all_not_outs = all_scores[pd.isnull(all_scores.how_out)]
not_outs = all_not_outs.groupby(batsman_keys).size().reset_index(name='n_not_outs')

#Now merge all tables as one, make NaNs as 0s
averages = pd.merge(totals,times_out,how="left",on=batsman_keys)
averages = pd.merge(averages,not_outs,how="left",on=batsman_keys)
averages = averages.fillna(0)

#Calculate average and number of innings
averages['n_inns'] = averages['n_dismissals']+averages['n_not_outs']
#A batsman who was never dismissed has no defined average: leave it as NaN rather than inf
averages['average'] = (averages['runs']/averages['n_dismissals']).where(averages['n_dismissals'] > 0)

#Sort and export as csv
averages = averages.sort_values(['average'],ascending=False)
averages.to_csv(folder + "averages.csv",index=False)
averages

In [None]:
#Attach each batsman's overall figures to every innings in all_scores_excl
#('runs' in averages is renamed so it does not clash with the per-innings 'runs')
career_figures = averages.rename(columns={'runs': 'total_runs'})
combined_scores = all_scores_excl.merge(career_figures, how='left', on=['batsman', 'batting_team'])

#Label the batsmen: "Good" needs an average of at least 25 and at least 1000 total runs,
#everybody else stays "Bad"
is_good = (combined_scores['average'] >= 25) & (combined_scores['total_runs'] >= 1000)
combined_scores['quality'] = 'Bad'
combined_scores.loc[is_good, 'quality'] = 'Good'

In [None]:
#Split the innings by the quality label
quality_col = combined_scores['quality']
good_runs = combined_scores.loc[quality_col == 'Good']
bad_runs = combined_scores.loc[quality_col == 'Bad']

#Score frequencies per group, using get_freq_count from earlier (it also prints each group's size)
good_runs_dist = get_freq_count(good_runs)
bad_runs_dist = get_freq_count(bad_runs)

#Helper to get a score-frequency series into a plottable dataframe
def change_to_df(df_in,init_df,quality):
    """Turn a score-frequency series into a dataframe sorted by score.

    df_in   : series whose index is the score and whose values are the counts
    init_df : the rows the counts were taken from; its length is the percentage denominator
    quality : label written to every row of the 'quality' column

    Returns a dataframe with columns Runs, Count, Percentage and quality.
    """
    n_innings = len(init_df)
    frame = pd.DataFrame({'Runs': df_in.index, 'Count': df_in.values})
    frame = frame.assign(Percentage=(frame['Count'] / n_innings) * 100,
                         quality=quality)
    return frame.sort_values(['Runs'])
 
#Make each distribution into its own df (percentages are relative to that group's innings)
good_df = change_to_df(good_runs_dist,good_runs,"Good")
bad_df = change_to_df(bad_runs_dist,bad_runs,"Bad")

#Make graph
# Create traces
trace0 = go.Scatter(
    x=good_df['Runs'],
    y=good_df['Percentage'],
    mode = 'lines',
    name = '"Good" Batsmen',
    line=dict(color='#EAC113')
)
trace1 = go.Scatter(
    x=bad_df['Runs'],
    y=bad_df['Percentage'],
    mode = 'lines',
    name = '"Bad" Batsmen',
    line=dict(color='red')
)
#The trace list is deliberately not called `data`: that name holds the raw csv folder path
#used by extract_scorecard and the import cell, and overwriting it breaks re-running them
traces = [trace0, trace1]
layout = go.Layout(plot_bgcolor="#383838",
                title='Comparing score distribution of "good" batsmen to "bad" batsmen',
                xaxis_title="Batsman Score",
                yaxis_title="Percentage of All Scores")
fig = go.Figure(data=traces, layout=layout)
fig.update_yaxes(range=[0,16.5])
fig.update_xaxes(range=[-0.5,50.5])
fig.show()
fig.write_image(folder + "graph3.png",scale=15,engine="kaleido")