In [28]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
import glob

EPSILON = 1e-10

In [29]:
def get_iso_data(iso_data_path):
	csv_files = glob.glob(f'{iso_data_path}/*.csv')
	filtered_csv_files = [file for file in csv_files if not file.endswith('_info.csv')]
	#print(filtered_csv_files[0:5])
	df = pd.concat([pd.read_csv(file) for file in filtered_csv_files], ignore_index = True)
	# first innings data
	df = df[df['innings']==1]
	df.to_csv('../data/combined_output.csv', index = False)
	#print(df[0:5])
	df = df[['match_id', 'start_date', 'innings', 'ball', 'runs_off_bat', 'extras', 'wicket_type']]
	df['wickets_indicator'] = df['wicket_type'].notna().astype(int)
	df['balls_remaining'] = 300 - ((df['ball'].astype(int) * 6) + ((df['ball'] * 10) % 10).astype(int))
		
	# Calculate wickets remaining for each match and innings
	df['wickets_fallen'] = df.groupby(['match_id', 'innings'])['wickets_indicator'].cumsum()
	df['wickets_remaining'] = 10 - df['wickets_fallen']

	#print(df[270:300])
	return df

'''
iso_df = get_iso_data('../data/recently_added_2_male_csv2')
sub_df = iso_df[['match_id', 'start_date', 'innings', 'balls_remaining', 'wickets_remaining', 'wickets_indicator']]
sub_df.head()
'''
df = get_iso_data('../data/recently_added_2_male_csv2')
print("Columns in DataFrame:", df.columns)


Columns in DataFrame: Index(['match_id', 'start_date', 'innings', 'ball', 'runs_off_bat', 'extras',
       'wicket_type', 'wickets_indicator', 'balls_remaining', 'wickets_fallen',
       'wickets_remaining'],
      dtype='object')


# The Components of the distribution function

## Probability of extras

In [30]:
def get_prob_of_wide_or_no_ball(iso_df):
	extras_runs = iso_df['extras'].sum()
	total_no_of_balls = iso_df['ball'].count()
	prob_of_wide_or_no_ball = extras_runs/(total_no_of_balls+extras_runs)
	print(f'extras_runss: {extras_runs}, total_no_of_balls:{total_no_of_balls}')
	return prob_of_wide_or_no_ball

## Wicket Process

In [31]:
class WicketProbit:
    def __init__(self, df):
        self.df = df
        #Initial guesses
        self.a0 = 1.67
        self.a1 = 0.00758
        self.a2 = -0.0459
        self.a3 = - 0.0000160
    
    def LLF(self, params):
        a0, a1, a2, a3 = params
        sum = 0
        for index, row in self.df.iterrows():
            x = -a0 - a1*row['balls_remaining'] - a2*row['wickets_remaining'] - a3*(row['balls_remaining']**2)
            sum += row['wickets_indicator']*np.log(norm.cdf(x) + EPSILON) + (1-row['wickets_indicator'])*np.log(1-norm.cdf(x) + EPSILON)
        
        return -sum
    
    def fit(self):
        res = minimize(self.LLF, [self.a0, self.a1, self.a2, self.a3])
        self.a0, self.a1, self.a2, self.a3 = res.x
        return None
    
    def predict(self, balls_remaining, wickets_remaining):
        x = -self.a0 - self.a1*balls_remaining - self.a2*wickets_remaining + self.a3*(balls_remaining**2)
        return norm.cdf(x)

## Runs Process

In [None]:
class RunsOProbit:
    def __init__(self, df):
        self.df = df
        #Initial guesses
        self.a0 = -0.174
        self.a1 = -0.00844
        self.a2 = 0.130
        self.a3 = 0.0000106
        #Ordered thresholds
        self.mu0 = 0 #Symbolically defined, we set to 0 wlg
        self.mu1 = 0.940
        self.mu2 = 1.263
        self.mu3 = 1.325
        self.mu4 = 2.321
        self.mu5 = 2.328
        
    def LLF(self, params):
        a0, a1, a2, a3, mu1, mu2, mu3, mu4, mu5 = params
        mu0 = self.mu0
        sum = 0
        for index, row in self.df.iterrows():
            if row['wickets_indicator'] == 1:
                continue
            x = a0 + a1*row['balls_remaining'] + a2*row['wickets_remaining'] + a3*(row['balls_remaining']**2)
            if row['runs_off_bat'] == 0:
                sum += np.log(norm.cdf((mu0-x))+ EPSILON)
            elif row['runs_off_bat'] == 1:
                sum += np.log(norm.cdf((mu1-x)) - norm.cdf((mu0-x))+ EPSILON)
            elif row['runs_off_bat'] == 2:
                sum += np.log(norm.cdf((mu2-x)) - norm.cdf((mu1-x))+ EPSILON)
            elif row['runs_off_bat'] == 3:
                sum += np.log(norm.cdf((mu3-x)) - norm.cdf((mu2-x))+ EPSILON)
            elif row['runs_off_bat'] == 4:
                sum += np.log(norm.cdf((mu4-x)) - norm.cdf((mu3-x))+ EPSILON)
            elif row['runs_off_bat'] == 5:
                sum += np.log(norm.cdf((mu5-x)) - norm.cdf((mu4-x))+ EPSILON)
            else:
                sum += np.log(1-norm.cdf((mu5-x))+ EPSILON)
        
        return -sum
    
    def fit(self):
        res = minimize(self.LLF, [self.a0, self.a1, self.a2, self.a3, self.mu1, self.mu2, self.mu3, self.mu4, self.mu5])
        self.a0, self.a1, self.a2, self.a3, self.mu1, self.mu2, self.mu3, self.mu4, self.mu5 = res.x
        return None
    
    def predict(self, runs, balls_remaining, wickets_remaining):
        x = self.a0 + self.a1*balls_remaining + self.a2*wickets_remaining + self.a3*(balls_remaining**2)
        if runs == 0:
            return norm.cdf(self.mu0-x)
        elif runs == 1:
            return norm.cdf(self.mu1-x) - norm.cdf(self.mu0-x)
        elif runs == 2:
            return norm.cdf(self.mu2-x) - norm.cdf(self.mu1-x)
        elif runs == 3:
            return norm.cdf(self.mu3-x) - norm.cdf(self.mu2-x)
        elif runs == 4:
            return norm.cdf(self.mu4-x) - norm.cdf(self.mu3-x)
        elif runs == 5:
            return norm.cdf(self.mu5-x) - norm.cdf(self.mu4-x)
        else:
            return 1-norm.cdf(self.mu5-x)

# Constructing the distribution function

In [33]:
def construct_F(max_runs, max_balls, max_wickets, wicket_model, runs_model, p_extra):
    F = np.zeros((max_runs+1, max_balls+1, max_wickets+1))
    
    #Boundary conditions
    F[:,0,:] = 1 #No balls left, so win is impossible
    F[:,:,0] = 1 #No wickets left, so win is impossible
    
    #Filling in the rest of the table using the recursion
    for b in range(1, max_balls+1):
        for w in range(1, max_wickets+1):
            for r in range(max_runs+1):
                term1 = p_extra*F[r-1,b,w] if r>0 else 0 #Extra ball
                term2 = (1-p_extra)*wicket_model.predict(b, w)*F[r,b-1,w-1] #Wicket
                #term3 = (1-p_extra)*(1-wicket_model.predict(b, w))*sum([runs_model.predict(i, b, w)*F[r-i,b-1,w] for i in range(7)]) #Runs
                
                #To accomodate for the cases when the runs scored can be greater than the remaining runs
                term3 = 0
                for i in range(7):
                    if r >= i:
                        term3 += (1-p_extra)*(1-wicket_model.predict(b, w))*runs_model.predict(i, b, w)*F[r-i,b-1,w] #Runs
                
                F[r,b,w] = term1 + term2 + term3
    
    return F

# *sighs* Training 

In [34]:
def training(data_path):
    #import data
    dataframe = get_iso_data(data_path)
    
    #Setup the three processes
    px = get_prob_of_wide_or_no_ball(dataframe) #Probability of wide or no ball
    
    wicket_model = WicketProbit(dataframe) #Wicket model
    print('fitting...')
    wicket_model.fit()
    print("Wicket process fit!")

    runs_model = RunsOProbit(dataframe) #Runs model
    print('fitting...')
    runs_model.fit()
    print("Runs process fit!")
    
    #Construct the F table
    max_runs = 500
    max_balls = 300
    max_wickets = 10
    
    F = construct_F(max_runs, max_balls, max_wickets, wicket_model, runs_model, px)
    
    return F, wicket_model, runs_model, px

In [35]:
data_path = "../data/recently_added_2_male_csv2"
F, wicket_model, runs_model, px = training(data_path)

extras_runss: 171, total_no_of_balls:3899
fitting...
Wicket process fit!
fitting...


  sum += np.log(norm.cdf((mu2-x)+ EPSILON) - norm.cdf((mu1-x))+ EPSILON)
  sum += np.log(norm.cdf((mu3-x)+ EPSILON) - norm.cdf((mu2-x))+ EPSILON)
  sum += np.log(norm.cdf((mu5-x)+ EPSILON) - norm.cdf((mu4-x))+ EPSILON)


KeyboardInterrupt: 