In [None]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
import glob

In [5]:
def get_iso_data(iso_data_path, total_balls=300, output_path='../data/combined_output.csv'):
    """Load ball-by-ball CSVs and derive the per-delivery match state.

    Every ``*.csv`` file in ``iso_data_path`` whose name does not end in
    ``_info.csv`` is read and concatenated. Only first-innings rows are kept.

    Parameters
    ----------
    iso_data_path : str
        Directory containing the ball-by-ball CSV files.
    total_balls : int, optional
        Number of balls in a full innings (default 300).
    output_path : str, optional
        Where the combined first-innings rows (all original columns) are
        written as CSV.

    Returns
    -------
    pandas.DataFrame
        Columns ``match_id``, ``start_date``, ``innings``, ``ball``,
        ``runs_off_bat``, ``extras``, ``wicket_type`` plus the derived
        ``wickets_indicator``, ``balls_remaining``, ``wickets_fallen`` and
        ``wickets_remaining``.

    Raises
    ------
    ValueError
        If no matching CSV file is found in ``iso_data_path``.
    """
    csv_files = glob.glob(f'{iso_data_path}/*.csv')
    filtered_csv_files = [file for file in csv_files if not file.endswith('_info.csv')]
    if not filtered_csv_files:
        # pd.concat would raise the same exception type, but with a message
        # that does not say which directory was empty.
        raise ValueError(f'No ball-by-ball CSV files found in {iso_data_path!r}')
    print(filtered_csv_files[0:5])
    df = pd.concat([pd.read_csv(file) for file in filtered_csv_files], ignore_index = True)
    # first innings data
    df = df[df['innings']==1]
    df.to_csv(output_path, index = False)
    print(df[0:5])

    # Take an explicit copy: the columns added below must land on an
    # independent frame, not on a view of the filtered data.
    df = df[['match_id', 'start_date', 'innings', 'ball', 'runs_off_bat', 'extras', 'wicket_type']].copy()
    df['wickets_indicator'] = df['wicket_type'].notna().astype(int)

    # 'ball' is encoded as <over>.<ball in over>; work in whole tenths so the
    # split cannot be affected by float representation error.
    # NOTE(review): every numbered delivery is counted as a ball here; if the
    # feed numbers wides/no-balls past .6 this over-counts - confirm.
    ball_tenths = (df['ball'] * 10).round().astype(int)
    balls_bowled = (ball_tenths // 10) * 6 + (ball_tenths % 10)
    df['balls_remaining'] = total_balls - balls_bowled

    # Calculate wickets remaining for each match and innings
    df['wickets_fallen'] = df.groupby(['match_id', 'innings'])['wickets_indicator'].cumsum()
    df['wickets_remaining'] = 10 - df['wickets_fallen']

    return df

# Load the first-innings deliveries, then keep the identifying columns plus
# the derived ball/wicket state; the final expression previews the subset.
iso_df = get_iso_data('../data/recently_added_2_male_csv2')
state_columns = [
    'match_id', 'start_date', 'innings',
    'balls_remaining', 'wickets_remaining', 'wickets_indicator',
]
sub_df = iso_df[state_columns]
sub_df.head()

['../data/recently_added_2_male_csv2\\1433370.csv', '../data/recently_added_2_male_csv2\\1433371.csv', '../data/recently_added_2_male_csv2\\1439898.csv', '../data/recently_added_2_male_csv2\\1444772.csv', '../data/recently_added_2_male_csv2\\1444793.csv']
   match_id   season  start_date  \
0   1433370  2024/25  2024-10-31   
1   1433370  2024/25  2024-10-31   
2   1433370  2024/25  2024-10-31   
3   1433370  2024/25  2024-10-31   
4   1433370  2024/25  2024-10-31   

                                               venue  innings  ball  \
0  Sir Vivian Richards Stadium, North Sound, Antigua        1   0.1   
1  Sir Vivian Richards Stadium, North Sound, Antigua        1   0.2   
2  Sir Vivian Richards Stadium, North Sound, Antigua        1   0.3   
3  Sir Vivian Richards Stadium, North Sound, Antigua        1   0.4   
4  Sir Vivian Richards Stadium, North Sound, Antigua        1   0.5   

  batting_team bowling_team  striker non_striker  ... extras  wides  noballs  \
0      England  West

Unnamed: 0,match_id,start_date,innings,balls_remaining,wickets_remaining,wickets_indicator
0,1433370,2024-10-31,1,299,10,0
1,1433370,2024-10-31,1,298,10,0
2,1433370,2024-10-31,1,297,10,0
3,1433370,2024-10-31,1,296,10,0
4,1433370,2024-10-31,1,295,10,0


# The Components of the distribution function

## Probability of extras

In [4]:
def get_prob_of_wide_or_no_ball(iso_df):
    """Return the ratio extras / (balls + extras) for the given deliveries.

    ``extras`` is the sum of the 'extras' column and ``balls`` is the number
    of non-null entries in the 'ball' column. Both totals are printed before
    the ratio is returned.

    NOTE(review): the numerator is a sum of the 'extras' values, not a count
    of deliveries - confirm this is the intended estimate.
    """
    extras_total = iso_df['extras'].sum()
    ball_count = iso_df['ball'].count()
    denominator = ball_count + extras_total
    ratio = extras_total / denominator
    print(f'extras_runss: {extras_total}, total_no_of_balls:{ball_count}')
    return ratio

## Wicket Process

In [None]:
class WicketProbit:
    """Probit model for whether a delivery produces a wicket.

    The index is

        x = -a0 - a1*balls_remaining - a2*wickets_remaining + a3*balls_remaining**2

    and the modelled wicket probability is Phi(x), the standard normal CDF.
    """

    def __init__(self, df):
        """Store the training frame and the starting parameter values.

        ``df`` must provide the columns 'balls_remaining',
        'wickets_remaining' and 'wickets_indicator' (0/1).
        """
        self.df = df
        #Initial guesses
        self.a0 = 1.67
        self.a1 = 0.00758
        self.a2 = -0.0459
        self.a3 = - 0.0000160
        # Optimiser output of the most recent fit() call (None until fitted).
        self.opt_result = None

    @staticmethod
    def _index(params, balls_remaining, wickets_remaining):
        """Evaluate the probit index for scalars or arrays of match state."""
        a0, a1, a2, a3 = params
        return -a0 - a1*balls_remaining - a2*wickets_remaining + a3*(balls_remaining**2)

    def LLF(self, params):
        """Return the negative log-likelihood of ``self.df`` under ``params``.

        ``params`` is the sequence (a0, a1, a2, a3). The computation is
        vectorised over all rows, and works with log-probabilities directly
        so that a saturated CDF (0 or 1) cannot produce -inf/nan terms.
        """
        balls = self.df['balls_remaining'].to_numpy(dtype=float)
        wickets = self.df['wickets_remaining'].to_numpy(dtype=float)
        took_wicket = self.df['wickets_indicator'].to_numpy(dtype=float)

        x = self._index(params, balls, wickets)
        # log(1 - Phi(x)) == log(Phi(-x)); logcdf stays finite in the tails.
        log_lik = took_wicket*norm.logcdf(x) + (1 - took_wicket)*norm.logcdf(-x)
        return -np.sum(log_lik)

    def fit(self):
        """Minimise the negative log-likelihood, starting from the current
        parameter values, and store the optimum on the instance.

        The full optimiser result is kept in ``self.opt_result`` so that
        convergence (``success``, ``message``) can be inspected.
        """
        res = minimize(self.LLF, [self.a0, self.a1, self.a2, self.a3])
        self.opt_result = res
        self.a0, self.a1, self.a2, self.a3 = res.x
        return None

    def predict(self, balls_remaining, wickets_remaining):
        """Return the modelled wicket probability Phi(x) for the given state."""
        x = self._index((self.a0, self.a1, self.a2, self.a3), balls_remaining, wickets_remaining)
        return norm.cdf(x)

## Runs Process

In [None]:
class RunsOProbit: