In [1]:
import pandas as pd
import numpy as np
import warnings
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from scipy.stats import linregress
import pickle
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 1500)
pd.set_option("display.max_rows", 1500)

# DATA CLEANING AND REVIEW

In [2]:
bbb_espn = pd.read_csv("T20_BBB_adv_merged_since_2003.csv")
print(bbb_espn.shape)

bbb_espn[['game_no', 'inns_no', 'over', 'ball']] = bbb_espn[['game_no', 'inns_no', 'over', 'ball']].astype(int)
bbb_espn['start_date'] = pd.to_datetime(bbb_espn['start_date'])
bbb_espn = bbb_espn.drop_duplicates(subset='ball_id', keep='first')
bbb_espn = bbb_espn.sort_values(['game_no', 'inns_no', 'over', 'ball'])
bbb_espn.loc[bbb_espn.dismissalText_long.notna(), 'wicket_flag'] = 1
bbb_espn.loc[bbb_espn.dismissalText_long.isna(), 'wicket_flag'] = 0
bbb_espn['wicket_flag'] = bbb_espn['wicket_flag'].astype(int)
bbb_espn["totalInningWickets1"] = bbb_espn.groupby("inns_id").wicket_flag.cumsum()

bbb_espn['inns_ball_count'] = bbb_espn.groupby(['game_no', 'inns_no']).cumcount()+1
bbb_espn['inns_runs_cumsum'] = bbb_espn.groupby(['game_no', 'inns_no', 'bat_name']).totalRuns.cumsum()
bbb_espn['inns_bat_ball_count'] = bbb_espn.groupby(['game_no', 'inns_no', 'bat_name']).cumcount()+1
bbb_espn['inns_bat_runs_cumsum'] = bbb_espn.groupby(['game_no', 'inns_no', 'bat_name']).batsmanRuns.cumsum()
bbb_espn['runs_req'] = bbb_espn['target']-bbb_espn['inns_runs_cumsum']
bbb_espn['inns_actual_balls'] = (bbb_espn['oversActual'].astype(str).str.split(".", expand=True).astype(int)[0]*6)+(bbb_espn['oversActual'].astype(str).str.split(".", expand=True).astype(int)[1])
bbb_espn['inns_balls_remain'] = 120-bbb_espn['inns_actual_balls']
bbb_espn['inns_RR'] = (bbb_espn['totalInningRuns']/bbb_espn['inns_actual_balls'])*6
bbb_espn['inns_RRR'] = (bbb_espn['runs_req']/bbb_espn['inns_balls_remain'])*6
bbb_espn['wkt_rem'] = 10-bbb_espn['totalInningWickets1']
print(bbb_espn.shape)

bbb_espn.loc[bbb_espn.bowl_type.isin(['right-arm fast-medium', 'right-arm fast', 'right-arm medium', 
                                            'right-arm medium-fast', 'right-arm slow-medium', 'right-arm bowler']), 'bowl_type1'] = "RA_SEAM"
bbb_espn.loc[bbb_espn.bowl_type.isin(['slow left-arm orthodox', 'left-arm slow']), 'bowl_type1'] = "LA_OB"
bbb_espn.loc[bbb_espn.bowl_type.isin(['right-arm offbreak', ]), 'bowl_type1'] = "RA_OB"
bbb_espn.loc[bbb_espn.bowl_type.isin(['left-arm fast-medium', 'left-arm medium', 'left-arm medium-fast', 
                                            'left-arm fast', 'left-arm slow-medium']), 'bowl_type1'] = 'LA_SEAM'
bbb_espn.loc[bbb_espn.bowl_type.isin(['left-arm wrist-spin',]), 'bowl_type1'] = "LA_LB"
bbb_espn.loc[bbb_espn.bowl_type.isin(['legbreak', 'legbreak googly']), 'bowl_type1'] = "RA_LB"

bbb_espn.loc[bbb_espn.bowl_type1.isin(['LA_OB', 'RA_OB', 'LA_LB', 'RA_LB']), 'bowl_type2'] = 'spin'
bbb_espn.loc[bbb_espn.bowl_type1.isin(['LA_SEAM', 'RA_SEAM']), 'bowl_type2'] = 'seam'
bbb_espn.loc[bbb_espn.bowl_type2.isna(), 'bowl_type2'] = '-'

bbb_espn['year'] = pd.to_datetime(bbb_espn['start_date']).dt.year
bbb_espn = bbb_espn.query("year>=2012")
bbb_espn.loc[bbb_espn.year.between(2012,2015, inclusive='both'), 'year_phase'] = "P1"
bbb_espn.loc[bbb_espn.year.between(2016,2020, inclusive='both'), 'year_phase'] = "P2"
bbb_espn.loc[bbb_espn.year>2020, 'year_phase'] = "P3"

bbb_espn.loc[(bbb_espn.bat_type=='right-hand bat') & (bbb_espn.bowl_type1.isin(['RA_OB', 'LA_LB'])), 'match_up_flag'] = 'no'
bbb_espn.loc[(bbb_espn.bat_type=='right-hand bat') & (bbb_espn.bowl_type1.isin(['RA_LB', 'LA_OB'])), 'match_up_flag'] = 'yes'
bbb_espn.loc[(bbb_espn.bat_type=='left-hand bat') & (bbb_espn.bowl_type1.isin(['RA_OB', 'LA_LB'])), 'match_up_flag'] = 'yes'
bbb_espn.loc[(bbb_espn.bat_type=='left-hand bat') & (bbb_espn.bowl_type1.isin(['RA_LB', 'LA_OB'])), 'match_up_flag'] = 'no'
print(bbb_espn.shape)

# espn_sc_t20bat = pd.read_csv(r"C:\Users\ashis\Documents\Cricket\Completed Analysis\CSVs\ESPN_T20_basic_batting_scorecard_since_2003.csv")
# espn_sc_t20bat[['game_id_x', 'inningNumber']] = espn_sc_t20bat[['game_id_x', 'inningNumber']].astype(str)

# espn_sc_t20bat = espn_sc_t20bat[['game_id_x', 'inningNumber', 'team', 'T1', 'T2', 'winner', 'margin', 'start_date', 'runs_x', 
#                                  'wickets', 'lead', 'target', 'overs', 'balls_x', 'extras', 'byes', 'legbyes', 'wides', 'noballs', 
#                                  'penalties', 'venue_name', 'game_type_1', 'competition_shortName']].drop_duplicates().\
#                                      rename({"game_id_x":"game_no", "inningNumber":"inns_no"}, axis=1)
                                     
# bbb_merged1 = pd.merge(espn_sc_t20bat, bbb_espn, on=['game_no', 'inns_no'])

bbb_merged1 = bbb_espn.query("shotType.notna()")
# bbb_merged1[['game_no', 'inns_no', 'over', 'ball']] = bbb_merged1[['game_no', 'inns_no', 'over', 'ball']].astype(int)
# bbb_merged1['start_date'] = pd.to_datetime(bbb_merged1['start_date'])
# bbb_merged1 = bbb_merged1.sort_values(['start_date', 'game_no', 'inns_no', 'over', 'ball'])
bbb_merged1 = bbb_merged1.query("start_date<'2024-02-16 00:00:00+00:00' and bowl_type.notna()")

bbb_merged1 = bbb_merged1.query("bowl_type1.isin(['LA_OB', 'RA_OB', 'LA_LB', 'RA_LB'])")
print(bbb_merged1.shape)

bat_req = pd.DataFrame(pd.DataFrame(bbb_merged1.groupby('bat_name').match_up_flag.value_counts()).rename({"match_up_flag":"count"}, axis=1)\
          .reset_index().query("count>=180").bat_name.value_counts()).query("bat_name==2").index.to_list()
bbb_merged1 = bbb_merged1.query("bat_name.isin(@bat_req)")
print(bbb_merged1.shape)

(2272560, 62)
(2272488, 74)
(2019416, 79)
(371834, 79)
(218792, 79)


In [3]:
bbb_espn.head(5)

Unnamed: 0,over,ball,dismissalType,oversUnique,oversActual,totalRuns,batsmanRuns,isFour,isSix,isWicket,byes,legbyes,wides,noballs,penalties,wagonX,wagonY,wagonZone,pitchLine,pitchLength,shotType,batsmanPlayerId,bowlerPlayerId,outPlayerId,totalInningRuns,totalInningWickets,title,inns_id,ball_id,dismissalText_long,dismissalText_short,bat_team,ball_commentary,game_no,inns_no,bat_name,bat_type,team,runs_inns,wickets_inns,target,overs_inns,balls_inns,extras_inns,byes_inns,legbyes_inns,wides_inns,noballs_inns,start_date,venue_name,game_type_1,competition_shortName,T1,T2,venue,competition_fullName,winner,margin,bat_pos,bowl_name,bowl_type,bowl_pos,wicket_flag,totalInningWickets1,inns_ball_count,inns_runs_cumsum,inns_bat_ball_count,inns_bat_runs_cumsum,runs_req,inns_actual_balls,inns_balls_remain,inns_RR,inns_RRR,wkt_rem,bowl_type1,bowl_type2,year,year_phase,match_up_flag
378414,1,1,,0.01,0.1,0,0,False,False,False,0,0,0,0,0,210,195,3,,,FORWARD_DEFENCE,48739,12894,,0,0,Ashwin to Warner,518954_1,518954_1_1_1,,,,"Ashwin gets good drift from round the wicket, ...",518954,1,David Warner,left-hand bat,Australia,171.0,4.0,0.0,20.0,120.0,3.0,0.0,1.0,2.0,0.0,2012-02-01 00:00:00+00:00,"Stadium Australia, Sydney",night,India tour of Australia,Australia,India,Sydney,india-tour-of-australia-2011-12,Australia,31 runs,1.0,Ravichandran Ashwin,right-arm offbreak,1.0,0,0,1,0.0,1.0,0.0,0.0,1,119,0.0,0.0,10,RA_OB,spin,2012.0,P1,yes
378413,1,2,,0.02,0.2,0,0,False,False,False,0,0,0,0,0,108,153,7,,,LEG_GLANCE,48739,12894,,0,0,Ashwin to Warner,518954_1,518954_1_1_2,,,,little bit of turn and he tucks it to the leg ...,518954,1,David Warner,left-hand bat,Australia,171.0,4.0,0.0,20.0,120.0,3.0,0.0,1.0,2.0,0.0,2012-02-01 00:00:00+00:00,"Stadium Australia, Sydney",night,India tour of Australia,Australia,India,Sydney,india-tour-of-australia-2011-12,Australia,31 runs,1.0,Ravichandran Ashwin,right-arm offbreak,1.0,0,0,2,0.0,2.0,0.0,0.0,2,118,0.0,0.0,10,RA_OB,spin,2012.0,P1,yes
378412,1,3,,0.03,0.3,0,0,False,False,False,0,0,0,0,0,236,214,3,,,OFF_SIDE_DRIVE_ON_FRONT_FOOT,48739,12894,,0,0,Ashwin to Warner,518954_1,518954_1_1_3,,,,quicker delivery and it forces him forward to ...,518954,1,David Warner,left-hand bat,Australia,171.0,4.0,0.0,20.0,120.0,3.0,0.0,1.0,2.0,0.0,2012-02-01 00:00:00+00:00,"Stadium Australia, Sydney",night,India tour of Australia,Australia,India,Sydney,india-tour-of-australia-2011-12,Australia,31 runs,1.0,Ravichandran Ashwin,right-arm offbreak,1.0,0,0,3,0.0,3.0,0.0,0.0,3,117,0.0,0.0,10,RA_OB,spin,2012.0,P1,yes
378411,1,4,,0.04,0.4,1,1,False,False,False,0,0,0,0,0,100,229,6,,,LEG_GLANCE,48739,12894,,1,0,Ashwin to Warner,518954_1,518954_1_1_4,,,,"slower through the air, rocks back and clips i...",518954,1,David Warner,left-hand bat,Australia,171.0,4.0,0.0,20.0,120.0,3.0,0.0,1.0,2.0,0.0,2012-02-01 00:00:00+00:00,"Stadium Australia, Sydney",night,India tour of Australia,Australia,India,Sydney,india-tour-of-australia-2011-12,Australia,31 runs,1.0,Ravichandran Ashwin,right-arm offbreak,1.0,0,0,4,1.0,4.0,1.0,-1.0,4,116,1.5,-0.051724,10,RA_OB,spin,2012.0,P1,yes
378410,1,5,,0.05,0.5,0,0,False,False,False,0,0,0,0,0,252,148,2,,,CUT_SHOT_ON_BACK_FOOT,49024,12894,,1,0,Ashwin to Wade,518954_1,518954_1_1_5,,,,waits for it and and steers it to the off side...,518954,1,Matthew Wade,left-hand bat,Australia,171.0,4.0,0.0,20.0,120.0,3.0,0.0,1.0,2.0,0.0,2012-02-01 00:00:00+00:00,"Stadium Australia, Sydney",night,India tour of Australia,Australia,India,Sydney,india-tour-of-australia-2011-12,Australia,31 runs,2.0,Ravichandran Ashwin,right-arm offbreak,1.0,0,0,5,0.0,1.0,0.0,0.0,5,115,1.2,0.0,10,RA_OB,spin,2012.0,P1,yes


# MODEL BUILDING

In [9]:
# params = {
#         'min_child_weight': [1, 5, 10],
#         'max_depth': [3, 7, 10],
#         'learning_rate' : [0.005, 0.01, 0.2, 0.5, 0.8],
#         'n_estimators' : [400, 700, 1000]
#         }

# model2 = XGBRegressor(seed=123)

# grid_search = GridSearchCV(
#     estimator=model1,
#     param_grid=params,
#     scoring = 'explained_variance',
#     n_jobs = 10,
#     cv = 2,
#     verbose=True
# )

# grid_search.best_estimator_.get_params()

EXPECTED RUNS AND RUNS ABOVE AVERAGE

In [None]:
#RAA
np.random.seed(123)

inns1 = bbb_espn.query("inns_no==1").reset_index(drop=True)
print(inns1.shape)
inns1['inns_RR_1ball_bef'] = inns1.groupby(['inns_id']).inns_RR.shift(1)
inns1['wkts_rem_1ball_bef'] = inns1.groupby(['inns_id']).totalInningWickets1.shift(1)
inns1['inns_balls_remain'] = inns1['inns_balls_remain']+1
f_inns1 = ['inns_RR_1ball_bef', 'wkts_rem_1ball_bef', 'inns_balls_remain', 'batsmanRuns']

inns2 = bbb_espn.query("inns_no==2").reset_index(drop=True)
print(inns2.shape)
inns2['inns_RR_1ball_bef'] = inns2.groupby(['inns_id']).inns_RR.shift(1)
inns2['inns_RRR_1ball_bef'] = inns2.groupby(['inns_id']).inns_RRR.shift(1)
inns2['wkts_rem_1ball_bef'] = inns2.groupby(['inns_id']).totalInningWickets1.shift(1)
inns2['inns_balls_remain'] = inns2['inns_balls_remain']+1
f_inns2 = ['inns_RR_1ball_bef', 'inns_RRR_1ball_bef', 'wkts_rem_1ball_bef', 'inns_balls_remain', 'batsmanRuns']

req1  = np.random.choice(inns1.inns_id.unique(), size=int(round(len(inns1.inns_id.unique())*0.1, 0)), replace=False)
req2  = np.random.choice(inns2.inns_id.unique(), size=int(round(len(inns2.inns_id.unique())*0.1, 0)), replace=False)

inns1_train = inns1.query("~inns_id.isin(@req1)")
inns1_test = inns1.query("inns_id.isin(@req1)")

inns2_train = inns2.query("~inns_id.isin(@req2)")
inns2_test = inns2.query("inns_id.isin(@req2)")

# params = {
#         'min_child_weight': [1, 5, 8, 10],
#         'max_depth': [3, 5, 7, 9, 10],
#         'learning_rate' : [0.01, 0.05, 0.08, 0.1, 0.2, 0.5, 0.8],
#         'n_estimators' : [400, 600, 800, 1000]
#         }
# total combinations : 560

# LinregressResult(slope=0.7260158610885565, intercept=37.54591759214887, rvalue=0.864184304781776, pvalue=0.0, stderr=0.003191772761314065, intercept_stderr=0.454711006355789)
# model1 = XGBRegressor(learning_rate=0.5, max_depth=8, n_estimators=500, min_child_weight=1)
# model2 = XGBRegressor(learning_rate=0.5, max_depth=8, n_estimators=500, min_child_weight=1)

# LinregressResult(slope=0.7570781663284472, intercept=32.99422830074745, rvalue=0.8860206167211409, pvalue=0.0, stderr=0.002991273262444784, intercept_stderr=0.42614715306093615)
# model1 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=800, min_child_weight=5)
# model2 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=800, min_child_weight=5)

model1 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=1000, min_child_weight=8, seed=123)
model2 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=800, min_child_weight=5, seed=123)

# LinregressResult(slope=0.7368101436353484, intercept=35.79987633681239, rvalue=0.8721584658158169, pvalue=0.0, stderr=0.003120615857918432, intercept_stderr=0.4445737473552864)
# model1 = XGBRegressor(learning_rate=0.5, max_depth=10, n_estimators=800, min_child_weight=10)
# model2 = XGBRegressor(learning_rate=0.5, max_depth=10, n_estimators=800, min_child_weight=10)

# WHEN TRAINED AND TESTED ON SAME DATA LinregressResult(slope=0.6585272096425859, intercept=49.66798377615004, rvalue=0.8484954641099838, pvalue=0.0, stderr=0.004367388684902399, intercept_stderr=0.6544361472854016)
# model1 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=1000, min_child_weight=8)

# LinregressResult(slope=0.6657074620256712, intercept=48.44043775695228, rvalue=0.8425571129341449, pvalue=1.6451842791418427e-239, stderr=0.014321078510475907, intercept_stderr=2.152731193240927)
# model1 = XGBRegressor(learning_rate=0.2, max_depth=8, n_estimators=600, min_child_weight=8)
# USE THIS FOR TEST TRAIN SPLITS

# LinregressResult(slope=0.6682784345042527, intercept=48.13781502640809, rvalue=0.8411993254946545, pvalue=5.272917491933809e-238, stderr=0.01445622016930397, intercept_stderr=2.173045561621261)
# model1 = XGBRegressor(learning_rate=0.6, max_depth=8, n_estimators=600, min_child_weight=10)

# LinregressResult(slope=0.6652713064135554, intercept=48.56742373370783, rvalue=0.8405075037304309, pvalue=3.046921612455998e-237, stderr=0.014431642750313766, intercept_stderr=2.1693511068725586)
# model1 = XGBRegressor(learning_rate=0.6, max_depth=8, n_estimators=400, min_child_weight=10)

# LinregressResult(slope=0.7337311062500088, intercept=34.10072366758543, rvalue=0.8315422222188092, pvalue=7.449400212355632e-224, stderr=0.01663591772976255, intercept_stderr=2.2632480759709406)
# model2 = XGBRegressor(learning_rate=0.02, max_depth=4, n_estimators=600, min_child_weight=7)
# USE THIS FOR TEST TRAIN SPLITS

## LinregressResult(slope=0.7690090044974333, intercept=27.687969545246887, rvalue=0.8399794418397505, pvalue=1.1199446833103733e-232, stderr=0.016861541713639436, intercept_stderr=2.2876220394308713)
# LinregressResult(slope=0.7227670772626491, intercept=35.942036821771225, rvalue=0.8220953586461162, pvalue=1.5364941483493187e-214, stderr=0.016990082712073473, intercept_stderr=2.2920171897035324)
# model2 = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=600, min_child_weight=7)

# LinregressResult(slope=0.7207274805083628, intercept=34.89554055451178, rvalue=0.8177398473423845, pvalue=1.9800377539880723e-210, stderr=0.017218994478272093, intercept_stderr=2.3228981284219645)
# model2 = XGBRegressor(learning_rate=0.2, max_depth=4, n_estimators=1000, min_child_weight=5)

# WHEN TRAINED AND TESTED ON SAME DATA LinregressResult(slope=0.8362165049148709, intercept=20.319135629251974, rvalue=0.9165225384086934, pvalue=0.0, stderr=0.003913653529295118, intercept_stderr=0.5265406398247563)
# model2 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=800, min_child_weight=5)

model1.fit(inns1_train[f_inns1[0:3]].replace(np.inf, np.nan).fillna(0), inns1_train[f_inns1[3]].replace(np.inf, np.nan).fillna(0))
model2.fit(inns2_train[f_inns2[0:4]].replace(np.inf, np.nan).fillna(0), inns2_train[f_inns2[4]].replace(np.inf, np.nan).fillna(0))

# model1.fit(inns1[f_inns1[0:3]].replace(np.inf, np.nan).fillna(0), inns1[f_inns1[3]].replace(np.inf, np.nan).fillna(0))
# model2.fit(inns2[f_inns2[0:4]].replace(np.inf, np.nan).fillna(0), inns2[f_inns2[4]].replace(np.inf, np.nan).fillna(0))

# inns1["avg_runs"] = model1.predict(inns1[f_inns1[0:3]])
# inns2["avg_runs"] = model2.predict(inns2[f_inns2[0:4]])
# inns1 = inns1.drop(['wkts_rem_1ball_bef', 'inns_RR_1ball_bef', 'inns_balls_remain'], axis=1)
# inns2 = inns2.drop(['wkts_rem_1ball_bef', 'inns_RR_1ball_bef', 'inns_RRR_1ball_bef', 'inns_balls_remain'], axis=1)

inns1_test["avg_runs"] = model1.predict(inns1_test[f_inns1[0:3]])
inns1_test = inns1_test.drop(['wkts_rem_1ball_bef', 'inns_RR_1ball_bef', 'inns_balls_remain'], axis=1)
inns2_test["avg_runs"] = model2.predict(inns2_test[f_inns2[0:4]])
inns2_test = inns2_test.drop(['wkts_rem_1ball_bef', 'inns_RR_1ball_bef', 'inns_RRR_1ball_bef', 'inns_balls_remain'], axis=1)

# inns_concat_raa = pd.concat([inns1,inns2])
# inns_concat_raa['RAA'] = inns_concat_raa['batsmanRuns']-inns_concat_raa['avg_runs']

# a = inns_concat_raa.groupby("inns_id").agg(inns_runs=("batsmanRuns", "sum"), inns_avg_runs=("avg_runs", "sum"))
# print(linregress(a['inns_runs'], a['inns_avg_runs']))

print(linregress(inns1_test.groupby("inns_id").agg(inns_runs=("batsmanRuns", "sum"), inns_avg_runs=("avg_runs", "sum"))))
print(linregress(inns2_test.groupby("inns_id").agg(inns_runs=("batsmanRuns", "sum"), inns_avg_runs=("avg_runs", "sum"))))

# with open("xgbR_RAA_inns1.pkl", 'wb') as file:  
#     pickle.dump(model1, file)
    
# with open("xgbR_RAA_inns2.pkl", 'wb') as file:  
#     pickle.dump(model2, file)
    
# sns.scatterplot(data=a, x='inns_runs', y='inns_avg_runs')
# plt.xlabel("innings actual runs")
# plt.ylabel("innings expected runs")
# plt.show()

# a['ir_bins'] = pd.cut(a['inns_runs'], np.arange(a['inns_runs'].min(),a['inns_runs'].max()+1, 2))
# a1 = a.groupby(['ir_bins']).agg(inns_runs=("inns_runs", "mean"), inns_avg_runs=("inns_avg_runs", "mean")).query("inns_runs.between(120,220, inclusive='both')")
# sns.scatterplot(a1, x='inns_runs', y='inns_avg_runs')
# plt.show()

# print(linregress(a1['inns_runs'], a1['inns_avg_runs']))
# print(linregress(a['inns_runs'], a['inns_avg_runs']))

# ai2 = a.loc[a.index.str.contains("_2")]
# ai2['ir_bins'] = pd.cut(ai2['inns_runs'], np.arange(ai2['inns_runs'].min(),ai2['inns_runs'].max()+1, 2))
# a22 = ai2.groupby(['ir_bins']).agg(inns_runs=("inns_runs", "mean"), 
#                                 inns_avg_runs=("inns_avg_runs", "mean")).query("inns_runs.between(120,220, inclusive='both')")
# print(linregress(a22['inns_runs'], a22['inns_avg_runs']))

# ai1 = a.loc[a.index.str.contains("_1")]
# ai1['ir_bins'] = pd.cut(ai1['inns_runs'], np.arange(ai1['inns_runs'].min(),ai1['inns_runs'].max()+1, 2))
# a21 = ai1.groupby(['ir_bins']).agg(inns_runs=("inns_runs", "mean"), 
#                                 inns_avg_runs=("inns_avg_runs", "mean")).query("inns_runs.between(120,220, inclusive='both')")
# print(linregress(a21['inns_runs'], a21['inns_avg_runs']))

SCORE PREDICTOR

In [None]:
# SCORE PREDICTOR

np.random.seed(123)

inns1 = bbb_espn.query("inns_no==1").reset_index(drop=True)
print(inns1.shape)
f_inns1 = ['totalInningRuns', 'wkt_rem', 'inns_balls_remain', 'runs_inns']

inns2 = bbb_espn.query("inns_no==2").reset_index(drop=True)
print(inns2.shape)
f_inns2 = ['totalInningRuns', 'runs_req', 'wkt_rem', 'inns_balls_remain', 'runs_inns']

req1  = np.random.choice(inns1.inns_id.unique(), size=int(round(len(inns1.inns_id.unique())*0.1, 0)), replace=False)
req2  = np.random.choice(inns2.inns_id.unique(), size=int(round(len(inns2.inns_id.unique())*0.1, 0)), replace=False)

inns1_train = inns1.query("~inns_id.isin(@req1)")
inns1_test = inns1.query("inns_id.isin(@req1)")

inns2_train = inns2.query("~inns_id.isin(@req2)")
inns2_test = inns2.query("inns_id.isin(@req2)")

model1 = XGBRegressor(learning_rate=0.8, max_depth=8, n_estimators=1000, min_child_weight=8, seed=123)
model2 = XGBRegressor(learning_rate=0.8, max_depth=7, n_estimators=800, min_child_weight=5, seed=123)

model1.fit(inns1_train.query("over>1")[f_inns1[0:3]].replace(np.inf, np.nan).fillna(0), 
           inns1_train.query("over>1")[f_inns1[3]].replace(np.inf, np.nan).fillna(0))
model2.fit(inns2_train.query("over>1")[f_inns2[0:4]].replace(np.inf, np.nan).fillna(0), 
           inns2_train.query("over>1")[f_inns2[4]].replace(np.inf, np.nan).fillna(0))

# model1.fit(inns1.query("over>1")[f_inns1[0:3]].replace(np.inf, np.nan).fillna(0), 
#            inns1.query("over>1")[f_inns1[3]].replace(np.inf, np.nan).fillna(0))
# model2.fit(inns2.query("over>1")[f_inns2[0:4]].replace(np.inf, np.nan).fillna(0), 
#            inns2.query("over>1")[f_inns2[4]].replace(np.inf, np.nan).fillna(0))

# inns1["inns_predicted_runs"] = model1.predict(inns1[f_inns1[0:3]])
# inns2["inns_predicted_runs"] = model2.predict(inns2[f_inns2[0:4]])

inns1_test["inns_predicted_runs"] = model1.predict(inns1_test[f_inns1[0:3]])
inns2_test["inns_predicted_runs"] = model2.predict(inns2_test[f_inns2[0:4]])

# inns_concat_raa = pd.concat([inns1,inns2])

# print(linregress(inns_concat_raa.groupby("inns_id").agg(actual_runs=("runs_inns", "max"), 
#                                        avg_pred_runs=("inns_predicted_runs", "mean"))))
# sns.scatterplot(data=inns_concat_raa.groupby("inns_id").agg(actual_runs=("runs_inns", "max"), 
#                                                             avg_pred_runs=("inns_predicted_runs", "mean")),
#                 x='actual_runs', y='avg_pred_runs')
# plt.show()

# a = inns_concat_raa.groupby("inns_id").agg(actual_runs=("runs_inns", "max"), 
#                                                             avg_pred_runs=("inns_predicted_runs", "mean"))
# a['ar_bins'] = pd.cut(a.actual_runs, np.arange(a.actual_runs.min()-1, a.actual_runs.max()+1, 5))
# b = a.groupby("ar_bins").agg(ar_avg=("actual_runs","mean"), pr_avg=("avg_pred_runs", "mean")).query("ar_avg>40")
# sns.scatterplot(b, x='ar_avg', y='pr_avg')

print(linregress(inns1_test.groupby("inns_id").agg(actual_runs=("runs_inns", "max"), 
                                                        avg_pred_runs=("inns_predicted_runs", "mean"))))
print(linregress(inns2_test.groupby("inns_id").agg(actual_runs=("runs_inns", "max"), 
                                                        avg_pred_runs=("inns_predicted_runs", "mean"))))

# print(mean_squared_error(inns1_test.query("over.between(6,18,inclusive='both')")['inns_predicted_runs'], 
#                          inns1_test.query("over.between(6,18,inclusive='both')")['runs_inns'], squared=False))
# print(mean_squared_error(inns2_test.query("over.between(6,18,inclusive='both')")['inns_predicted_runs'], 
#                          inns2_test.query("over.between(6,18,inclusive='both')")['runs_inns'], squared=False))

EXPECTED WICKETS

In [31]:
# EXPECTED WICKETS

np.random.seed(123)

inns1 = bbb_espn.query("inns_no==1").reset_index(drop=True)
print(inns1.shape)
inns1['inns_RR_1ball_bef'] = inns1.groupby(['inns_id']).inns_RR.shift(1)
inns1['wkts_rem_1ball_bef'] = inns1.groupby(['inns_id']).totalInningWickets1.shift(1)
inns1['inns_balls_remain'] = inns1['inns_balls_remain']+1
f_inns1 = ['inns_RR_1ball_bef', 'wkts_rem_1ball_bef', 'inns_balls_remain', 'wicket_flag']

inns2 = bbb_espn.query("inns_no==2").reset_index(drop=True)
print(inns2.shape)
inns2['inns_RR_1ball_bef'] = inns2.groupby(['inns_id']).inns_RR.shift(1)
inns2['inns_RRR_1ball_bef'] = inns2.groupby(['inns_id']).inns_RRR.shift(1)
inns2['wkts_rem_1ball_bef'] = inns2.groupby(['inns_id']).totalInningWickets1.shift(1)
inns2['inns_balls_remain'] = inns2['inns_balls_remain']+1
f_inns2 = ['inns_RR_1ball_bef', 'inns_RRR_1ball_bef', 'wkts_rem_1ball_bef', 'inns_balls_remain', 'wicket_flag']

req1  = np.random.choice(inns1.inns_id.unique(), size=int(round(len(inns1.inns_id.unique())*0.1, 0)), replace=False)
req2  = np.random.choice(inns2.inns_id.unique(), size=int(round(len(inns2.inns_id.unique())*0.1, 0)), replace=False)

inns1_train = inns1.query("~inns_id.isin(@req1)")
inns1_test = inns1.query("inns_id.isin(@req1)")

inns2_train = inns2.query("~inns_id.isin(@req2)")
inns2_test = inns2.query("inns_id.isin(@req2)")

model1 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=1000, min_child_weight=8, seed=123)
model2 = XGBRegressor(learning_rate=0.8, max_depth=10, n_estimators=800, min_child_weight=5, seed=123)

model1.fit(inns1[f_inns1[0:3]].replace(np.inf, np.nan).fillna(0), inns1[f_inns1[3]].replace(np.inf, np.nan).fillna(0))
model2.fit(inns2[f_inns2[0:4]].replace(np.inf, np.nan).fillna(0), inns2[f_inns2[4]].replace(np.inf, np.nan).fillna(0))

inns1["xW"] = model1.predict(inns1[f_inns1[0:3]])
inns2["xW"] = model2.predict(inns2[f_inns2[0:4]])
inns1 = inns1.drop(['wkts_rem_1ball_bef', 'inns_RR_1ball_bef', 'inns_balls_remain'], axis=1)
inns2 = inns2.drop(['wkts_rem_1ball_bef', 'inns_RR_1ball_bef', 'inns_RRR_1ball_bef', 'inns_balls_remain'], axis=1)

inns_concat_raa = pd.concat([inns1,inns2])
# inns_concat_raa['wkts_AA'] = inns_concat_raa['batsmanRuns']-inns_concat_raa['xW']

# a = inns_concat_raa.groupby("inns_id").agg(inns_xW=("xW", "sum"), inns_wkts=("wicket_flag", "sum"))
# print(linregress(a['inns_xW'], a['inns_wkts']))
# sns.scatterplot(x=a['inns_xW'], y=a['inns_wkts'])

(1064883, 79)
(954533, 79)
