In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('all_season_details.csv',dtype={'batsman2_id': str})

In [3]:
df.head()

Unnamed: 0,comment_id,season,match_id,match_name,home_team,away_team,current_innings,innings_id,over,ball,...,wicket_id,wkt_batsman_name,wkt_bowler_name,wkt_batsman_runs,wkt_batsman_balls,wkt_text,isRetiredHurt,text,preText,postText
0,110,2023.0,1359475,GT v CSK,GT,CSK,CSK,1,1,1,...,,,,,,,False,"nice and full, angling and perhaps swinging in...",<p><strong>7.30pm</strong> Gaikwad and Conway ...,
1,120,2023.0,1359475,GT v CSK,GT,CSK,CSK,1,1,2,...,,,,,,,False,"Conway shuffles across off, and shimmies out a...",,
2,130,2023.0,1359475,GT v CSK,GT,CSK,CSK,1,1,3,...,,,,,,,False,"good length, angling in at off and then straig...",,
3,140,2023.0,1359475,GT v CSK,GT,CSK,CSK,1,1,4,...,,,,,,,False,"shorter and slanting into middle and leg, Gaik...","<p>Rummy: ""Fast bowling options for CSK defini...",
4,150,2023.0,1359475,GT v CSK,GT,CSK,CSK,1,1,5,...,,,,,,,False,Shami hits a heavy length and brings this back...,"<p>LG: ""Watching out for Hangargekar. He was i...",


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242550 entries, 0 to 242549
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   comment_id         242550 non-null  int64  
 1   season             242067 non-null  float64
 2   match_id           242550 non-null  int64  
 3   match_name         242550 non-null  object 
 4   home_team          242550 non-null  object 
 5   away_team          242550 non-null  object 
 6   current_innings    242550 non-null  object 
 7   innings_id         242550 non-null  int64  
 8   over               242550 non-null  int64  
 9   ball               242550 non-null  int64  
 10  runs               242550 non-null  int64  
 11  shortText          242550 non-null  object 
 12  isBoundary         242550 non-null  bool   
 13  isWide             242550 non-null  bool   
 14  isNoball           242550 non-null  bool   
 15  batsman1_id        242550 non-null  int64  
 16  ba

In [5]:
df.describe()

Unnamed: 0,comment_id,season,match_id,innings_id,over,ball,runs,batsman1_id,batsman1_runs,batsman1_balls,...,batsman2_runs,batsman2_balls,bowler2_id,bowler2_overs,bowler2_maidens,bowler2_runs,bowler2_wkts,wicket_id,wkt_batsman_runs,wkt_batsman_balls
count,242550.0,242067.0,242550.0,242550.0,242550.0,242550.0,242550.0,242550.0,242550.0,242550.0,...,242550.0,242550.0,229709.0,229709.0,229709.0,229709.0,229709.0,11880.0,11880.0,11880.0
mean,86415.531767,2015.588048,870176.9,1.483245,10.161678,3.483875,1.317901,250305.7,18.047631,14.119819,...,16.551618,13.219233,315165.0,2.198914,0.024483,16.235019,0.551868,263527.3,18.815404,15.311869
std,79516.360959,4.639506,353067.1,0.49972,5.658053,1.707451,1.612821,296880.6,18.341545,11.905881,...,17.771899,11.892068,322205.6,1.046243,0.155527,10.171336,0.773715,303991.5,19.542348,12.695239
min,110.0,2008.0,335982.0,1.0,1.0,1.0,0.0,4292.0,0.0,0.0,...,0.0,0.0,4508.0,0.1,0.0,0.0,0.0,4292.0,0.0,0.0
25%,18040.0,2012.0,548315.0,1.0,5.0,2.0,0.0,32966.0,4.0,5.0,...,3.0,4.0,33335.0,1.0,0.0,8.0,0.0,33141.0,4.0,5.0
50%,29010.0,2015.0,829819.0,1.0,10.0,3.0,1.0,52912.0,12.0,11.0,...,11.0,10.0,265564.0,2.0,0.0,14.0,0.0,211854.0,12.0,12.0
75%,118060.0,2020.0,1216506.0,2.0,15.0,5.0,1.0,379143.0,27.0,20.0,...,25.0,20.0,475281.0,3.0,0.0,23.0,1.0,398439.0,28.0,22.0
max,219080.0,2023.0,1370353.0,2.0,20.0,7.0,7.0,1349361.0,174.0,73.0,...,174.0,73.0,1350792.0,4.0,2.0,70.0,5.0,1312645.0,129.0,69.0


In [6]:
df.shape

(242550, 45)

# Preprocessing

In [8]:
score_df =df[['season', 'match_id', 'home_team', 'away_team', 'current_innings', 'over', 'ball', 
                                         'wkt_batsman_runs', 'isRetiredHurt']].copy()

In [9]:
score_df['wkt_batsman_runs'] = score_df['wkt_batsman_runs'].fillna(0)

In [10]:
score_df['cumulative_runs'] = score_df.groupby(['match_id', 'current_innings'])['wkt_batsman_runs'].cumsum().reset_index(drop=True)


In [11]:
score_df['wickets_lost'] = score_df.groupby(['match_id', 'current_innings'])['wkt_batsman_runs'].apply(lambda x: (x > 0).cumsum()).reset_index(drop=True)


In [12]:
score_df['overs_bowled'] = score_df['over'] + (score_df['ball'] / 6)

In [13]:
score_df['run_rate'] = score_df['cumulative_runs'] / score_df['overs_bowled']

In [14]:
score_df['powerplay'] = score_df['over'] < 6

# Base Model Training and Evaluation

In [16]:
X = score_df[['overs_bowled', 'wickets_lost', 'run_rate', 'powerplay']]
y = score_df['cumulative_runs']

In [17]:
# Convert 'powerplay' from boolean to int
X = X.copy(deep=True)
X['powerplay'] = X['powerplay'].astype(int)
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Initialize Random Forest model
rf_model_simple = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=42,min_samples_leaf=1,min_samples_split=2)
# Train the model
rf_model_simple.fit(X_train, y_train)

In [20]:
y_rf_simple_pred = rf_model_simple.predict(X_test)
# Evaluate performance
rf_simple_mse = mean_squared_error(y_test, y_rf_simple_pred)
rf_simple_r2 = r2_score(y_test, y_rf_simple_pred)
rf_simple_mse, rf_simple_r2

(0.15980264739229028, 0.9999045513977854)

In [21]:
new_data = pd.DataFrame({
    'overs_bowled': [10.2],   # Example: 10 overs and 2 balls bowled
    'wickets_lost': [2],      # Example: 2 wickets lost
    'run_rate': [7.5],        # Example: current run rate is 7.5
    'powerplay': [0]          # Example: not in powerplay (after 6 overs)
})

# Predict total runs for this scenario
predicted_runs = rf_model_simple.predict(new_data)
print(f"Predicted Cumulative Runs: {predicted_runs[0]}")

Predicted Cumulative Runs: 76.015
