In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,cross_val_score

In [2]:
# Load the dataset; CSV column 0 is used as the row index.
# low_memory=False is passed through to pandas unchanged.
df = pd.read_csv(
    'all_season_details_data_after_eda.csv',
    index_col=0,
    low_memory=False,
)

In [3]:
# Preview the first two rows to sanity-check the load.
df.head(2)

Unnamed: 0_level_0,season,match_id,match_name,home_team,away_team,current_innings,innings_id,over,ball,runs,...,wicket_id,wkt_batsman_name,wkt_bowler_name,wkt_batsman_runs,wkt_batsman_balls,wkt_text,isRetiredHurt,text,preText,postText
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
110,2023.0,1359475,GT v CSK,GT,CSK,CSK,1,1,1,0,...,,,,,,,False,"nice and full, angling and perhaps swinging in...",<p><strong>7.30pm</strong> Gaikwad and Conway ...,
120,2023.0,1359475,GT v CSK,GT,CSK,CSK,1,1,2,1,...,,,,,,,False,"Conway shuffles across off, and shimmies out a...",,


In [4]:
# List every column available in the loaded frame.
df.columns

Index(['season', 'match_id', 'match_name', 'home_team', 'away_team',
       'current_innings', 'innings_id', 'over', 'ball', 'runs', 'shortText',
       'isBoundary', 'isWide', 'isNoball', 'batsman1_id', 'batsman1_name',
       'batsman1_runs', 'batsman1_balls', 'bowler1_id', 'bowler1_name',
       'bowler1_overs', 'bowler1_maidens', 'bowler1_runs', 'bowler1_wkts',
       'batsman2_id', 'batsman2_name', 'batsman2_runs', 'batsman2_balls',
       'bowler2_id', 'bowler2_name', 'bowler2_overs', 'bowler2_maidens',
       'bowler2_runs', 'bowler2_wkts', 'wicket_id', 'wkt_batsman_name',
       'wkt_bowler_name', 'wkt_batsman_runs', 'wkt_batsman_balls', 'wkt_text',
       'isRetiredHurt', 'text', 'preText', 'postText'],
      dtype='object')

In [5]:
# Fill missing `season` values with 2023.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the `df['season']` selection: that chained
# in-place form emits a FutureWarning in recent pandas and leaves `df`
# untouched once Copy-on-Write is enabled.
# NOTE(review): this treats every row with a missing season as 2023 — confirm
# that no other season has gaps.
df['season'] = df['season'].fillna(2023)

In [6]:
# Set season to 2023 for the two listed matches wherever it is still missing.
# NOTE(review): if `season` has already been blanket-filled earlier in the
# notebook, no nulls remain and this assignment selects zero rows.
season_missing_for_match = (
    df['match_id'].isin([1359496, 1359538]) & df['season'].isna()
)
df.loc[season_missing_for_match, 'season'] = 2023

In [7]:
# Work on an independent copy that holds only the columns used downstream.
selected_columns = [
    'season', 'match_id', 'home_team', 'away_team', 'current_innings',
    'innings_id', 'over', 'ball', 'runs', 'wicket_id',
    'wkt_batsman_name', 'wkt_batsman_runs',
]
ball_by_ball_df = df[selected_columns].copy()

# Running totals below restart for every (match_id, current_innings) pair.
innings_keys = ['match_id', 'current_innings']

# Step 1: running total of `runs` within each group, accumulated in the
# frame's current row order.
ball_by_ball_df['cumulative_runs'] = (
    ball_by_ball_df.groupby(innings_keys)['runs'].cumsum()
)

# Step 2: run rate = cumulative runs / (over + 0.1); the 0.1 offset guards
# against a zero denominator.
# NOTE(review): the denominator ignores `ball`, so the value is constant-scaled
# within an over rather than a true per-ball run rate — confirm this is intended.
ball_by_ball_df['run_rate'] = ball_by_ball_df['cumulative_runs'].div(
    ball_by_ball_df['over'] + 0.1
)

# Step 3: wickets lost so far = running count of rows whose `wkt_batsman_runs`
# is >= 0. Missing values compare as False and are therefore not counted.
ball_by_ball_df['wickets_lost'] = (
    ball_by_ball_df
    .groupby(innings_keys)['wkt_batsman_runs']
    .transform(lambda dismissal_runs: dismissal_runs.ge(0).cumsum())
)

# Step 4: replace every remaining NaN in the frame with 0.
# NOTE(review): this also writes 0 into non-numeric columns such as
# `wkt_batsman_name`.
ball_by_ball_df = ball_by_ball_df.fillna(0)

In [8]:
# Inspect the first ten rows to check the derived columns
# (cumulative_runs, run_rate, wickets_lost).
ball_by_ball_df.head(10)

Unnamed: 0_level_0,season,match_id,home_team,away_team,current_innings,innings_id,over,ball,runs,wicket_id,wkt_batsman_name,wkt_batsman_runs,cumulative_runs,run_rate,wickets_lost
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
110,2023.0,1359475,GT,CSK,CSK,1,1,1,0,0.0,0,0.0,0,0.0,0
120,2023.0,1359475,GT,CSK,CSK,1,1,2,1,0.0,0,0.0,1,0.909091,0
130,2023.0,1359475,GT,CSK,CSK,1,1,3,0,0.0,0,0.0,1,0.909091,0
140,2023.0,1359475,GT,CSK,CSK,1,1,4,1,0.0,0,0.0,2,1.818182,0
150,2023.0,1359475,GT,CSK,CSK,1,1,5,0,0.0,0,0.0,2,1.818182,0
160,2023.0,1359475,GT,CSK,CSK,1,1,6,0,0.0,0,0.0,2,1.818182,0
11010,2023.0,1359475,GT,CSK,CSK,1,2,1,4,0.0,0,0.0,6,2.857143,0
11020,2023.0,1359475,GT,CSK,CSK,1,2,2,0,0.0,0,0.0,6,2.857143,0
11030,2023.0,1359475,GT,CSK,CSK,1,2,3,4,0.0,0,0.0,10,4.761905,0
11040,2023.0,1359475,GT,CSK,CSK,1,2,4,1,0.0,0,0.0,11,5.238095,0


In [10]:
from sklearn.preprocessing import LabelEncoder
import pickle

# A single encoder is shared by all three team columns so that a given team
# maps to the same integer wherever it appears.
team_encoder = LabelEncoder()

# Fit on the union of labels from every team column. Fitting on `home_team`
# alone makes the later `transform` calls raise ValueError for any team that
# only ever appears in `away_team` or `current_innings`. When every team does
# appear in `home_team`, the learned classes (and therefore the integer codes)
# are identical to fitting on `home_team` only.
all_team_labels = pd.concat(
    [
        ball_by_ball_df['home_team'],
        ball_by_ball_df['away_team'],
        ball_by_ball_df['current_innings'],
    ],
    ignore_index=True,
).unique()
team_encoder.fit(all_team_labels)

# Encode `home_team`, `away_team`, and `current_innings` with the shared mapping.
ball_by_ball_df['home_team_encoded'] = team_encoder.transform(ball_by_ball_df['home_team'])
ball_by_ball_df['away_team_encoded'] = team_encoder.transform(ball_by_ball_df['away_team'])
ball_by_ball_df['current_innings_encoded'] = team_encoder.transform(ball_by_ball_df['current_innings'])

# Save the fitted encoder so prediction code can reuse the same mapping.
with open('team_encoder.pkl', 'wb') as f:
    pickle.dump(team_encoder, f)


In [11]:
# Feature matrix: over/ball position, run rate, wickets lost and the three
# encoded team columns.
X = ball_by_ball_df.loc[:, [
    'over', 'ball', 'run_rate', 'wickets_lost',
    'home_team_encoded', 'away_team_encoded', 'current_innings_encoded',
]]

# Target: the running total of runs at each ball.
# NOTE(review): `run_rate` is derived in this notebook as
# cumulative_runs / (over + 0.1), so the target equals run_rate * (over + 0.1)
# and can be reconstructed exactly from two features. Scores obtained on this
# X/Y pair measure that identity, not predictive skill — consider predicting
# the final innings total instead.
Y = ball_by_ball_df.loc[:, 'cumulative_runs']


In [12]:

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# repeatable.
# NOTE(review): rows are shuffled individually, so balls from one innings can
# land in both the training and the test set — a grouped split by match may be
# more appropriate.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Display the training features.
x_train

Unnamed: 0_level_0,over,ball,run_rate,wickets_lost,home_team_encoded,away_team_encoded,current_innings_encoded
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
111050,12,4,5.123967,3,10,4,10
116010,17,1,8.596491,2,1,14,1
150,1,5,4.545455,1,3,1,1
17030,8,3,8.395062,0,13,11,13
18010,9,1,6.153846,2,14,0,14
...,...,...,...,...,...,...,...
19050,10,5,7.029703,3,4,14,14
211030,12,3,8.016529,1,14,8,14
24080,5,6,9.607843,2,5,11,11
117050,18,5,7.182320,5,4,14,14


In [14]:
# Display the training target.
y_train

comment_id
111050     62
116010    147
150         5
17030      68
18010      56
         ... 
19050      71
211030     97
24080      49
117050    130
29050      74
Name: cumulative_runs, Length: 194040, dtype: int64

In [15]:
# Display the held-out features.
x_test

Unnamed: 0_level_0,over,ball,run_rate,wickets_lost,home_team_encoded,away_team_encoded,current_innings_encoded
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19040,10,4,4.455446,3,13,8,8
14010,5,1,5.294118,0,8,0,8
119050,20,5,9.402985,6,3,14,14
11040,2,4,0.952381,2,0,8,0
217060,18,6,5.856354,5,14,5,14
...,...,...,...,...,...,...,...
116040,17,4,8.304094,3,3,1,3
213020,14,2,9.503546,3,0,1,0
11060,2,6,6.190476,0,4,13,13
213060,14,6,6.241135,2,4,8,4


In [16]:
# Display the held-out target.
y_test

comment_id
19040      45
14010      27
119050    189
11040       2
217060    106
         ... 
116040    142
213020    134
11060      13
213060     88
14030      52
Name: cumulative_runs, Length: 48510, dtype: int64

In [17]:
# Random forest of 200 trees with no depth limit; the seed is fixed so
# repeated runs build the same forest.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

In [18]:
# Train the random forest on the training split.
rf_model.fit(x_train,y_train)

In [19]:
# Predict the target for the held-out rows.
y_rf_simple_pred=rf_model.predict(x_test)

In [20]:
# Score the random-forest predictions against the held-out target.
rf_mse = mean_squared_error(y_true=y_test, y_pred=y_rf_simple_pred)
rf_r2 = r2_score(y_true=y_test, y_pred=y_rf_simple_pred)

In [21]:
# Report the random-forest evaluation metrics (one line per metric, followed
# by a blank line).
print(
    "Random Forest Regressor:",
    f"  Mean Squared Error: {rf_mse}",
    f"  R-squared: {rf_r2}\n",
    sep="\n",
)


Random Forest Regressor:
  Mean Squared Error: 0.05296923160173164
  R-squared: 0.9999788189812224



In [22]:
# Serialize the fitted random forest to disk.
# NOTE(review): the file is named 'xgb_model.pkl' although the object written
# here is `rf_model`; the name is kept because later cells read this path.
with open('xgb_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)


In [23]:
# Read the pickled estimator back from disk into `model`.
# NOTE(review): in a top-to-bottom run the file at this point holds the random
# forest written by the preceding cell, despite the 'xgb' file name.
# Only unpickle files you produced yourself — pickle can execute arbitrary code.
with open('xgb_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)


In [24]:
# Reload the saved label encoder, replacing the in-memory `team_encoder`.
# Only unpickle files you produced yourself — pickle can execute arbitrary code.
with open('team_encoder.pkl', 'rb') as encoder_file:
    team_encoder = pickle.load(encoder_file)

In [25]:
# Example scenario: home side, away side, and the side recorded in
# `current_innings` (presumably the batting team — verify against the data).
home_team, away_team, current_innings = 'CSK', 'MI', 'CSK'

# Map the three labels to their integer codes in a single encoder call and
# unpack the resulting array in the same order.
home_team_encoded, away_team_encoded, current_innings_encoded = team_encoder.transform(
    [home_team, away_team, current_innings]
)

In [26]:
# Single-row input for the model, described as one record. The keys use the
# same column names and order as the training feature matrix.
example_state = {
    'over': 10,
    'ball': 2,
    'run_rate': 10,
    'wickets_lost': 9,
    'home_team_encoded': home_team_encoded,
    'away_team_encoded': away_team_encoded,
    'current_innings_encoded': current_innings_encoded,
}
new_data = pd.DataFrame([example_state])

In [27]:
# Run the loaded estimator on the single example row and show its prediction.
# `predict` returns one value per input row, so element 0 is the only result.
predicted_runs = model.predict(new_data)
print(f"Predicted Cumulative Runs: {predicted_runs[0]}")

Predicted Cumulative Runs: 101.0


In [28]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline on the same train/test split.
lr_model = LinearRegression().fit(x_train, y_train)
y_pred_lr = lr_model.predict(x_test)

# Held-out error and goodness of fit.
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)

print(
    "Linear Regression:",
    f"  Mean Squared Error: {mse_lr}",
    f"  R-squared: {r2_lr}\n",
    sep="\n",
)

Linear Regression:
  Mean Squared Error: 265.1154304267054
  R-squared: 0.8939872310719359



In [29]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree (seeded) on the same train/test split.
dt_model = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)
y_pred_dt = dt_model.predict(x_test)

# Held-out error and goodness of fit.
mse_dt = mean_squared_error(y_true=y_test, y_pred=y_pred_dt)
r2_dt = r2_score(y_true=y_test, y_pred=y_pred_dt)

print(
    "Decision Tree Regressor:",
    f"  Mean Squared Error: {mse_dt}",
    f"  R-squared: {r2_dt}\n",
    sep="\n",
)

Decision Tree Regressor:
  Mean Squared Error: 0.13766233766233765
  R-squared: 0.9999449524097137



In [30]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees (seeded) on the same train/test split.
gb_model = GradientBoostingRegressor(random_state=42).fit(x_train, y_train)
y_pred_gb = gb_model.predict(x_test)

# Held-out error and goodness of fit.
mse_gb = mean_squared_error(y_true=y_test, y_pred=y_pred_gb)
r2_gb = r2_score(y_true=y_test, y_pred=y_pred_gb)

print(
    "Gradient Boosting Regressor:",
    f"  Mean Squared Error: {mse_gb}",
    f"  R-squared: {r2_gb}\n",
    sep="\n",
)

Gradient Boosting Regressor:
  Mean Squared Error: 3.831470830387837
  R-squared: 0.9984678944143585



In [31]:
from xgboost import XGBRegressor

# XGBoost regressor (seeded) on the same train/test split.
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(x_train, y_train)
y_pred_xgb = xgb_model.predict(x_test)

# Held-out error and goodness of fit.
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
r2_xgb = r2_score(y_true=y_test, y_pred=y_pred_xgb)

print(
    "XGBoost Regressor:",
    f"  Mean Squared Error: {mse_xgb}",
    f"  R-squared: {r2_xgb}\n",
    sep="\n",
)

XGBoost Regressor:
  Mean Squared Error: 1.3238479603240592
  R-squared: 0.9994706276142138



In [34]:
import pickle

# Persist the XGBoost regressor that was already fitted and evaluated earlier
# in the notebook (`xgb_model`), instead of constructing and training a second
# identical model just to save it. This avoids a redundant training run and
# guarantees the artifact on disk is the exact estimator whose metrics were
# reported.
# NOTE: this overwrites any existing 'xgb_model.pkl', including the one
# written earlier in the notebook.
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

print("Model saved successfully!")


Model saved successfully!
