In [23]:
import pandas as pd
import numpy as np

# Load the datasets
matches_df = pd.read_csv('matches.csv')
deliveries_df = pd.read_csv('deliveries.csv')

# Convert date to datetime format in matches_df
matches_df['date'] = pd.to_datetime(matches_df['date'])

# Ensure 'season' is in numeric format, handling non-numeric values
matches_df['season'] = pd.to_numeric(matches_df['season'], errors='coerce')

# Drop rows with NaN values in 'season' after conversion
matches_df = matches_df.dropna(subset=['season'])

# Filter out data before 2015
matches_df = matches_df[matches_df['season'] >= 2015]

# Merge the two datasets
merged_df = deliveries_df.merge(matches_df, left_on='match_id', right_on='id')

# Drop unnecessary columns from merged_df
merged_df = merged_df.drop(['id', 'player_of_match', 'method', 'umpire1', 'umpire2', 'player_dismissed', 'dismissal_kind', 'fielder', 'extras_type', 'city'], axis=1)

# Calculate cumulative runs and wickets for each match and inning
merged_df['cumulative_runs'] = merged_df.groupby(['match_id', 'inning'])['total_runs'].cumsum()
merged_df['cumulative_wickets'] = merged_df.groupby(['match_id', 'inning'])['is_wicket'].cumsum()
merged_df = merged_df.dropna()

# Select relevant columns for features and target
features = merged_df[[ 'venue', 'team1', 'team2', 'over', 'cumulative_runs', 'cumulative_wickets']]
target_runs = merged_df.groupby(['match_id', 'inning'])['cumulative_runs'].transform('max')
target_win = merged_df['winner']

# Drop duplicates to keep one row per over and ball combination
features = features.drop_duplicates(subset=[  'venue', 'team1', 'team2', 'over'])

# Ensure the targets align with the features
target_runs = target_runs.loc[features.index]
target_win = target_win.loc[features.index]

# One-hot encode categorical features
features = pd.get_dummies(features, columns=[ 'venue', 'team1', 'team2'])

# Display unique team names, seasons, locations, and stadiums
unique_teams = pd.unique(matches_df[['team1', 'team2']].values.ravel('K'))
unique_seasons = matches_df['season'].unique()
# unique_locations = matches_df['city'].unique()
unique_stadiums = matches_df['venue'].unique()

print("Unique Teams:\n", unique_teams)
print("Unique Seasons:\n", unique_seasons)
# print("Unique Locations:\n", unique_locations)
print("Unique Stadiums:\n", unique_stadiums)


Unique Teams:
 ['Kolkata Knight Riders' 'Chennai Super Kings' 'Kings XI Punjab'
 'Delhi Daredevils' 'Mumbai Indians' 'Royal Challengers Bangalore'
 'Rajasthan Royals' 'Sunrisers Hyderabad' 'Gujarat Lions'
 'Rising Pune Supergiants' 'Rising Pune Supergiant' 'Delhi Capitals'
 'Punjab Kings' 'Lucknow Super Giants' 'Gujarat Titans'
 'Royal Challengers Bengaluru']
Unique Seasons:
 [2015. 2016. 2017. 2018. 2019. 2021. 2022. 2023. 2024.]
Unique Stadiums:
 ['Eden Gardens' 'MA Chidambaram Stadium, Chepauk'
 'Maharashtra Cricket Association Stadium' 'Feroz Shah Kotla'
 'Wankhede Stadium' 'M Chinnaswamy Stadium' 'Sardar Patel Stadium, Motera'
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium'
 'Punjab Cricket Association Stadium, Mohali'
 'Rajiv Gandhi International Stadium, Uppal' 'Brabourne Stadium'
 'Shaheed Veer Narayan Singh International Stadium'
 'JSCA International Stadium Complex'
 'Punjab Cricket Association IS Bindra Stadium, Mohali'
 'Saurashtra Cricket Association Stadium' 'Green

In [30]:
deliveries_df

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260915,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9,5,SS Iyer,AK Markram,VR Iyer,1,0,1,,0,,,
260916,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9,6,VR Iyer,AK Markram,SS Iyer,1,0,1,,0,,,
260917,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10,1,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,,0,,,
260918,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10,2,SS Iyer,Shahbaz Ahmed,VR Iyer,1,0,1,,0,,,


In [28]:
features


Unnamed: 0,over,cumulative_runs,cumulative_wickets,venue_Arun Jaitley Stadium,"venue_Arun Jaitley Stadium, Delhi","venue_Barsapara Cricket Stadium, Guwahati","venue_Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow",venue_Brabourne Stadium,"venue_Brabourne Stadium, Mumbai","venue_Dr DY Patil Sports Academy, Mumbai",...,team2_Kolkata Knight Riders,team2_Lucknow Super Giants,team2_Mumbai Indians,team2_Punjab Kings,team2_Rajasthan Royals,team2_Rising Pune Supergiant,team2_Rising Pune Supergiants,team2_Royal Challengers Bangalore,team2_Royal Challengers Bengaluru,team2_Sunrisers Hyderabad
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,1,8,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
12,2,13,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
18,3,21,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
24,4,29,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137417,14,90,8,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
137423,15,94,8,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
137429,16,99,8,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
137435,17,109,8,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [9]:
target_win

0         Kolkata Knight Riders
1         Kolkata Knight Riders
2         Kolkata Knight Riders
3         Kolkata Knight Riders
4         Kolkata Knight Riders
                  ...          
137441    Kolkata Knight Riders
137442    Kolkata Knight Riders
137443    Kolkata Knight Riders
137450    Kolkata Knight Riders
137457    Kolkata Knight Riders
Name: winner, Length: 59008, dtype: object

In [31]:
target_runs

0         168
6         168
12        168
18        168
24        168
         ... 
137417    113
137423    113
137429    113
137435    113
137441    113
Name: cumulative_runs, Length: 9103, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier



non_numeric_cols = features.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) > 0:
    print(f"Non-numeric columns found: {non_numeric_cols}")
else:
    print("All columns are numeric.")

X_train_runs, X_test_runs, y_train_runs, y_test_runs = train_test_split(features, target_runs, test_size=0.2, random_state=42)

X_train_win, X_test_win, y_train_win, y_test_win = train_test_split(features, target_win, test_size=0.2, random_state=42)

lr = LinearRegression()
rf_reg = RandomForestRegressor(random_state=42)
# svr = SVR(kernel='linear')
knn_reg = KNeighborsRegressor()

rf_clf = RandomForestClassifier(random_state=42)
# svc = SVC(gamma='auto')
knn_clf = KNeighborsClassifier()

lr.fit(X_train_runs, y_train_runs)
print('Linear Regression training done')
rf_reg.fit(X_train_runs, y_train_runs)
print('Random Forest Regressor training done')
# svr.fit(X_train_runs, y_train_runs)
# print('SVR training done')
knn_reg.fit(X_train_runs, y_train_runs)
print('kNN Regressor training done')

rf_clf.fit(X_train_win, y_train_win)
print('Random Forest Classifier training done')
# svc.fit(X_train_win, y_train_win)
# print('SVC training done')
knn_clf.fit(X_train_win, y_train_win)
print('kNN Classifier training done')




All columns are numeric.
Linear Regression training done
Random Forest Regressor training done
kNN Regressor training done
Random Forest Classifier training done
kNN Classifier training done


In [26]:

y_pred_lr = lr.predict(X_test_runs)
y_pred_rf_reg = rf_reg.predict(X_test_runs)
# y_pred_svr = svr.predict(X_test_runs)
y_pred_knn_reg = knn_reg.predict(X_test_runs)

y_pred_rf_clf = rf_clf.predict(X_test_win)
# y_pred_svc = svc.predict(X_test_win)
y_pred_knn_clf = knn_clf.predict(X_test_win)

y_pred_lr = np.maximum(y_pred_lr, 0)
# y_pred_svr = np.maximum(y_pred_svr, 0)
y_pred_knn_reg = np.maximum(y_pred_knn_reg, 0)

mae_lr = mean_absolute_error(y_test_runs, y_pred_lr)

mse_lr = mean_squared_error(y_test_runs, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

mae_rf_reg = mean_absolute_error(y_test_runs, y_pred_rf_reg)
mse_rf_reg = mean_squared_error(y_test_runs, y_pred_rf_reg)
rmse_rf_reg = np.sqrt(mse_rf_reg)

# mae_svr = mean_absolute_error(y_test_runs, y_pred_svr)
# mse_svr = mean_squared_error(y_test_runs, y_pred_svr)
# rmse_svr = np.sqrt(mse_svr)

mae_knn_reg = mean_absolute_error(y_test_runs, y_pred_knn_reg)
mse_knn_reg = mean_squared_error(y_test_runs, y_pred_knn_reg)
rmse_knn_reg = np.sqrt(mse_knn_reg)

accuracy_rf_clf = accuracy_score(y_test_win, y_pred_rf_clf)
# accuracy_svc = accuracy_score(y_test_win, y_pred_svc)

accuracy_knn_clf = accuracy_score(y_test_win, y_pred_knn_clf)

conf_matrix_rf_clf = confusion_matrix(y_test_win, y_pred_rf_clf)
# conf_matrix_svc = confusion_matrix(y_test_win, y_pred_svc)
conf_matrix_knn_clf = confusion_matrix(y_test_win, y_pred_knn_clf)

print(f'Linear Regression - MAE: {mae_lr}, MSE: {mse_lr}, RMSE: {rmse_lr}')
print(f'Random Forest Regressor - MAE: {mae_rf_reg}, MSE: {mse_rf_reg}, RMSE: {rmse_rf_reg}')
# print(f'SVR - MAE: {mae_svr}, MSE: {mse_svr}, RMSE: {rmse_svr}')
print(f'kNN Regressor - MAE: {mae_knn_reg}, MSE: {mse_knn_reg}, RMSE: {rmse_knn_reg}')
print(f'Random Forest Classifier - Accuracy: {accuracy_rf_clf}')
# print(f'SVC - Accuracy: {accuracy_svc}')
print(f'kNN Classifier - Accuracy: {accuracy_knn_clf}')

print(f'Confusion Matrix for Random Forest Classifier:\n{conf_matrix_rf_clf}')
# print(f'Confusion Matrix for SVC:\n{conf_matrix_svc}')
print(f'Confusion Matrix for kNN Classifier:\n{conf_matrix_knn_clf}')


Linear Regression - MAE: 15.281909800579523, MSE: 414.7747563381979, RMSE: 20.366019648870957
Random Forest Regressor - MAE: 11.948248215266338, MSE: 268.6551556287754, RMSE: 16.390703329289302
kNN Regressor - MAE: 17.63338824821527, MSE: 563.9879626578803, RMSE: 23.74843074095382
Random Forest Classifier - Accuracy: 1.0
kNN Classifier - Accuracy: 0.30807248764415157
Confusion Matrix for Random Forest Classifier:
[[239   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 140   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  58   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  43   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  92   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  70   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 188   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  90   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0 172   0   0  

In [27]:
import joblib

# Save models for score prediction
joblib.dump(lr, 'linear_regression_model.pkl')
joblib.dump(rf_reg, 'random_forest_regressor_model.pkl')
# joblib.dump(svr, 'svr_model.pkl')
joblib.dump(knn_reg, 'knn_regressor_model.pkl')

# Save models for win prediction
joblib.dump(rf_clf, 'random_forest_classifier_model.pkl')
# joblib.dump(svc, 'svc_model.pkl')
joblib.dump(knn_clf, 'knn_classifier_model.pkl')
joblib.dump(features, 'features.pkl')


['features.pkl']