<a href="https://colab.research.google.com/github/armogan786/ipl-win/blob/main/ipl_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install xgboost scikit-learn matplotlib seaborn pandas --quiet

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier



# Load data
matches = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/matches.csv')
deliveries = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/deliveries.csv')

# Filter deliveries from matches with result
matches = matches.dropna(subset=['winner'])
match_ids = matches['id'].unique()
deliveries = deliveries[deliveries['match_id'].isin(match_ids)]

# Merge match winner into deliveries
match_winner = matches[['id', 'winner']]
match_winner.columns = ['match_id', 'match_winner']
deliveries = deliveries.merge(match_winner, on='match_id')

# Keep only 2nd innings (for prediction based on chase)
second_innings = deliveries[deliveries['inning'] == 2].copy()

# Sort to calculate features over each delivery
second_innings = second_innings.sort_values(['match_id', 'over', 'ball'])

# Calculate cumulative runs & wickets
second_innings['current_score'] = second_innings.groupby('match_id')['total_runs'].cumsum()
second_innings['ball_number'] = second_innings.groupby('match_id').cumcount() + 1
second_innings['wicket'] = second_innings['player_dismissed'].notna().astype(int)
second_innings['wickets_fallen'] = second_innings.groupby('match_id')['wicket'].cumsum()

# Get target runs
target = deliveries[deliveries['inning'] == 1].groupby('match_id')['total_runs'].sum().reset_index()
target.columns = ['match_id', 'target']
second_innings = second_innings.merge(target, on='match_id')

# Calculate remaining runs and balls
second_innings['runs_left'] = second_innings['target'] - second_innings['current_score']
second_innings['balls_left'] = 120 - second_innings['ball_number']

# Calculate run rate features
second_innings['run_rate'] = second_innings['current_score'] / (second_innings['ball_number'] / 6)
second_innings['required_run_rate'] = (second_innings['runs_left'] / second_innings['balls_left']) * 6
second_innings['required_run_rate'].replace([np.inf, -np.inf], np.nan, inplace=True)
second_innings = second_innings.dropna()

# Final label: 1 if batting_team == match_winner
second_innings['batting_team_won'] = (second_innings['batting_team'] == second_innings['match_winner']).astype(int)

# Select features
features = second_innings[['runs_left', 'balls_left', 'wickets_fallen', 'run_rate', 'required_run_rate']]
labels = second_innings['batting_team_won']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train Logistic Regression
lr_model = LogisticRegression(max_iter=500)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

# Evaluation
print("\nLogistic Regression Accuracy:", accuracy_score(y_test, lr_preds))
print("Logistic Regression ROC AUC:", roc_auc_score(y_test, lr_model.predict_proba(X_test)[:,1]))

print("\nXGBoost Accuracy:", accuracy_score(y_test, xgb_preds))
print("XGBoost ROC AUC:", roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1]))

print("\nClassification Report (XGBoost):")
print(classification_report(y_test, xgb_preds))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  second_innings['required_run_rate'].replace([np.inf, -np.inf], np.nan, inplace=True)



Logistic Regression Accuracy: 0.7903494176372712
Logistic Regression ROC AUC: 0.8796490370059852

XGBoost Accuracy: 0.762063227953411
XGBoost ROC AUC: 0.8517722816087939

Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.78      0.85      0.81       367
           1       0.72      0.63      0.67       234

    accuracy                           0.76       601
   macro avg       0.75      0.74      0.74       601
weighted avg       0.76      0.76      0.76       601



Parameters: { "use_label_encoder" } are not used.



In [2]:
# Save model results to CSV
summary_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'XGBoost'],
    'Accuracy': [accuracy_score(y_test, lr_preds), accuracy_score(y_test, xgb_preds)],
    'ROC_AUC': [roc_auc_score(y_test, lr_model.predict_proba(X_test)[:,1]),
                roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:,1])]
})
summary_df.to_csv('model_comparison.csv', index=False)
files.download('model_comparison.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
import joblib

# Save the model
joblib.dump(xgb_model, 'xgb_model.pkl')


['xgb_model.pkl']

In [6]:
from google.colab import files

# Download the file to your computer
files.download('xgb_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>