In [1]:
## 1. Setup & Library Imports

import pandas as pd
import numpy as np
import pickle # For saving model artifacts
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print("All required libraries imported.")

All required libraries imported.


In [2]:
## 2. Data Loading and Filtering

DATA_FILE = 'ball_by_ball_it20.csv'

try:
    # Load data
    df = pd.read_csv(DATA_FILE)
    
    # Drop redundant column
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])
    
    print(f"Dataset '{DATA_FILE}' loaded successfully. Shape: {df.shape}")
    
    # Filter for 2nd innings data (where the chase occurs)
    df_2nd_innings = df[df['Innings'] == 2].copy()
    
    # Drop rows where critical target features are missing
    df_2nd_innings.dropna(subset=['Runs to Get', 'Balls Remaining', 'Chased Successfully'], inplace=True)
    
    print(f"Filtered 2nd Innings Data Shape: {df_2nd_innings.shape}")
    display(df_2nd_innings.head())
    
except FileNotFoundError:
    print(f"Error: Dataset '{DATA_FILE}' not found. Please check the file path.")
    df_2nd_innings = None

Dataset 'ball_by_ball_it20.csv' loaded successfully. Shape: (425119, 34)
Filtered 2nd Innings Data Shape: (200304, 34)


Unnamed: 0,Match ID,Date,Venue,Bat First,Bat Second,Innings,Over,Ball,Batter,Non Striker,...,Winner,Chased Successfully,Total Batter Runs,Total Non Striker Runs,Batter Balls Faced,Non Striker Balls Faced,Player Out Runs,Player Out Balls Faced,Bowler Runs Conceded,Valid Ball
124,1339605,2023-03-26,SuperSport Park,West Indies,South Africa,2,1,1,Q de Kock,RR Hendricks,...,South Africa,1,4,0,1,0,,,4,1
125,1339605,2023-03-26,SuperSport Park,West Indies,South Africa,2,1,2,Q de Kock,RR Hendricks,...,South Africa,1,8,0,2,0,,,4,1
126,1339605,2023-03-26,SuperSport Park,West Indies,South Africa,2,1,3,Q de Kock,RR Hendricks,...,South Africa,1,8,0,3,0,,,0,1
127,1339605,2023-03-26,SuperSport Park,West Indies,South Africa,2,1,4,Q de Kock,RR Hendricks,...,South Africa,1,8,0,4,0,,,0,1
128,1339605,2023-03-26,SuperSport Park,West Indies,South Africa,2,1,5,Q de Kock,RR Hendricks,...,South Africa,1,9,0,5,0,,,1,1


In [3]:
## 3. Feature Engineering: Run Rates

if df_2nd_innings is not None:
    # 1. Overs Faced
    df_2nd_innings['overs_faced'] = 120 - df_2nd_innings['Balls Remaining']
    
    # 2. Current Run Rate (CRR)
    # Formula: (Total Runs / Balls Faced) * 6
    df_2nd_innings['current_run_rate'] = np.where(
        df_2nd_innings['overs_faced'] > 0, 
        (df_2nd_innings['Innings Runs'] / df_2nd_innings['overs_faced']) * 6, 
        0
    )
    
    # 3. Required Run Rate (RRR)
    # Formula: (Runs to Get / Balls Remaining) * 6
    df_2nd_innings['req_run_rate'] = np.where(
        df_2nd_innings['Balls Remaining'] > 0, 
        (df_2nd_innings['Runs to Get'] / df_2nd_innings['Balls Remaining']) * 6, 
        0
    )
    
    print("Feature engineering complete (overs_faced, CRR, RRR created).")
    display(df_2nd_innings[['overs_faced', 'current_run_rate', 'req_run_rate', 'Chased Successfully']].head())

Feature engineering complete (overs_faced, CRR, RRR created).


Unnamed: 0,overs_faced,current_run_rate,req_run_rate,Chased Successfully
124,1,24.0,12.857143,1
125,2,24.0,12.762712,1
126,3,16.0,12.871795,1
127,4,12.0,12.982759,1
128,5,10.8,13.043478,1


In [4]:
## 4. Feature Selection and Train/Test Split

if df_2nd_innings is not None:
    # Define features and target
    categorical_features = ['Venue', 'Bat First', 'Bat Second']
    numerical_features = ['Runs to Get', 'Balls Remaining', 'Innings Wickets', 'current_run_rate', 'req_run_rate']
    
    X = df_2nd_innings[categorical_features + numerical_features]
    y = df_2nd_innings['Chased Successfully']
    
    # Split data (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42, 
        stratify=y # Important for imbalanced classification
    )
    
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")

X_train shape: (160243, 8)
X_test shape: (40061, 8)


In [5]:
## 5. Preprocessing Pipeline: One-Hot Encoding

if 'X_train' in locals():
    # Define the preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ],
        remainder='passthrough' # Keep numerical features as is
    )
    
    # Fit the preprocessor on the training data
    X_train_processed = preprocessor.fit_transform(X_train)
    
    # Get the feature names after transformation for inspection
    feature_names = preprocessor.get_feature_names_out()
    
    print("Preprocessor fitted on training data.")
    print(f"Processed X_train shape: {X_train_processed.shape}")
    print(f"Example transformed feature names: {feature_names[:10]}...")

Preprocessor fitted on training data.
Processed X_train shape: (160243, 379)
Example transformed feature names: ['cat__Venue_AMI Stadium' 'cat__Venue_Adelaide Oval'
 'cat__Venue_Al Amerat Cricket Ground Oman Cricket (Ministry Turf 1)'
 'cat__Venue_Al Amerat Cricket Ground Oman Cricket (Ministry Turf 2)'
 'cat__Venue_Amini Park' 'cat__Venue_Arnos Vale Ground'
 'cat__Venue_Arun Jaitley Stadium' 'cat__Venue_Barabati Stadium'
 'cat__Venue_Barsapara Cricket Stadium' 'cat__Venue_Bay Oval']...


In [6]:
## 6. Model Training

if 'X_train_processed' in locals():
    # Initialize the Logistic Regression model
    model = LogisticRegression(
        solver='liblinear', 
        random_state=42, 
        max_iter=1000,
        class_weight='balanced' # Added for potentially imbalanced classes
    )
    
    # Train the model
    print("Training Logistic Regression model...")
    model.fit(X_train_processed, y_train)
    print("Model training complete.")

Training Logistic Regression model...
Model training complete.


In [7]:
## 7. Model Evaluation

if 'model' in locals():
    # 1. Transform the test data using the fitted preprocessor
    X_test_processed = preprocessor.transform(X_test)
    
    # 2. Make predictions and get probabilities
    y_pred = model.predict(X_test_processed)
    # Probability for the positive class (1: Chased Successfully)
    y_proba = model.predict_proba(X_test_processed)[:, 1] 
    
    # 3. Calculate all metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_proba)
    }
    
    print("--- Model Performance Metrics ---")
    
    # Display metrics in a formatted table
    metrics_df = pd.DataFrame(metrics.items(), columns=['Metric', 'Score'])
    metrics_df['Score'] = metrics_df['Score'].map('{:.4f}'.format)
    display(metrics_df)
    
    print(f"\nModel Accuracy: {metrics['Accuracy']:.4f}")

--- Model Performance Metrics ---


Unnamed: 0,Metric,Score
0,Accuracy,0.8919
1,Precision,0.8752
2,Recall,0.8971
3,F1-Score,0.886
4,ROC-AUC,0.9609



Model Accuracy: 0.8919


In [8]:
## 8. Saving Model Artifacts

if 'model' in locals() and 'preprocessor' in locals() and 'metrics' in locals():
    
    # 1. Save the trained model
    with open('cricket_predictor_model.pkl', 'wb') as file:
        pickle.dump(model, file)
        
    # 2. Save the preprocessor (Crucial for transforming new data correctly)
    with open('model_preprocessor.pkl', 'wb') as file:
        pickle.dump(preprocessor, file)
        
    # 3. Save the performance metrics
    with open('model_metrics.pkl', 'wb') as file:
        pickle.dump(metrics, file)
        
    print("\n--- Training Pipeline Complete ---")
    print("Model saved as cricket_predictor_model.pkl")
    print("Preprocessor (with categorical mappings) saved as model_preprocessor.pkl")
    print("Metrics saved as model_metrics.pkl")


--- Training Pipeline Complete ---
Model saved as cricket_predictor_model.pkl
Preprocessor (with categorical mappings) saved as model_preprocessor.pkl
Metrics saved as model_metrics.pkl
