# Player Performance Predictor - Definitive Model Training

**Goal:** Build our final, high-accuracy, format-specific classification models using our definitive strategy: format-specific career stats, the full dataset for each format, and SMOTE oversampling (applied to the training split only) to address class imbalance.

### Step 1: Configuration - CHOOSE THE FORMAT TO TRAIN

**Action:** Set the `MATCH_FORMAT_TO_TRAIN` variable below to one of 'T20', 'ODI', or 'Test' (the value is case-sensitive). Then, run the entire notebook (`Kernel > Restart & Run All`).

In [None]:
# Formats this notebook knows how to train. The value is compared verbatim
# (case-sensitive) against the `match_type` column and is also used to pick the
# performance bins and to name the saved model files.
SUPPORTED_FORMATS = ("T20", "ODI", "Test")

MATCH_FORMAT_TO_TRAIN = "T20" # <-- CHANGE THIS VALUE (e.g., "ODI" or "Test")

# Fail fast on a typo (e.g. "t20"): otherwise the format filter would select
# zero rows and the bin selection would silently fall through to the Test bins.
if MATCH_FORMAT_TO_TRAIN not in SUPPORTED_FORMATS:
    raise ValueError(
        f"Unsupported MATCH_FORMAT_TO_TRAIN {MATCH_FORMAT_TO_TRAIN!r}; "
        f"expected one of {SUPPORTED_FORMATS}."
    )

print(f"Configuration set to train models for: {MATCH_FORMAT_TO_TRAIN}")

### Step 2: Load Data & Define Custom Bins

In [None]:
import pandas as pd
import numpy as np
import sqlite3
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# --- Load the new, definitive database ---
# Both stats tables are read in full; the per-format subset is selected below.
db_path = '../cricket_final_data_by_format.db'
conn = sqlite3.connect(db_path)
try:
    batting_df_full = pd.read_sql_query("SELECT * FROM batting_stats", conn)
    bowling_df_full = pd.read_sql_query("SELECT * FROM bowling_stats", conn)
finally:
    # Release the database handle even when a query fails (e.g. missing table),
    # so a failed run does not leave the connection open in the kernel.
    conn.close()

# Filter for the chosen format
# .copy() gives independent frames so later column assignments do not touch
# (or warn about) the full tables.
batting_df = batting_df_full[batting_df_full['match_type'] == MATCH_FORMAT_TO_TRAIN].copy()
bowling_df = bowling_df_full[bowling_df_full['match_type'] == MATCH_FORMAT_TO_TRAIN].copy()
# Parse the match date into datetime values.
batting_df['date'] = pd.to_datetime(batting_df['date'])
bowling_df['date'] = pd.to_datetime(bowling_df['date'])
print(f"Data loaded for {MATCH_FORMAT_TO_TRAIN}.")

# --- Define Custom, Format-Specific Bins --- #
# Each list of bin edges is paired with one label per interval. pd.cut uses
# right-closed intervals by default, so edges [-1, 15, 29, ...] produce the
# buckets (-1, 15], (15, 29], ... which line up with the labels.
if MATCH_FORMAT_TO_TRAIN == 'T20':
    run_bins = [-1, 15, 29, 49, 999]
    run_labels = ['0-15', '16-29', '30-49', '50+']
    wicket_bins = [-1, 1, 3, 99]
    wicket_labels = ['0-1', '2-3', '4+']
elif MATCH_FORMAT_TO_TRAIN == 'ODI':
    run_bins = [-1, 24, 49, 99, 999]
    run_labels = ['0-24', '25-49', '50-99', '100+']
    wicket_bins = [-1, 1, 3, 99]
    wicket_labels = ['0-1', '2-3', '4+']
elif MATCH_FORMAT_TO_TRAIN == 'Test':
    run_bins = [-1, 24, 49, 99, 999]
    run_labels = ['0-24', '25-49', '50-99', '100+']
    wicket_bins = [-1, 2, 4, 99]
    wicket_labels = ['0-2', '3-4', '5+']
else:
    # An unrecognised format used to fall through to the Test bins silently;
    # reject it instead so a typo cannot produce a mislabelled model.
    raise ValueError(
        f"Unsupported MATCH_FORMAT_TO_TRAIN {MATCH_FORMAT_TO_TRAIN!r}; "
        "expected 'T20', 'ODI' or 'Test'."
    )

# Values outside the outermost edges (or missing values) become NaN in the
# *_bin columns.
batting_df['runs_bin'] = pd.cut(batting_df['runs'], bins=run_bins, labels=run_labels)
bowling_df['wickets_bin'] = pd.cut(bowling_df['wickets'], bins=wicket_bins, labels=wicket_labels)
print("Custom performance bins defined and applied.")

## Part A: Training the Batting Classification Model

### Step 3: Feature Engineering for Batting

In [None]:
print("Creating features for batting data...")
# --- Calculate Career Averages & Strike Rates --- #
# np.where substitutes 0 wherever the denominator is not positive, so players
# with no recorded innings / balls faced get 0 instead of inf or NaN.
batting_df['career_avg'] = np.where(batting_df['career_innings'] > 0, batting_df['career_runs'] / batting_df['career_innings'], 0)
batting_df['career_sr'] = np.where(batting_df['career_balls_faced'] > 0, (batting_df['career_runs'] / batting_df['career_balls_faced']) * 100, 0)

# --- Calculate 10-Inning Form --- #
# The rolling window follows row order, and the rows come from a SELECT with no
# ORDER BY, so order them chronologically first. The stable sort keeps the
# loaded order for rows sharing a date. transform() returns a Series carrying
# the original index labels, so the assignment realigns it with batting_df.
batting_chronological = batting_df.sort_values('date', kind='stable')
# shift(1) moves each rolling mean down one row, so a row's form value is built
# from that player's earlier rows and excludes the row's own runs.
batting_df['form_avg_last_10'] = batting_chronological.groupby('player')['runs'].transform(
    lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
)
# A player's first row has no history; treat that as a form of 0.
batting_df['form_avg_last_10'] = batting_df['form_avg_last_10'].fillna(0)

# --- Create Contextual Features --- #
# Map every match_id (across all formats) to the list of distinct teams that
# appear for it in the batting table.
match_teams = batting_df_full.groupby('match_id')['team'].unique().apply(list).to_dict()
def find_opposition(row):
    """Return the team opposing ``row['team']`` in the match ``row['match_id']``.

    Looks the match up in the module-level ``match_teams`` mapping. When the
    match is unknown, or does not list exactly two teams, the placeholder
    string ``'N/A'`` is returned. If ``row['team']`` is not the first listed
    team, the first listed team is returned.
    """
    pair = match_teams.get(row['match_id'])
    # Guard clause: missing/empty entry or anything other than two teams.
    if not pair or len(pair) != 2:
        return 'N/A'
    first_team, second_team = pair
    return second_team if first_team == row['team'] else first_team
# Resolve the opposing side row by row and store it as the `against_team` feature.
opposition_by_row = batting_df.apply(find_opposition, axis=1)
batting_df['against_team'] = opposition_by_row

print("Feature engineering complete.")

### Step 4: Prepare Batting Data for Modeling

In [None]:
# Discard rows that lack the target bin or any categorical input.
# NOTE: find_opposition marks an unknown opposition with the string 'N/A',
# which dropna does not treat as missing, so those rows are kept.
required_batting_columns = ['runs_bin', 'player', 'venue', 'against_team']
batting_df.dropna(subset=required_batting_columns, inplace=True)

# Column roles for the batting model.
target = 'runs_bin'
categorical_features = ['player', 'venue', 'against_team']
numerical_features = ['career_avg', 'career_sr', 'career_innings', 'form_avg_last_10']

# One-hot encode the categorical columns, then place the numeric columns
# alongside them to form the feature matrix.
batting_dummies = pd.get_dummies(batting_df[categorical_features], columns=categorical_features)
batting_numeric = batting_df[numerical_features]
X = pd.concat([batting_dummies, batting_numeric], axis=1)
y = batting_df[target]

print("Batting data prepared.")
print(f"Final shape of {MATCH_FORMAT_TO_TRAIN} batting feature matrix (X):", X.shape)

### Step 5: Train & Evaluate the Batting Classifier (with SMOTE)

In [None]:
# Hold out 20% of the rows for evaluation; stratify keeps the class proportions
# of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Original training set shape: {X_train.shape}")
print(f"Original training class distribution:\n{y_train.value_counts(normalize=True)}")

# SMOTE is fitted on the training split only, so the held-out test rows stay
# untouched by the oversampling.
print("\nApplying SMOTE to balance the training data... This may take a moment.")
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print(f"Resampled training set shape: {X_train_res.shape}")
print(f"Resampled training class distribution:\n{y_train_res.value_counts(normalize=True)}")

# Leave one core free for the rest of the system. os.cpu_count() returns None
# when the core count cannot be determined, so fall back to 1 before
# subtracting instead of raising a TypeError.
n_jobs = max(1, (os.cpu_count() or 1) - 1)
batting_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=n_jobs, verbose=2)

print(f"\nTraining the {MATCH_FORMAT_TO_TRAIN} batting classifier on the balanced dataset...")
batting_classifier.fit(X_train_res, y_train_res)
print("Training complete.")

# Evaluate on the original (non-resampled) test split.
predictions = batting_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"\n--- {MATCH_FORMAT_TO_TRAIN} Batting Model Evaluation ---")
print(f"Overall Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
# zero_division=0 reports 0 (without a warning) for classes that were never predicted.
print(classification_report(y_test, predictions, zero_division=0))

### Step 6: Save the Trained Batting Classifier

In [None]:
# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
os.makedirs('../models', exist_ok=True)

# File names are keyed by the lower-cased format, e.g. "t20_batting_classifier.joblib".
model_filename = f'../models/{MATCH_FORMAT_TO_TRAIN.lower()}_batting_classifier.joblib'
columns_filename = f'../models/{MATCH_FORMAT_TO_TRAIN.lower()}_batting_columns.joblib'

# Persist the fitted model together with the exact feature-column order of X.
joblib.dump(batting_classifier, model_filename)
joblib.dump(X.columns, columns_filename)

print(f"{MATCH_FORMAT_TO_TRAIN} batting classifier and columns saved successfully.")

---

## Part B: Training the Bowling Classification Model

### Step 7: Feature Engineering for Bowling

In [None]:
print("\nCreating features for bowling data...")
# Career bowling average (runs per wicket) and strike rate (balls per wicket).
# np.where substitutes 0 for bowlers with no career wickets, avoiding a
# divide-by-zero result.
bowling_df['career_bowling_avg'] = np.where(bowling_df['career_wickets'] > 0, bowling_df['career_runs_conceded'] / bowling_df['career_wickets'], 0)
bowling_df['career_bowling_sr'] = np.where(bowling_df['career_wickets'] > 0, bowling_df['career_balls_bowled'] / bowling_df['career_wickets'], 0)
# The rolling window follows row order, and the rows come from a SELECT with no
# ORDER BY, so order them chronologically first. The stable sort keeps the
# loaded order for rows sharing a date. transform() returns a Series carrying
# the original index labels, so the assignment realigns it with bowling_df.
bowling_chronological = bowling_df.sort_values('date', kind='stable')
# shift(1) makes each row's form value depend only on that player's earlier rows.
bowling_df['form_wickets_last_10'] = bowling_chronological.groupby('player')['wickets'].transform(
    lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
)
# A player's first row has no history; treat that as a form of 0.
bowling_df['form_wickets_last_10'] = bowling_df['form_wickets_last_10'].fillna(0)
# Reuses find_opposition (and its match_teams lookup) from the batting section.
bowling_df['against_team'] = bowling_df.apply(find_opposition, axis=1)
print("Feature engineering complete.")

### Step 8: Prepare Bowling Data for Modeling

In [None]:
# Discard rows that lack the target bin or any categorical input.
required_bowling_columns = ['wickets_bin', 'player', 'venue', 'against_team']
bowling_df.dropna(subset=required_bowling_columns, inplace=True)

# Column roles for the bowling model.
target_bowling = 'wickets_bin'
categorical_features_bowling = ['player', 'venue', 'against_team']
numerical_features_bowling = ['career_bowling_avg', 'career_bowling_sr', 'form_wickets_last_10']

# One-hot encode the categorical columns, then place the numeric columns
# alongside them to form the feature matrix.
bowling_dummies = pd.get_dummies(bowling_df[categorical_features_bowling], columns=categorical_features_bowling)
bowling_numeric = bowling_df[numerical_features_bowling]
X_bowling = pd.concat([bowling_dummies, bowling_numeric], axis=1)
y_bowling = bowling_df[target_bowling]

print("Bowling data prepared.")
print(f"Final shape of {MATCH_FORMAT_TO_TRAIN} bowling feature matrix (X):", X_bowling.shape)

### Step 9: Train & Evaluate the Bowling Classifier (with SMOTE)

In [None]:
# Hold out 20% of the rows for evaluation; stratify keeps the class proportions
# of the target the same in both splits.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_bowling, y_bowling, test_size=0.2, random_state=42, stratify=y_bowling)

print(f"Original training set shape: {X_train_b.shape}")
print(f"Original training class distribution:\n{y_train_b.value_counts(normalize=True)}")

# SMOTE is fitted on the training split only, so the held-out test rows stay
# untouched by the oversampling.
print("\nApplying SMOTE to balance the training data...")
smote_b = SMOTE(random_state=42)
X_train_res_b, y_train_res_b = smote_b.fit_resample(X_train_b, y_train_b)
print(f"Resampled training set shape: {X_train_res_b.shape}")
print(f"Resampled training class distribution:\n{y_train_res_b.value_counts(normalize=True)}")

# Leave one core free for the rest of the system. os.cpu_count() returns None
# when the core count cannot be determined, so fall back to 1 before
# subtracting instead of raising a TypeError.
n_jobs = max(1, (os.cpu_count() or 1) - 1)
bowling_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=n_jobs, verbose=2)

print(f"\nTraining the {MATCH_FORMAT_TO_TRAIN} bowling classifier...")
bowling_classifier.fit(X_train_res_b, y_train_res_b)
print("Training complete.")

# Evaluate on the original (non-resampled) test split.
predictions_b = bowling_classifier.predict(X_test_b)
accuracy_b = accuracy_score(y_test_b, predictions_b)

print(f"\n--- {MATCH_FORMAT_TO_TRAIN} Bowling Model Evaluation ---")
print(f"Overall Accuracy: {accuracy_b * 100:.2f}%")
print("\nClassification Report:")
# zero_division=0 reports 0 (without a warning) for classes that were never predicted.
print(classification_report(y_test_b, predictions_b, zero_division=0))

### Step 10: Save the Trained Bowling Classifier

In [None]:
# Make sure the output directory exists so this cell also works when the
# batting save step has not been run in this session.
os.makedirs('../models', exist_ok=True)

# File names are keyed by the lower-cased format, e.g. "t20_bowling_classifier.joblib".
model_filename_bowling = f'../models/{MATCH_FORMAT_TO_TRAIN.lower()}_bowling_classifier.joblib'
columns_filename_bowling = f'../models/{MATCH_FORMAT_TO_TRAIN.lower()}_bowling_columns.joblib'

# Persist the fitted model together with the exact feature-column order of X_bowling.
joblib.dump(bowling_classifier, model_filename_bowling)
joblib.dump(X_bowling.columns, columns_filename_bowling)

print(f"{MATCH_FORMAT_TO_TRAIN} bowling classifier and columns saved successfully.")