# Player Performance Predictor - Final Model Training

**Goal:** Build our definitive, high-accuracy, format-specific **classification** models based on our final agreed-upon strategy.

### Step 1: Configuration - CHOOSE THE FORMAT TO TRAIN

**Action:** Set the `MATCH_FORMAT_TO_TRAIN` variable below to one of 'T20', 'ODI', or 'Test', then run the entire notebook (`Kernel > Restart & Run All`).

In [None]:
# Match format whose models this notebook run will train.
MATCH_FORMAT_TO_TRAIN = "T20" # <-- CHANGE THIS VALUE (e.g., "ODI" or "Test")

# Fail fast on a typo or wrong casing: later cells compare against these exact
# strings, and an unrecognised value would match neither the 'T20' nor the
# 'ODI' branch and therefore be binned as if it were Test.
SUPPORTED_MATCH_FORMATS = ("T20", "ODI", "Test")
if MATCH_FORMAT_TO_TRAIN not in SUPPORTED_MATCH_FORMATS:
    raise ValueError(
        f"Unsupported MATCH_FORMAT_TO_TRAIN {MATCH_FORMAT_TO_TRAIN!r}; "
        f"expected one of {SUPPORTED_MATCH_FORMATS}"
    )

print(f"Configuration set to train models for: {MATCH_FORMAT_TO_TRAIN}")

### Step 2: Load Data & Define Custom Bins

In [None]:
import pandas as pd
import numpy as np
import sqlite3
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# --- Load the new, enriched database ---
db_path = '../cricket_final_data.db'

# sqlite3.connect() creates a new, empty database file when the path does not
# exist, which would surface later as a confusing "no such table" error.
# Check up front so a wrong path is reported as such.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"Database file not found: {db_path}")

conn = sqlite3.connect(db_path)
try:
    batting_df_full = pd.read_sql_query("SELECT * FROM batting_stats", conn)
    bowling_df_full = pd.read_sql_query("SELECT * FROM bowling_stats", conn)
finally:
    # Release the SQLite handle even if one of the queries raises.
    conn.close()

# Filter for the chosen format; .copy() gives independent frames so the
# column assignments below do not touch the *_full frames.
batting_df = batting_df_full[batting_df_full['match_type'] == MATCH_FORMAT_TO_TRAIN].copy()
bowling_df = bowling_df_full[bowling_df_full['match_type'] == MATCH_FORMAT_TO_TRAIN].copy()
batting_df['date'] = pd.to_datetime(batting_df['date'])
bowling_df['date'] = pd.to_datetime(bowling_df['date'])
print(f"Data loaded for {MATCH_FORMAT_TO_TRAIN}.")

# --- Define Custom, Format-Specific Bins --- #
# Each entry is (run bin edges, run labels, wicket bin edges, wicket labels).
# pd.cut builds right-inclusive intervals by default, so the leading -1 edge
# places a value of 0 in the first bin.
_TEST_BINS = (
    [-1, 24, 49, 99, 999], ['0-24', '25-49', '50-99', '100+'],
    [-1, 2, 4, 99], ['0-2', '3-4', '5+'],
)
_BINS_BY_FORMAT = {
    'T20': (
        [-1, 15, 29, 49, 999], ['0-15', '16-29', '30-49', '50+'],
        [-1, 1, 3, 99], ['0-1', '2-3', '4+'],
    ),
    'ODI': (
        [-1, 24, 49, 99, 999], ['0-24', '25-49', '50-99', '100+'],
        [-1, 1, 3, 99], ['0-1', '2-3', '4+'],
    ),
}
# Any format other than 'T20' or 'ODI' (i.e. Test) uses the Test bins.
run_bins, run_labels, wicket_bins, wicket_labels = _BINS_BY_FORMAT.get(
    MATCH_FORMAT_TO_TRAIN, _TEST_BINS
)

# Bucket the raw runs / wickets into the labelled ranges that serve as
# the classification targets.
batting_runs_binned = pd.cut(batting_df['runs'], run_bins, labels=run_labels)
bowling_wickets_binned = pd.cut(bowling_df['wickets'], wicket_bins, labels=wicket_labels)
batting_df['runs_bin'] = batting_runs_binned
bowling_df['wickets_bin'] = bowling_wickets_binned
print("Custom performance bins defined and applied.")

## Part A: Training the Batting Classification Model

### Step 3: Feature Engineering for Batting

In [None]:
print("Creating features for batting data...")
# --- Calculate Career Averages & Strike Rates --- #
# np.where substitutes 0 where the denominator is not positive, so rows with
# no recorded innings / balls faced get 0 instead of inf or NaN.
batting_df['career_avg'] = np.where(batting_df['career_innings'] > 0, batting_df['career_runs'] / batting_df['career_innings'], 0)
batting_df['career_sr'] = np.where(batting_df['career_balls_faced'] > 0, (batting_df['career_runs'] / batting_df['career_balls_faced']) * 100, 0)

# --- Calculate 10-Inning Form --- #
# A rolling "form" value is only meaningful in chronological order, and the
# rows come from a SELECT without ORDER BY, so order the innings by date
# before rolling. mergesort is stable: innings sharing a date keep their
# stored relative order. The transform result carries the original index, so
# the assignment aligns it back onto batting_df without reordering the frame.
innings_by_date = batting_df.sort_values('date', kind='mergesort')
batting_df['form_avg_last_10'] = (
    innings_by_date.groupby('player')['runs']
    .transform(lambda runs: runs.rolling(window=10, min_periods=1).mean().shift(1))
)
# shift(1) excludes the current innings from its own form value; a player's
# first innings therefore has no history and is filled with 0.
batting_df['form_avg_last_10'] = batting_df['form_avg_last_10'].fillna(0)

# --- Create Contextual Features --- #
# Map every match_id to the list of distinct teams that appear in it
# (built from the unfiltered batting data).
match_teams = (
    batting_df_full.groupby('match_id')['team']
    .unique()
    .apply(list)
    .to_dict()
)


def find_opposition(row):
    """Return the other team in the row's match.

    Returns the string 'N/A' when the match is unknown or does not list
    exactly two teams.
    """
    sides = match_teams.get(row['match_id'])
    if not sides or len(sides) != 2:
        return 'N/A'
    first_side, second_side = sides
    if first_side == row['team']:
        return second_side
    return first_side


batting_df['against_team'] = batting_df.apply(find_opposition, axis=1)

print("Feature engineering complete.")

### Step 4: Prepare Batting Data for Modeling

In [None]:
# Discard rows that lack a target bin or any of the categorical inputs.
# NOTE(review): rows whose opposition could not be determined hold the string
# 'N/A' rather than NaN, so they are kept and encoded as their own category.
required_columns = ['runs_bin', 'player', 'venue', 'against_team']
batting_df.dropna(subset=required_columns, inplace=True)

# Model inputs and the label column
categorical_features = ['player', 'venue', 'against_team']
numerical_features = ['career_avg', 'career_sr', 'career_innings', 'form_avg_last_10']
target = 'runs_bin'

# One-hot encode the categorical columns, then place the numeric columns
# alongside them to form the feature matrix.
encoded_categoricals = pd.get_dummies(
    batting_df[categorical_features],
    columns=categorical_features,
)
numeric_block = batting_df[numerical_features]
X = pd.concat([encoded_categoricals, numeric_block], axis=1)
y = batting_df[target]

print("Batting data prepared.")
print(f"Final shape of {MATCH_FORMAT_TO_TRAIN} batting feature matrix (X):", X.shape)

### Step 5: Train & Evaluate the Batting Classifier

In [None]:
# Hold out 20% of the rows for evaluation; stratify=y keeps the proportion of
# each runs bin the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Use one less than the total number of CPU cores.
# os.cpu_count() returns None when the count cannot be determined; fall back
# to a single core instead of failing on `None - 1`.
n_jobs = max(1, (os.cpu_count() or 1) - 1)

batting_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=n_jobs, verbose=2)

print(f"Training the {MATCH_FORMAT_TO_TRAIN} batting classifier using {n_jobs} cores...")
batting_classifier.fit(X_train, y_train)
print("Training complete.")

# Evaluation on the held-out split
predictions = batting_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"\n--- {MATCH_FORMAT_TO_TRAIN} Batting Model Evaluation ---")
print(f"Overall Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
# zero_division=0 reports 0 (without a warning) for bins that were never predicted
print(classification_report(y_test, predictions, zero_division=0))

### Step 6: Save the Trained Batting Classifier

In [None]:
# Create the output directory if needed. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# File names are prefixed with the lower-cased format, e.g. 't20_...'
format_prefix = MATCH_FORMAT_TO_TRAIN.lower()
model_filename = f'{models_dir}/{format_prefix}_batting_classifier.joblib'
columns_filename = f'{models_dir}/{format_prefix}_batting_columns.joblib'

joblib.dump(batting_classifier, model_filename)
# Persist the training column index alongside the model so the one-hot
# feature layout used for fitting can be reproduced later.
joblib.dump(X.columns, columns_filename)

print(f"{MATCH_FORMAT_TO_TRAIN} batting classifier and columns saved successfully.")

## (Optional) Part B: Repeat Steps 3-6 for the bowling model, using `bowling_df` and the `wickets_bin` target.