In [1]:
import pandas as pd

df = pd.read_csv("final_dataset_with_pitch.csv")

print("Unique Batters:")
print(df['batter'].unique()[:20])  # First 20 batters

print("\nUnique Bowlers:")
print(df['bowler'].unique()[:20])  # First 20 bowlers

print("\nUnique Teams:")
print(df['batting_team'].unique())

print("\nUnique Venues:")
print(df['venue'].unique())

print("\nUnique Pitch Types:")
print(df['pitch_type'].unique())

Unique Batters:
['SC Ganguly' 'BB McCullum' 'RT Ponting' 'DJ Hussey' 'Mohammad Hafeez'
 'R Dravid' 'W Jaffer' 'V Kohli' 'JH Kallis' 'CL White' 'MV Boucher'
 'B Akhil' 'AA Noffke' 'P Kumar' 'Z Khan' 'SB Joshi' 'PA Patel'
 'ML Hayden' 'MEK Hussey' 'MS Dhoni']

Unique Bowlers:
['P Kumar' 'Z Khan' 'AA Noffke' 'JH Kallis' 'SB Joshi' 'CL White'
 'AB Dinda' 'I Sharma' 'AB Agarkar' 'SC Ganguly' 'LR Shukla' 'B Lee'
 'S Sreesanth' 'JR Hopes' 'IK Pathan' 'K Goel' 'PP Chawla' 'WA Mota'
 'JDP Oram' 'MS Gony']

Unique Teams:
['Kolkata Knight Riders' 'Royal Challengers Bangalore'
 'Chennai Super Kings' 'Kings XI Punjab' 'Rajasthan Royals'
 'Delhi Daredevils' 'Mumbai Indians' 'Deccan Chargers'
 'Kochi Tuskers Kerala' 'Pune Warriors' 'Sunrisers Hyderabad'
 'Rising Pune Supergiants' 'Gujarat Lions' 'Rising Pune Supergiant'
 'Delhi Capitals' 'Punjab Kings' 'Lucknow Super Giants' 'Gujarat Titans'
 'Royal Challengers Bengaluru']

Unique Venues:
['M Chinnaswamy Stadium' 'Punjab Cricket Association Stadium, 

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, regularizers
from tensorflow.keras.losses import Huber
import joblib

# Load dataset
df = pd.read_csv("final_dataset_with_pitch.csv")

## ENHANCED FEATURE ENGINEERING (Keep your original logic)
df = df.sort_values(by=['match_id', 'inning', 'over', 'ball'])

# Basic cumulative features
df['cumulative_runs'] = df.groupby(['match_id', 'inning'])['total_runs'].cumsum()
df['cumulative_wickets'] = df.groupby(['match_id', 'inning'])['is_wicket'].cumsum()

# Enhanced features with temporal context
df['run_rate'] = df['cumulative_runs'] / (df['over'] + 0.1)
df['strike_rate'] = df.groupby(['match_id', 'inning', 'batter'])['total_runs'].cumsum() / \
                   df.groupby(['match_id', 'inning', 'batter']).cumcount().add(1)

# Player performance metrics with recent form
for player_col in ['batter', 'bowler']:
    df[f'{player_col}_avg'] = df.groupby(player_col)['total_runs'].transform(
        lambda x: x.expanding().mean().shift(1))
    df[f'{player_col}_last5'] = df.groupby(player_col)['total_runs'].transform(
        lambda x: x.rolling(30).mean())

# Partnership dynamics
df['partnership_runs'] = df.groupby(['match_id', 'inning', 'batter', 'non_striker'])['total_runs'].cumsum()
df['partnership_balls'] = df.groupby(['match_id', 'inning', 'batter', 'non_striker']).cumcount() + 1
df['partnership_momentum'] = df['partnership_runs'] / df['partnership_balls']

# Bowler fatigue and recent performance
df['bowler_balls_bowled'] = df.groupby(['match_id', 'inning', 'bowler']).cumcount() + 1
df['bowler_recent_economy'] = df.groupby(['match_id', 'inning', 'bowler'])['total_runs'].rolling(12, min_periods=1).mean().reset_index(level=[0,1,2], drop=True)

# Match phase and pressure indicators
def get_phase(over):
    if over <= 6: return 'Powerplay'
    elif over <= 10: return 'Middle1'
    elif over <= 15: return 'Middle2'
    return 'Death'

df['phase'] = df['over'].apply(get_phase)
df['is_death_over'] = (df['over'] >= 16).astype(int)
df['runs_last_5_overs'] = df.groupby(['match_id', 'inning'])['total_runs'].rolling(30, min_periods=1).sum().reset_index(level=[0,1], drop=True)
df['wickets_last_5_overs'] = df.groupby(['match_id', 'inning'])['is_wicket'].rolling(30, min_periods=1).sum().reset_index(level=[0,1], drop=True)

# IMPROVED TARGET ENGINEERING for next over (keep your original)
def calculate_next_over_runs_improved(group):
    group = group.sort_values(['over', 'ball'])
    over_runs = []
    current_over = -1
    current_runs = 0
    
    for idx, row in group.iterrows():
        if row['over'] != current_over:
            if current_over >= 0:
                over_runs.append((current_over, current_runs))
            current_over = row['over']
            current_runs = row['total_runs']
        else:
            current_runs += row['total_runs']
    
    if current_over >= 0:
        over_runs.append((current_over, current_runs))
    
    over_to_runs = dict(over_runs)
    group['next_over_runs'] = group['over'].apply(
        lambda x: over_to_runs.get(x + 1, np.nan)
    )
    
    return group

df = df.groupby(['match_id', 'inning'], group_keys=False).apply(calculate_next_over_runs_improved)

# 🔧 FIXED: No data leakage for final score prediction
print("🔧 Creating proper final score targets without data leakage...")

# Calculate actual final scores for each match-inning
actual_final_scores = df.groupby(['match_id', 'inning'])['total_runs'].sum().reset_index()
actual_final_scores.columns = ['match_id', 'inning', 'actual_final_score']

# Merge to get the actual final score
df = pd.merge(df, actual_final_scores, on=['match_id', 'inning'])

# 🎯 KEY FIX: Create target as REMAINING RUNS instead of final score
df['remaining_runs'] = df['actual_final_score'] - df['cumulative_runs']

print(f"✅ Remaining runs range: {df['remaining_runs'].min():.1f} to {df['remaining_runs'].max():.1f}")
print(f"✅ Average remaining runs: {df['remaining_runs'].mean():.1f}")

# Filter out completed innings (remaining_runs <= 0)
df = df[df['remaining_runs'] > 0]
print(f"✅ Samples after filtering: {len(df)}")

# Match state features (keep your original logic)
df['wickets_remaining'] = 10 - df['cumulative_wickets']
df['resources_remaining'] = (20 - df['over']) * (df['wickets_remaining'] / 10)

# ⚠️ COMPLETELY REMOVE FEATURES THAT CAUSE DATA LEAKAGE
df['overs_remaining'] = 20 - df['over']

# 🚫 DO NOT calculate required_run_rate - it causes data leakage
# 🚫 DO NOT calculate run_rate_delta - it depends on required_run_rate
# These features will be excluded from training

# Additional features for next over prediction (keep original)
df['balls_faced_current_over'] = df.groupby(['match_id', 'inning', 'over']).cumcount() + 1
df['runs_current_over'] = df.groupby(['match_id', 'inning', 'over'])['total_runs'].cumsum()
df['current_over_rate'] = df['runs_current_over'] / df['balls_faced_current_over']

# Historical over performance by phase
df['phase_avg_runs'] = df.groupby(['phase'])['total_runs'].transform('mean')

# DATA CLEANING (keep your original approach)
df.fillna({
    'batter_avg': df['total_runs'].mean(),
    'bowler_avg': df['total_runs'].mean(),
    'next_over_runs': df.groupby('phase')['total_runs'].transform('mean'),
    'bowler_recent_economy': df['total_runs'].mean(),
    'batter_last5': df['total_runs'].mean(),
    'bowler_last5': df['total_runs'].mean()
}, inplace=True)

# Remove outliers for remaining runs
q1 = df['remaining_runs'].quantile(0.01)
q99 = df['remaining_runs'].quantile(0.99)
df = df[(df['remaining_runs'] >= q1) & (df['remaining_runs'] <= q99)]

print(f"✅ Final samples: {len(df)}")

# More conservative outlier removal for next over (keep original)
next_over_valid = df['next_over_runs'].notna()
if next_over_valid.sum() > 0:
    q1_over = df.loc[next_over_valid, 'next_over_runs'].quantile(0.05)
    q95_over = df.loc[next_over_valid, 'next_over_runs'].quantile(0.95)
    df = df[(df['next_over_runs'] >= q1_over) & (df['next_over_runs'] <= q95_over)]

# FEATURE SELECTION - REMOVED DATA LEAKAGE FEATURES
features = [
    'venue', 'batting_team', 'bowling_team', 'batter', 'bowler',
    'over', 'cumulative_runs', 'cumulative_wickets', 'phase', 'pitch_type',
    'run_rate', 'strike_rate', 'batter_avg', 'bowler_avg',
    'partnership_runs', 'partnership_balls', 'bowler_balls_bowled',
    'bowler_recent_economy', 'wickets_remaining', 'resources_remaining',
    'is_death_over', 'runs_last_5_overs', 'wickets_last_5_overs',
    # 🚫 REMOVED: 'required_run_rate', 'run_rate_delta' - cause data leakage
    'partnership_momentum',
    'batter_last5', 'bowler_last5', 'balls_faced_current_over',
    'runs_current_over', 'current_over_rate', 'phase_avg_runs',
    'overs_remaining'
]

X = df[features]
y_remaining = df['remaining_runs']  # 🎯 NEW TARGET: remaining runs
y_next_over = df['next_over_runs']

# Remove rows where next_over_runs is NaN
valid_indices = y_next_over.notna()
X = X[valid_indices]
y_remaining = y_remaining[valid_indices]
y_next_over = y_next_over[valid_indices]

print(f"✅ Final feature matrix: {X.shape}")
print(f"✅ Remaining runs - Mean: {y_remaining.mean():.1f}, Std: {y_remaining.std():.1f}")

# PREPROCESSING (keep original)
categorical_features = ['venue', 'batting_team', 'bowling_team', 'batter', 'bowler', 'phase', 'pitch_type']
numeric_features = [f for f in features if f not in categorical_features]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

# TRAIN-TEST SPLIT
X_train, X_test, y_rem_train, y_rem_test, y_next_train, y_next_test = train_test_split(
    X, y_remaining, y_next_over, test_size=0.2, random_state=42, stratify=df.loc[valid_indices, 'phase']
)

# Fit preprocessor
preprocessor.fit(X_train)
X_train_proc = preprocessor.transform(X_train)
X_test_proc = preprocessor.transform(X_test)

print(f"✅ Processed features: {X_train_proc.shape}")

# 🎯 REMAINING RUNS MODEL (replaces the final score model)
def build_remaining_runs_model(input_shape):
    """Model to predict remaining runs - same architecture as your successful model"""
    inputs = tf.keras.Input(shape=(input_shape,))
    
    # Use your successful architecture but for remaining runs
    x = layers.Dense(512, activation='swish', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)
    
    x = layers.Dense(256, activation='swish')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    
    x = layers.Dense(128, activation='swish')(x)
    outputs = layers.Dense(1)(x)  # Can predict any positive remaining runs
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
        loss=Huber(),  # Same as your successful model
        metrics=['mae', 'mse']
    )
    return model

# NEXT OVER MODEL (keep your improved version)
def build_next_over_model_improved(input_shape):
    inputs = tf.keras.Input(shape=(input_shape,))
    
    x = layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    
    x = layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = layers.Dropout(0.2)(x)
    
    x = layers.Dense(32, activation='relu')(x)
    outputs = layers.Dense(1, activation='relu')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mae',
        metrics=['mae', 'mse']
    )
    return model

# TRAINING CALLBACKS (keep your successful approach)
early_stopping_original = callbacks.EarlyStopping(
    patience=15,
    monitor='val_mae',
    restore_best_weights=True
)

reduce_lr_original = callbacks.ReduceLROnPlateau(
    factor=0.5,
    patience=7,
    min_lr=1e-6
)

class DynamicWeightAdjuster(callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs.get('val_mae') > logs.get('mae') * 1.1:
            lr = tf.keras.backend.get_value(self.model.optimizer.lr)
            tf.keras.backend.set_value(self.model.optimizer.lr, lr * 0.9)

# 🚀 TRAIN REMAINING RUNS MODEL
print("🚀 Training Remaining Runs Model (Fixed Data Leakage)...")
remaining_model = build_remaining_runs_model(X_train_proc.shape[1])

remaining_history = remaining_model.fit(
    X_train_proc, y_rem_train,
    epochs=200,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping_original, reduce_lr_original, DynamicWeightAdjuster()],
    verbose=1
)

# 🚀 TRAIN NEXT OVER MODEL
print("🚀 Training Next Over Model...")
over_model = build_next_over_model_improved(X_train_proc.shape[1])

early_stopping_conservative = callbacks.EarlyStopping(
    patience=20,
    monitor='val_mae',
    restore_best_weights=True,
    min_delta=0.01
)

reduce_lr_conservative = callbacks.ReduceLROnPlateau(
    factor=0.7,
    patience=10,
    min_lr=1e-5,
    min_delta=0.01
)

over_history = over_model.fit(
    X_train_proc, y_next_train,
    epochs=100,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping_conservative, reduce_lr_conservative],
    verbose=1
)

# POST-PROCESSING (keep your original)
def postprocess_next_over_predictions(preds, context_data):
    processed_preds = []
    
    for i, pred in enumerate(preds):
        pred_val = pred[0] if hasattr(pred, '__len__') else pred
        
        over = context_data.iloc[i]['over'] if 'over' in context_data.columns else 10
        wickets = context_data.iloc[i]['cumulative_wickets'] if 'cumulative_wickets' in context_data.columns else 2
        phase = context_data.iloc[i]['phase'] if 'phase' in context_data.columns else 'Middle1'
        
        if phase == 'Powerplay':
            max_realistic = 18
        elif phase == 'Death':
            max_realistic = 20
        else:
            max_realistic = 15
        
        if wickets >= 8:
            pred_val *= 0.8
        elif wickets >= 6:
            pred_val *= 0.9
        
        pred_val = max(0, min(pred_val, max_realistic))
        processed_preds.append(pred_val)
    
    return np.array(processed_preds)

# EVALUATION
def evaluate_model(model, X_test, y_test, model_name, X_test_raw=None):
    y_pred = model.predict(X_test, verbose=0)
    
    if model_name == "Next Over Runs Model" and X_test_raw is not None:
        y_pred = postprocess_next_over_predictions(y_pred, X_test_raw)
        y_pred = y_pred.reshape(-1, 1)
    
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\n✅ {model_name} Evaluation:")
    print(f"   MAE: {mae:.2f}")
    print(f"   R² Score: {r2:.3f}")
    
    if model_name == "Remaining Runs Model":
        print(f"   Mean Actual Remaining: {np.mean(y_test):.1f}")
        print(f"   Mean Predicted Remaining: {np.mean(y_pred):.1f}")
        
        # Test sample predictions
        print(f"\n🏏 Sample Remaining Runs Predictions:")
        for i in range(min(5, len(y_test))):
            actual = y_test.iloc[i] if hasattr(y_test, 'iloc') else y_test[i]
            predicted = y_pred[i][0] if hasattr(y_pred[i], '__len__') else y_pred[i]
            print(f"   Actual: {actual:.1f}, Predicted: {predicted:.1f}")
    
    return mae, r2

print("\n" + "="*50)
print("MODEL EVALUATION")
print("="*50)

remaining_mae, remaining_r2 = evaluate_model(remaining_model, X_test_proc, y_rem_test, "Remaining Runs Model")
over_mae, over_r2 = evaluate_model(over_model, X_test_proc, y_next_test, "Next Over Runs Model", X_test)

# SAVE MODELS WITH NEW NAMES
remaining_model.save("enhanced_remaining_runs_model.keras")  # New model
over_model.save("enhanced_next_over_model.keras")
joblib.dump(preprocessor, "enhanced_preprocessor.pkl")

print(f"\n✅ Models saved successfully!")
print(f"📋 Summary:")
print(f"   • Remaining Runs MAE: {remaining_mae:.2f} (target: <15)")
print(f"   • Remaining Runs R²: {remaining_r2:.3f} (target: >0.8)")
print(f"   • Next Over MAE: {over_mae:.2f} (target: <3.0)")

# 🔍 ENHANCED Data Leakage Check - More Comprehensive
print(f"\n🔍 Comprehensive Data Leakage Check:")

# Test multiple samples
test_samples = min(5, len(X_test))
for i in range(test_samples):
    test_sample = X_test.iloc[i:i+1]
    test_remaining = y_rem_test.iloc[i] if hasattr(y_rem_test, 'iloc') else y_rem_test[i]
    test_current = test_sample['cumulative_runs'].iloc[0]
    
    predicted_remaining = remaining_model.predict(preprocessor.transform(test_sample), verbose=0)[0][0]
    predicted_final = test_current + predicted_remaining
    actual_final = test_current + test_remaining
    
    error_remaining = abs(predicted_remaining - test_remaining)
    error_vs_current = abs(predicted_final - test_current)
    
    print(f"   Sample {i+1}:")
    print(f"     Current: {test_current:.1f}, Actual remaining: {test_remaining:.1f}")
    print(f"     Predicted remaining: {predicted_remaining:.1f} (error: {error_remaining:.1f})")
    print(f"     Final: {predicted_final:.1f} vs actual {actual_final:.1f}")

# Overall leakage detection
all_predictions = remaining_model.predict(X_test_proc, verbose=0).flatten()
remaining_errors = np.abs(all_predictions - y_rem_test)
mean_error = np.mean(remaining_errors)

print(f"\n📊 Overall Statistics:")
print(f"   Mean error in remaining runs: {mean_error:.2f}")
print(f"   Mean remaining runs: {np.mean(y_rem_test):.1f}")
print(f"   Error as % of mean remaining: {(mean_error/np.mean(y_rem_test)*100):.1f}%")

if mean_error < 5.0 and remaining_mae < 2.0:
    print("   ⚠️  LIKELY DATA LEAKAGE - Errors too small for cricket prediction")
    print("   🔧 Check your feature engineering for any future information")
elif mean_error < 15.0:
    print("   ✅ Model seems reasonable but monitor for leakage")
else:
    print("   ✅ No data leakage detected - model has realistic errors")

print("\n🎉 Training completed!")
print("💡 If data leakage is detected, review all feature calculations carefully.")

  df = df.groupby(['match_id', 'inning'], group_keys=False).apply(calculate_next_over_runs_improved)


🔧 Creating proper final score targets without data leakage...
✅ Remaining runs range: 0.0 to 287.0
✅ Average remaining runs: 84.4
✅ Samples after filtering: 257725
✅ Final samples: 253713
✅ Final feature matrix: (235954, 31)
✅ Remaining runs - Mean: 84.7, Std: 48.0
✅ Processed features: (188763, 1295)
🚀 Training Remaining Runs Model (Fixed Data Leakage)...
Epoch 1/200
[1m2360/2360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 27ms/step - loss: 46.9637 - mae: 27.9633 - mse: 1741.7047 - val_loss: 18.7947 - val_mae: 16.2857 - val_mse: 481.8070 - learning_rate: 5.0000e-04
Epoch 2/200
[1m2360/2360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 26ms/step - loss: 19.6793 - mae: 17.2188 - mse: 534.5770 - val_loss: 18.5952 - val_mae: 16.3521 - val_mse: 498.9345 - learning_rate: 5.0000e-04
Epoch 3/200
[1m2360/2360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 27ms/step - loss: 19.2247 - mae: 16.9899 - mse: 520.1692 - val_loss: 18.1291 - val_mae: 15.9114 - val_mse: 46

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
from xgboost import XGBRegressor
import joblib

# Load dataset
df = pd.read_csv("final_dataset_with_pitch.csv")

## ENHANCED FEATURE ENGINEERING (Keep your original logic)
df = df.sort_values(by=['match_id', 'inning', 'over', 'ball'])

# Basic cumulative features
df['cumulative_runs'] = df.groupby(['match_id', 'inning'])['total_runs'].cumsum()
df['cumulative_wickets'] = df.groupby(['match_id', 'inning'])['is_wicket'].cumsum()

# Enhanced features with temporal context
df['run_rate'] = df['cumulative_runs'] / (df['over'] + 0.1)
df['strike_rate'] = df.groupby(['match_id', 'inning', 'batter'])['total_runs'].cumsum() / \
                   df.groupby(['match_id', 'inning', 'batter']).cumcount().add(1)

# Player performance metrics with recent form
for player_col in ['batter', 'bowler']:
    df[f'{player_col}_avg'] = df.groupby(player_col)['total_runs'].transform(
        lambda x: x.expanding().mean().shift(1))
    df[f'{player_col}_last5'] = df.groupby(player_col)['total_runs'].transform(
        lambda x: x.rolling(30).mean())

# Partnership dynamics
df['partnership_runs'] = df.groupby(['match_id', 'inning', 'batter', 'non_striker'])['total_runs'].cumsum()
df['partnership_balls'] = df.groupby(['match_id', 'inning', 'batter', 'non_striker']).cumcount() + 1
df['partnership_momentum'] = df['partnership_runs'] / df['partnership_balls']

# Bowler fatigue and recent performance
df['bowler_balls_bowled'] = df.groupby(['match_id', 'inning', 'bowler']).cumcount() + 1
df['bowler_recent_economy'] = df.groupby(['match_id', 'inning', 'bowler'])['total_runs'].rolling(12, min_periods=1).mean().reset_index(level=[0,1,2], drop=True)

# Match phase and pressure indicators
def get_phase(over):
    if over <= 6: return 'Powerplay'
    elif over <= 10: return 'Middle1'
    elif over <= 15: return 'Middle2'
    return 'Death'

df['phase'] = df['over'].apply(get_phase)
df['is_death_over'] = (df['over'] >= 16).astype(int)
df['runs_last_5_overs'] = df.groupby(['match_id', 'inning'])['total_runs'].rolling(30, min_periods=1).sum().reset_index(level=[0,1], drop=True)
df['wickets_last_5_overs'] = df.groupby(['match_id', 'inning'])['is_wicket'].rolling(30, min_periods=1).sum().reset_index(level=[0,1], drop=True)

# IMPROVED TARGET ENGINEERING for next over (keep your original)
def calculate_next_over_runs_improved(group):
    group = group.sort_values(['over', 'ball'])
    over_runs = []
    current_over = -1
    current_runs = 0
    
    for idx, row in group.iterrows():
        if row['over'] != current_over:
            if current_over >= 0:
                over_runs.append((current_over, current_runs))
            current_over = row['over']
            current_runs = row['total_runs']
        else:
            current_runs += row['total_runs']
    
    if current_over >= 0:
        over_runs.append((current_over, current_runs))
    
    over_to_runs = dict(over_runs)
    group['next_over_runs'] = group['over'].apply(
        lambda x: over_to_runs.get(x + 1, np.nan)
    )
    
    return group

df = df.groupby(['match_id', 'inning'], group_keys=False).apply(calculate_next_over_runs_improved)

# 🔧 FIXED: No data leakage for final score prediction
print("🔧 Creating proper final score targets without data leakage...")

# Calculate actual final scores for each match-inning
actual_final_scores = df.groupby(['match_id', 'inning'])['total_runs'].sum().reset_index()
actual_final_scores.columns = ['match_id', 'inning', 'actual_final_score']

# Merge to get the actual final score
df = pd.merge(df, actual_final_scores, on=['match_id', 'inning'])

# 🎯 KEY FIX: Create target as REMAINING RUNS instead of final score
df['remaining_runs'] = df['actual_final_score'] - df['cumulative_runs']

print(f"✅ Remaining runs range: {df['remaining_runs'].min():.1f} to {df['remaining_runs'].max():.1f}")
print(f"✅ Average remaining runs: {df['remaining_runs'].mean():.1f}")

# Filter out completed innings (remaining_runs <= 0)
df = df[df['remaining_runs'] > 0]
print(f"✅ Samples after filtering: {len(df)}")

# Match state features (keep your original logic)
df['wickets_remaining'] = 10 - df['cumulative_wickets']
df['resources_remaining'] = (20 - df['over']) * (df['wickets_remaining'] / 10)

# ⚠️ COMPLETELY REMOVE FEATURES THAT CAUSE DATA LEAKAGE
df['overs_remaining'] = 20 - df['over']

# Additional features for next over prediction (keep original)
df['balls_faced_current_over'] = df.groupby(['match_id', 'inning', 'over']).cumcount() + 1
df['runs_current_over'] = df.groupby(['match_id', 'inning', 'over'])['total_runs'].cumsum()
df['current_over_rate'] = df['runs_current_over'] / df['balls_faced_current_over']

# Historical over performance by phase
df['phase_avg_runs'] = df.groupby(['phase'])['total_runs'].transform('mean')

# DATA CLEANING (keep your original approach)
df.fillna({
    'batter_avg': df['total_runs'].mean(),
    'bowler_avg': df['total_runs'].mean(),
    'next_over_runs': df.groupby('phase')['total_runs'].transform('mean'),
    'bowler_recent_economy': df['total_runs'].mean(),
    'batter_last5': df['total_runs'].mean(),
    'bowler_last5': df['total_runs'].mean()
}, inplace=True)

# Remove outliers for remaining runs
q1 = df['remaining_runs'].quantile(0.01)
q99 = df['remaining_runs'].quantile(0.99)
df = df[(df['remaining_runs'] >= q1) & (df['remaining_runs'] <= q99)]

print(f"✅ Final samples: {len(df)}")

# More conservative outlier removal for next over (keep original)
next_over_valid = df['next_over_runs'].notna()
if next_over_valid.sum() > 0:
    q1_over = df.loc[next_over_valid, 'next_over_runs'].quantile(0.05)
    q95_over = df.loc[next_over_valid, 'next_over_runs'].quantile(0.95)
    df = df[(df['next_over_runs'] >= q1_over) & (df['next_over_runs'] <= q95_over)]

# FEATURE SELECTION - REMOVED DATA LEAKAGE FEATURES
features = [
    'venue', 'batting_team', 'bowling_team', 'batter', 'bowler',
    'over', 'cumulative_runs', 'cumulative_wickets', 'phase', 'pitch_type',
    'run_rate', 'strike_rate', 'batter_avg', 'bowler_avg',
    'partnership_runs', 'partnership_balls', 'bowler_balls_bowled',
    'bowler_recent_economy', 'wickets_remaining', 'resources_remaining',
    'is_death_over', 'runs_last_5_overs', 'wickets_last_5_overs',
    'partnership_momentum',
    'batter_last5', 'bowler_last5', 'balls_faced_current_over',
    'runs_current_over', 'current_over_rate', 'phase_avg_runs',
    'overs_remaining'
]

X = df[features]
y_remaining = df['remaining_runs']  # 🎯 NEW TARGET: remaining runs
y_next_over = df['next_over_runs']

# Remove rows where next_over_runs is NaN
valid_indices = y_next_over.notna()
X = X[valid_indices]
y_remaining = y_remaining[valid_indices]
y_next_over = y_next_over[valid_indices]

print(f"✅ Final feature matrix: {X.shape}")
print(f"✅ Remaining runs - Mean: {y_remaining.mean():.1f}, Std: {y_remaining.std():.1f}")

# PREPROCESSING - MODIFIED FOR XGBOOST
# XGBoost can handle categorical features directly, but let's encode for consistency
categorical_features = ['venue', 'batting_team', 'bowling_team', 'batter', 'bowler', 'phase', 'pitch_type']
numeric_features = [f for f in features if f not in categorical_features]

# For XGBoost, we'll use LabelEncoder instead of OneHotEncoder to reduce dimensionality
from sklearn.preprocessing import LabelEncoder

def create_xgboost_preprocessor():
    """Create a custom preprocessor optimized for XGBoost"""
    label_encoders = {}
    
    def fit_transform(X):
        X_encoded = X.copy()
        
        # Encode categorical features
        for col in categorical_features:
            if col in X_encoded.columns:
                le = LabelEncoder()
                X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
                label_encoders[col] = le
        
        # Scale only numeric features (XGBoost doesn't strictly need this, but can help)
        scaler = StandardScaler()
        X_encoded[numeric_features] = scaler.fit_transform(X_encoded[numeric_features])
        
        return X_encoded, label_encoders, scaler
    
    def transform(X, label_encoders, scaler):
        X_encoded = X.copy()
        
        # Encode categorical features
        for col in categorical_features:
            if col in X_encoded.columns and col in label_encoders:
                # Handle unknown categories
                le = label_encoders[col]
                X_encoded[col] = X_encoded[col].astype(str)
                
                # Map unknown values to a default (most frequent class)
                unknown_mask = ~X_encoded[col].isin(le.classes_)
                if unknown_mask.any():
                    most_frequent = le.classes_[0]  # Use first class as default
                    X_encoded.loc[unknown_mask, col] = most_frequent
                
                X_encoded[col] = le.transform(X_encoded[col])
        
        # Scale numeric features
        X_encoded[numeric_features] = scaler.transform(X_encoded[numeric_features])
        
        return X_encoded
    
    return fit_transform, transform

# TRAIN-TEST SPLIT
X_train, X_test, y_rem_train, y_rem_test, y_next_train, y_next_test = train_test_split(
    X, y_remaining, y_next_over, test_size=0.2, random_state=42, stratify=df.loc[valid_indices, 'phase']
)

# Apply preprocessing
fit_transform_func, transform_func = create_xgboost_preprocessor()
X_train_proc, label_encoders, scaler = fit_transform_func(X_train)
X_test_proc = transform_func(X_test, label_encoders, scaler)

print(f"✅ Processed features for XGBoost: {X_train_proc.shape}")

# 🚀 XGBOOST REMAINING RUNS MODEL
def create_remaining_runs_xgboost():
    """XGBoost model optimized for remaining runs prediction"""
    model = XGBRegressor(
        # Core parameters
        n_estimators=1000,
        max_depth=8,
        learning_rate=0.1,
        
        # Regularization
        reg_alpha=0.1,     # L1 regularization
        reg_lambda=1.0,    # L2 regularization
        gamma=0.1,         # Minimum split loss
        
        # Sampling
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        
        # Performance
        random_state=42,
        n_jobs=-1,
        
        # Early stopping will be handled in fit()
        early_stopping_rounds=50,
        
        # Evaluation metric
        eval_metric='mae'
    )
    return model

# 🚀 XGBOOST NEXT OVER MODEL  
def create_next_over_xgboost():
    """XGBoost model optimized for next over runs prediction"""
    model = XGBRegressor(
        # Smaller model for next over (simpler task)
        n_estimators=500,
        max_depth=6,
        learning_rate=0.15,
        
        # Regularization
        reg_alpha=0.05,
        reg_lambda=0.5,
        gamma=0.05,
        
        # Sampling
        subsample=0.9,
        colsample_bytree=0.9,
        
        # Performance
        random_state=42,
        n_jobs=-1,
        
        # Early stopping
        early_stopping_rounds=30,
        
        # Evaluation metric
        eval_metric='mae'
    )
    return model

# TRAINING REMAINING RUNS MODEL
print("🚀 Training XGBoost Remaining Runs Model...")
remaining_model = create_remaining_runs_xgboost()

# Split training data for validation
X_train_split, X_val_split, y_rem_train_split, y_rem_val_split = train_test_split(
    X_train_proc, y_rem_train, test_size=0.2, random_state=42
)

remaining_model.fit(
    X_train_split, y_rem_train_split,
    eval_set=[(X_val_split, y_rem_val_split)],
    verbose=50  # Print every 50 rounds
)

# TRAINING NEXT OVER MODEL
print("🚀 Training XGBoost Next Over Model...")
over_model = create_next_over_xgboost()

X_train_over_split, X_val_over_split, y_next_train_split, y_next_val_split = train_test_split(
    X_train_proc, y_next_train, test_size=0.2, random_state=42
)

over_model.fit(
    X_train_over_split, y_next_train_split,
    eval_set=[(X_val_over_split, y_next_val_split)],
    verbose=25  # Print every 25 rounds
)

# POST-PROCESSING (keep your original)
def postprocess_next_over_predictions(preds, context_data):
    processed_preds = []
    
    for i, pred in enumerate(preds):
        pred_val = pred if not hasattr(pred, '__len__') else pred
        
        over = context_data.iloc[i]['over'] if 'over' in context_data.columns else 10
        wickets = context_data.iloc[i]['cumulative_wickets'] if 'cumulative_wickets' in context_data.columns else 2
        phase = context_data.iloc[i]['phase'] if 'phase' in context_data.columns else 'Middle1'
        
        if phase == 'Powerplay':
            max_realistic = 18
        elif phase == 'Death':
            max_realistic = 20
        else:
            max_realistic = 15
        
        if wickets >= 8:
            pred_val *= 0.8
        elif wickets >= 6:
            pred_val *= 0.9
        
        pred_val = max(0, min(pred_val, max_realistic))
        processed_preds.append(pred_val)
    
    return np.array(processed_preds)

# EVALUATION - SIMPLIFIED
def evaluate_xgboost_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"✅ {model_name}:")
    print(f"   MAE: {mae:.2f}")
    print(f"   R²: {r2:.3f}")
    
    return mae, r2

print("\n" + "="*40)
print("MODEL EVALUATION RESULTS")
print("="*40)

remaining_mae, remaining_r2 = evaluate_xgboost_model(remaining_model, X_test_proc, y_rem_test, "XGBoost Remaining Runs")
over_mae, over_r2 = evaluate_xgboost_model(over_model, X_test_proc, y_next_test, "XGBoost Next Over")

# TOP 5 FEATURE IMPORTANCE ONLY
importance_remaining = remaining_model.feature_importances_
feature_names = X_train_proc.columns.tolist()
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance_remaining
}).sort_values('importance', ascending=False)

print(f"\n🎯 Top 5 Most Important Features:")
for i, row in importance_df.head(5).iterrows():
    print(f"   {row['feature']}: {row['importance']:.3f}")

# SAVE MODELS
remaining_model.save_model("xgboost_remaining_runs_model.json")
over_model.save_model("xgboost_next_over_model.json")

preprocessing_data = {
    'label_encoders': label_encoders,
    'scaler': scaler,
    'feature_names': feature_names,
    'categorical_features': categorical_features,
    'numeric_features': numeric_features
}
joblib.dump(preprocessing_data, "xgboost_preprocessor.pkl")

print(f"\n📋 FINAL COMPARISON:")
print(f"   ANN Remaining Runs MAE: 13.95")
print(f"   XGBoost Remaining Runs MAE: {remaining_mae:.2f}")
if remaining_mae < 13.95:
    improvement = ((13.95 - remaining_mae) / 13.95) * 100
    print(f"   🏆 XGBoost is {improvement:.1f}% better than ANN!")
else:
    print(f"   ⚠️  ANN performed better by {remaining_mae - 13.95:.2f} MAE")

print(f"\n✅ XGBoost models saved successfully!")

# Check for data leakage - SIMPLIFIED
all_predictions = remaining_model.predict(X_test_proc)
remaining_errors = np.abs(all_predictions - y_rem_test)
mean_error = np.mean(remaining_errors)

if mean_error < 5.0:
    print("   ⚠️  Possible data leakage - very low errors")
else:
    print("   ✅ No data leakage detected")
    

  df = df.groupby(['match_id', 'inning'], group_keys=False).apply(calculate_next_over_runs_improved)


🔧 Creating proper final score targets without data leakage...
✅ Remaining runs range: 0.0 to 287.0
✅ Average remaining runs: 84.4
✅ Samples after filtering: 257725
✅ Final samples: 253713
✅ Final feature matrix: (235954, 31)
✅ Remaining runs - Mean: 84.7, Std: 48.0
✅ Processed features for XGBoost: (188763, 31)
🚀 Training XGBoost Remaining Runs Model...
[0]	validation_0-mae:37.21410


KeyboardInterrupt: 

In [1]:
accuracy=postprocess_next_over_predictions.evaluate(X_test,Y_test)

NameError: name 'postprocess_next_over_predictions' is not defined