# AI Model Development for Football Analytics

This notebook implements various machine learning models for football player performance prediction.

## Contents
1. Data Preparation and Splitting
2. Baseline Models (Linear Regression, Random Forest)
3. Advanced Models (XGBoost, Neural Networks)
4. Time Series Models (Prophet, LSTM)
5. Ensemble Methods
6. Model Evaluation and Comparison
7. Model Interpretation and Feature Importance

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
import torch
import torch.nn as nn
from prophet import Prophet
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## 1. Data Preparation and Splitting

## Data Processing & Feature Engineering

This section implements comprehensive feature engineering for football analytics, leveraging the 2025/26 season data (4 rounds completed as of September 2025).

### Feature Categories Overview:
1. **Position-Specific Performance Metrics**
2. **Form & Momentum Indicators** 
3. **Fixture Difficulty & Context**
4. **Value & Ownership Dynamics**
5. **Team Performance Integration**
6. **Advanced Statistical Features**
7. **Rolling Window Aggregations**
8. **Seasonal Progression Features**

In [None]:
class FootballFeatureEngineer:
    """
    Comprehensive feature engineering for football analytics
    Designed for 2025/26 season data with 4+ rounds completed
    """
    
    def __init__(self):
        self.position_mappings = {
            1: 'GKP',  # Goalkeeper
            2: 'DEF',  # Defender  
            3: 'MID',  # Midfielder
            4: 'FWD'   # Forward/Attacker
        }
        
        self.difficulty_weights = {
            1: 0.2, 2: 0.4, 3: 0.6, 4: 0.8, 5: 1.0  # FDR scale
        }
    
    def load_data_from_db(self, db_path='data/fpl_data.db'):
        """Load data from SQLite database"""
        import sqlite3
        
        conn = sqlite3.connect(db_path)
        
        # Load players data
        players_df = pd.read_sql_query("""
            SELECT * FROM players 
            ORDER BY gameweek, player_id
        """, conn)
        
        # Load teams data  
        teams_df = pd.read_sql_query("""
            SELECT * FROM teams
            ORDER BY gameweek, team_id
        """, conn)
        
        # Load matches data
        matches_df = pd.read_sql_query("""
            SELECT * FROM matches
            ORDER BY gameweek, match_id
        """, conn)
        
        conn.close()
        
        return players_df, teams_df, matches_df
    
    def create_position_specific_features(self, df):
        """
        Enhanced position-specific metrics based on your requirements
        """
        df = df.copy()
        
        # Map position codes to names
        df['position_name'] = df['element_type'].map(self.position_mappings)
        
        # GOALKEEPERS - Enhanced metrics
        gkp_features = []
        if 'GKP' in df['position_name'].values:
            # Core GKP metrics (your original plan)
            df['save_percentage'] = np.where(
                df['saves'] + df['goals_conceded'] > 0,
                df['saves'] / (df['saves'] + df['goals_conceded']),
                0
            )
            df['clean_sheet_rate'] = df['clean_sheets'] / df['minutes'].clip(lower=1) * 90
            
            # Enhanced GKP features
            df['saves_per_90'] = df['saves'] / df['minutes'].clip(lower=1) * 90
            df['penalties_saved_rate'] = df['penalties_saved'] / df['penalties_faced'].clip(lower=1)
            df['goals_conceded_per_90'] = df['goals_conceded'] / df['minutes'].clip(lower=1) * 90
            df['distribution_accuracy'] = df['passes_completed'] / df['passes_attempted'].clip(lower=1)
            
            # Advanced GKP metrics
            df['gkp_value_rating'] = (
                df['save_percentage'] * 0.3 +
                df['clean_sheet_rate'] * 0.4 +
                df['distribution_accuracy'] * 0.2 +
                (1 - df['goals_conceded_per_90'] / 3) * 0.1  # Normalize goals conceded
            )
            
            gkp_features = ['save_percentage', 'clean_sheet_rate', 'saves_per_90', 
                           'penalties_saved_rate', 'goals_conceded_per_90', 
                           'distribution_accuracy', 'gkp_value_rating']
        
        # DEFENDERS - Enhanced metrics
        def_features = []
        if 'DEF' in df['position_name'].values:
            # Core DEF metrics (your original plan)
            df['tackles_won_rate'] = df['tackles'] / (df['tackles'] + 1)  # Avoid div by zero
            df['interceptions_per_90'] = df['interceptions'] / df['minutes'].clip(lower=1) * 90
            df['aerial_duels_won_rate'] = df['aerial_duels_won'] / df['aerial_duels_attempted'].clip(lower=1)
            df['pass_accuracy'] = df['passes_completed'] / df['passes_attempted'].clip(lower=1)
            
            # Enhanced DEF features
            df['defensive_actions_per_90'] = (
                df['tackles'] + df['interceptions'] + df['clearances']
            ) / df['minutes'].clip(lower=1) * 90
            
            df['attacking_threat_def'] = (
                df['goals_scored'] * 6 + 
                df['assists'] * 3 + 
                df['key_passes'] + 
                df['shots']
            )
            
            df['defensive_reliability'] = (
                df['clean_sheets'] * 0.4 +
                (df['tackles'] + df['interceptions']) * 0.3 +
                df['aerial_duels_won'] * 0.2 +
                df['pass_accuracy'] * 0.1
            )
            
            # Bonus point potential
            df['def_bonus_potential'] = (
                df['goals_scored'] * 2 +
                df['assists'] +
                df['clean_sheets'] +
                df['defensive_actions_per_90'] / 10
            )
            
            def_features = ['tackles_won_rate', 'interceptions_per_90', 'aerial_duels_won_rate',
                           'pass_accuracy', 'defensive_actions_per_90', 'attacking_threat_def',
                           'defensive_reliability', 'def_bonus_potential']
        
        # MIDFIELDERS - Enhanced metrics
        mid_features = []
        if 'MID' in df['position_name'].values:
            # Core MID metrics (your original plan)
            df['key_passes_per_90'] = df['key_passes'] / df['minutes'].clip(lower=1) * 90
            df['dribbles_success_rate'] = df['dribbles_completed'] / df['dribbles_attempted'].clip(lower=1)
            df['possession_impact'] = df['passes_attempted'] / df['minutes'].clip(lower=1) * 90
            
            # Enhanced MID features
            df['creativity_index'] = (
                df['key_passes'] * 2 +
                df['assists'] * 3 +
                df['dribbles_completed'] +
                df['shots'] * 0.5
            )
            
            df['work_rate_index'] = (
                df['tackles'] +
                df['interceptions'] +
                df['passes_attempted'] / 20  # Scale down passes
            )
            
            df['attacking_mid_potential'] = (
                df['goals_scored'] * 5 +
                df['assists'] * 3 +
                df['key_passes'] * 1.5 +
                df['shots'] * 0.8
            )
            
            df['defensive_mid_value'] = (
                df['tackles'] * 1.5 +
                df['interceptions'] * 1.2 +
                df['pass_accuracy'] * 10 +
                df['possession_impact'] * 0.1
            )
            
            # Box-to-box rating
            df['box_to_box_rating'] = (
                df['attacking_mid_potential'] * 0.4 +
                df['defensive_mid_value'] * 0.4 +
                df['creativity_index'] * 0.2
            )
            
            mid_features = ['key_passes_per_90', 'dribbles_success_rate', 'possession_impact',
                           'creativity_index', 'work_rate_index', 'attacking_mid_potential',
                           'defensive_mid_value', 'box_to_box_rating']
        
        # FORWARDS/ATTACKERS - Enhanced metrics  
        fwd_features = []
        if 'FWD' in df['position_name'].values:
            # Core FWD metrics (your original plan)
            df['goals_per_90'] = df['goals_scored'] / df['minutes'].clip(lower=1) * 90
            df['shot_conversion_rate'] = df['goals_scored'] / df['shots'].clip(lower=1)
            
            # Expected Goals calculation (simplified)
            df['xG_per_shot'] = np.where(
                df['shots'] > 0,
                (df['shots_on_target'] * 0.3 + df['shots_off_target'] * 0.1) / df['shots'],
                0
            )
            df['expected_goals'] = df['shots'] * df['xG_per_shot']
            
            # Enhanced FWD features
            df['attacking_threat'] = (
                df['shots'] * 1.5 +
                df['shots_on_target'] * 2 +
                df['key_passes'] +
                df['assists'] * 3
            )
            
            df['clinical_finishing'] = np.where(
                df['expected_goals'] > 0,
                df['goals_scored'] / df['expected_goals'],
                df['shot_conversion_rate']
            )
            
            df['penalty_specialist'] = (
                df['penalties_scored'] / df['penalties_attempted'].clip(lower=1)
            )
            
            df['big_chance_rating'] = (
                df['goals_scored'] * 0.4 +
                df['assists'] * 0.3 +
                df['shot_conversion_rate'] * 10 * 0.2 +
                df['attacking_threat'] * 0.1
            )
            
            # Differential pick potential
            df['fwd_differential_score'] = (
                df['goals_per_90'] * df['minutes'] / 90 * 
                (1 - df['selected_by_percent'] / 100)  # Reward low ownership
            )
            
            fwd_features = ['goals_per_90', 'shot_conversion_rate', 'xG_per_shot', 'expected_goals',
                           'attacking_threat', 'clinical_finishing', 'penalty_specialist',
                           'big_chance_rating', 'fwd_differential_score']
        
        # Store feature lists for later use
        self.position_features = {
            'GKP': gkp_features,
            'DEF': def_features, 
            'MID': mid_features,
            'FWD': fwd_features
        }
        
        return df

In [None]:
    def create_form_momentum_features(self, df):
        """
        Form and momentum indicators - crucial for FPL success
        """
        df = df.copy()
        df = df.sort_values(['player_id', 'gameweek'])
        
        # Rolling form windows (last 3, 5, 10 games)
        windows = [3, 5, 10]
        
        for window in windows:
            # Points form
            df[f'points_form_{window}'] = (
                df.groupby('player_id')['total_points']
                .rolling(window=window, min_periods=1)
                .mean()
                .reset_index(0, drop=True)
            )
            
            # Goals form
            df[f'goals_form_{window}'] = (
                df.groupby('player_id')['goals_scored']
                .rolling(window=window, min_periods=1)
                .sum()
                .reset_index(0, drop=True)
            )
            
            # Minutes consistency
            df[f'minutes_consistency_{window}'] = (
                df.groupby('player_id')['minutes']
                .rolling(window=window, min_periods=1)
                .std()
                .fillna(0)
                .reset_index(0, drop=True)
            )
        
        # Momentum indicators
        df['points_trend'] = (
            df.groupby('player_id')['total_points']
            .diff()
            .fillna(0)
        )
        
        df['improving_form'] = (
            df['points_form_3'] > df['points_form_5']
        ).astype(int)
        
        df['declining_form'] = (
            df['points_form_3'] < df['points_form_5']
        ).astype(int)
        
        # Hot streak detection
        df['consecutive_returns'] = (
            df.groupby('player_id')
            .apply(lambda x: x['total_points'].gt(0).groupby((x['total_points'] <= 0).cumsum()).cumsum())
            .reset_index(0, drop=True)
        )
        
        # Blank streak (important for transfers out)
        df['consecutive_blanks'] = (
            df.groupby('player_id')
            .apply(lambda x: x['total_points'].eq(0).groupby((x['total_points'] > 0).cumsum()).cumsum())
            .reset_index(0, drop=True)
        )
        
        return df
    
    def create_fixture_context_features(self, df, matches_df):
        """
        Fixture difficulty and context - essential for captain picks
        """
        df = df.copy()
        
        # Merge with fixture data
        df = df.merge(
            matches_df[['gameweek', 'team_id', 'difficulty', 'home_away', 'opponent']],
            on=['gameweek', 'team_id'],
            how='left'
        )
        
        # Home/away form splits
        for venue in ['home', 'away']:
            venue_mask = df['home_away'] == venue
            
            df[f'points_form_3_{venue}'] = (
                df[venue_mask].groupby('player_id')['total_points']
                .rolling(window=3, min_periods=1)
                .mean()
                .reindex(df.index)
                .fillna(0)
            )
        
        # Difficulty-adjusted metrics
        df['difficulty_weight'] = df['difficulty'].map(self.difficulty_weights).fillna(0.6)
        
        df['difficulty_adjusted_points'] = (
            df['total_points'] / df['difficulty_weight']
        )
        
        # Fixture swing analysis
        df['next_fixture_difficulty'] = (
            df.groupby('player_id')['difficulty']
            .shift(-1)
            .fillna(3)  # Average difficulty
        )
        
        df['fixture_swing'] = (
            df['next_fixture_difficulty'] - df['difficulty']
        )
        
        # Double gameweek potential (placeholder for future)
        df['double_gameweek_potential'] = 0  # Will be updated when DGWs announced
        
        return df
    
    def create_value_ownership_features(self, df):
        """
        Value and ownership dynamics - crucial for template avoidance
        """
        df = df.copy()
        
        # Price change tracking
        df['price_change'] = (
            df.groupby('player_id')['now_cost']
            .diff()
            .fillna(0)
        )
        
        df['price_change_total'] = (
            df.groupby('player_id')['price_change']
            .cumsum()
        )
        
        # Value metrics
        df['points_per_million'] = (
            df['total_points'] / (df['now_cost'] / 10)
        )
        
        df['value_form_3'] = (
            df['points_form_3'] / (df['now_cost'] / 10)
        )
        
        # Ownership dynamics
        df['ownership_change'] = (
            df.groupby('player_id')['selected_by_percent']
            .diff()
            .fillna(0)
        )
        
        df['template_player'] = (
            df['selected_by_percent'] > 50
        ).astype(int)
        
        df['differential_player'] = (
            df['selected_by_percent'] < 10
        ).astype(int)
        
        # Transfer activity
        df['transfer_momentum'] = (
            df['transfers_in'] - df['transfers_out']
        )
        
        df['transfer_momentum_pct'] = (
            df['transfer_momentum'] / df['selected_by_percent'].clip(lower=1) * 100
        )
        
        # Value opportunity score
        df['value_opportunity'] = (
            df['points_per_million'] * 
            (1 - df['selected_by_percent'] / 100) *  # Lower ownership bonus
            np.exp(-df['price_change_total'] / 5)    # Recent price rise penalty
        )
        
        return df

In [None]:
    def create_team_integration_features(self, df, teams_df):
        """
        Team performance integration - individual performance in team context
        """
        df = df.copy()
        
        # Merge team performance
        df = df.merge(
            teams_df[['gameweek', 'team_id', 'strength_overall_home', 'strength_overall_away',
                     'strength_attack_home', 'strength_attack_away', 
                     'strength_defence_home', 'strength_defence_away']],
            on=['gameweek', 'team_id'],
            how='left'
        )
        
        # Team strength context
        df['team_attack_strength'] = np.where(
            df['home_away'] == 'home',
            df['strength_attack_home'],
            df['strength_attack_away']
        )
        
        df['team_defence_strength'] = np.where(
            df['home_away'] == 'home', 
            df['strength_defence_home'],
            df['strength_defence_away']
        )
        
        # Player contribution to team
        team_totals = df.groupby(['gameweek', 'team_id']).agg({
            'goals_scored': 'sum',
            'assists': 'sum', 
            'total_points': 'sum'
        }).reset_index()
        
        team_totals.columns = ['gameweek', 'team_id', 'team_goals', 'team_assists', 'team_points']
        
        df = df.merge(team_totals, on=['gameweek', 'team_id'], how='left')
        
        # Contribution percentages
        df['goal_contribution_pct'] = (
            (df['goals_scored'] + df['assists']) / df['team_goals'].clip(lower=1) * 100
        )
        
        df['points_contribution_pct'] = (
            df['total_points'] / df['team_points'].clip(lower=1) * 100
        )
        
        # Team performance indicators
        df['team_over_performing'] = (
            df['team_points'] > df[['team_attack_strength', 'team_defence_strength']].mean(axis=1) * 2
        ).astype(int)
        
        return df
    
    def create_advanced_statistical_features(self, df):
        """
        Advanced statistical features for model enhancement
        """
        df = df.copy()
        
        # Performance consistency metrics
        df['points_variance'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=5, min_periods=2)
            .var()
            .reset_index(0, drop=True)
            .fillna(0)
        )
        
        df['consistency_score'] = (
            df['points_form_5'] / (df['points_variance'] + 1)  # Add 1 to avoid division by zero
        )
        
        # Ceiling and floor analysis
        df['points_ceiling'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=10, min_periods=3)
            .max()
            .reset_index(0, drop=True)
        )
        
        df['points_floor'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=10, min_periods=3)
            .min()
            .reset_index(0, drop=True)
        )
        
        # Risk-adjusted returns
        df['sharpe_ratio'] = (
            df['points_form_5'] / (df['points_variance'] + 0.1)
        )
        
        # Bonus point propensity
        df['bonus_rate'] = (
            df.groupby('player_id')['bonus']
            .rolling(window=5, min_periods=1)
            .mean()
            .reset_index(0, drop=True)
        )
        
        # Minutes security
        df['minutes_security'] = (
            df.groupby('player_id')['minutes']
            .rolling(window=5, min_periods=1)
            .apply(lambda x: (x >= 60).sum() / len(x))
            .reset_index(0, drop=True)
        )
        
        # Yellow/red card risk
        df['card_risk'] = (
            df.groupby('player_id')['yellow_cards']
            .rolling(window=10, min_periods=1)
            .sum()
            .reset_index(0, drop=True)
        ) + (
            df.groupby('player_id')['red_cards']
            .rolling(window=10, min_periods=1)
            .sum()
            .reset_index(0, drop=True) * 3
        )
        
        return df
    
    def create_seasonal_progression_features(self, df):
        """
        Seasonal progression and timing features
        """
        df = df.copy()
        
        # Season phase indicators
        df['season_phase'] = pd.cut(
            df['gameweek'], 
            bins=[0, 10, 19, 29, 38],
            labels=['Early', 'Autumn', 'Winter', 'Spring']
        )
        
        # Gameweek timing features
        df['gameweek_sin'] = np.sin(2 * np.pi * df['gameweek'] / 38)
        df['gameweek_cos'] = np.cos(2 * np.pi * df['gameweek'] / 38)
        
        # Progressive season metrics
        df['season_total_points'] = (
            df.groupby('player_id')['total_points']
            .cumsum()
        )
        
        df['season_average_points'] = (
            df['season_total_points'] / df['gameweek']
        )
        
        # Fixture congestion (Premier League context)
        df['fixture_congestion'] = 0  # Placeholder - would calculate based on UCL/UEL participation
        
        # International break effect
        df['post_international_break'] = 0  # Placeholder - would identify post-break gameweeks
        
        return df
    
    def create_all_features(self, players_df, teams_df, matches_df):
        """
        Main method to create all features
        """
        print("🔧 Starting comprehensive feature engineering...")
        
        # Start with position-specific features
        df = self.create_position_specific_features(players_df)
        print("✅ Position-specific features created")
        
        # Add form and momentum
        df = self.create_form_momentum_features(df)
        print("✅ Form and momentum features created")
        
        # Add fixture context
        df = self.create_fixture_context_features(df, matches_df)
        print("✅ Fixture context features created")
        
        # Add value and ownership
        df = self.create_value_ownership_features(df)
        print("✅ Value and ownership features created")
        
        # Add team integration
        df = self.create_team_integration_features(df, teams_df)
        print("✅ Team integration features created")
        
        # Add advanced statistical features
        df = self.create_advanced_statistical_features(df)
        print("✅ Advanced statistical features created")
        
        # Add seasonal progression
        df = self.create_seasonal_progression_features(df)
        print("✅ Seasonal progression features created")
        
        # Add enhanced specific features (user requirements)
        df = self.create_enhanced_form_indicators(df)
        print("✅ Enhanced 5-game form indicators created")
        
        df = self.create_opposition_difficulty_ratings(df, teams_df)
        print("✅ Opposition difficulty ratings created")
        
        df = self.create_fatigue_metrics(df)
        print("✅ Fatigue metrics created")
        
        df = self.create_home_advantage_factors(df)
        print("✅ Home advantage factors created")
        
        print(f"🎯 Feature engineering complete! Created {len(df.columns)} total features")
        
        return df
    
    def get_feature_summary(self, df):
        """
        Get summary of created features by category
        """
        summary = {
            'Total Features': len(df.columns),
            'Position-Specific': sum(len(features) for features in self.position_features.values()),
            'Form & Momentum': len([col for col in df.columns if 'form' in col or 'trend' in col or 'momentum' in col]),
            'Fixture Context': len([col for col in df.columns if 'difficulty' in col or 'fixture' in col]),
            'Value & Ownership': len([col for col in df.columns if 'value' in col or 'ownership' in col or 'transfer' in col]),
            'Team Integration': len([col for col in df.columns if 'team' in col or 'contribution' in col]),
            'Advanced Statistical': len([col for col in df.columns if any(x in col for x in ['variance', 'consistency', 'sharpe', 'ceiling', 'floor'])]),
            'Seasonal Progression': len([col for col in df.columns if any(x in col for x in ['season', 'gameweek_sin', 'gameweek_cos', 'phase'])]),
            'Enhanced 5-Game Form': len([col for col in df.columns if 'form_5' in col]),
            'Opposition Difficulty': len([col for col in df.columns if any(x in col for x in ['next_3_fixtures', 'next_5_fixtures', 'vs_difficulty'])]),
            'Fatigue Metrics': len([col for col in df.columns if any(x in col for x in ['minutes_last', 'workload', 'rotation', 'overplay'])]),
            'Home Advantage': len([col for col in df.columns if any(x in col for x in ['home_advantage', 'venue', 'home_form', 'away_form'])])
        }
        
        return summary

In [None]:
    def create_enhanced_form_indicators(self, df):
        """
        Enhanced form indicators with specific focus on last 5 games performance
        """
        df = df.copy()
        df = df.sort_values(['player_id', 'gameweek'])
        
        # Specific 5-game form indicators (your requirement)
        df['form_5_total_points'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=5, min_periods=1)
            .sum()
            .reset_index(0, drop=True)
        )
        
        df['form_5_avg_points'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=5, min_periods=1)
            .mean()
            .reset_index(0, drop=True)
        )
        
        df['form_5_goals'] = (
            df.groupby('player_id')['goals_scored']
            .rolling(window=5, min_periods=1)
            .sum()
            .reset_index(0, drop=True)
        )
        
        df['form_5_assists'] = (
            df.groupby('player_id')['assists']
            .rolling(window=5, min_periods=1)
            .sum()
            .reset_index(0, drop=True)
        )
        
        df['form_5_clean_sheets'] = (
            df.groupby('player_id')['clean_sheets']
            .rolling(window=5, min_periods=1)
            .sum()
            .reset_index(0, drop=True)
        )
        
        # Form quality indicators
        df['form_5_consistency'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=5, min_periods=2)
            .std()
            .fillna(0)
            .reset_index(0, drop=True)
        )
        
        # Form trend over last 5 games
        df['form_5_trend'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=5, min_periods=2)
            .apply(lambda x: np.polyfit(range(len(x)), x, 1)[0] if len(x) >= 2 else 0)
            .reset_index(0, drop=True)
        )
        
        # Hot form indicator (3+ returns in last 5)
        df['hot_form_5'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=5, min_periods=1)
            .apply(lambda x: (x > 0).sum() >= 3)
            .astype(int)
            .reset_index(0, drop=True)
        )
        
        # Cold form indicator (3+ blanks in last 5)
        df['cold_form_5'] = (
            df.groupby('player_id')['total_points']
            .rolling(window=5, min_periods=1)
            .apply(lambda x: (x == 0).sum() >= 3)
            .astype(int)
            .reset_index(0, drop=True)
        )
        
        return df
    
    def create_opposition_difficulty_ratings(self, df, teams_df):
        """
        Enhanced opposition difficulty ratings and upcoming fixture analysis
        """
        df = df.copy()
        
        # Create team strength lookup
        team_strength = teams_df.groupby('team_id').agg({
            'strength_overall_home': 'mean',
            'strength_overall_away': 'mean',
            'strength_attack_home': 'mean',
            'strength_attack_away': 'mean',
            'strength_defence_home': 'mean',
            'strength_defence_away': 'mean'
        }).reset_index()
        
        # Merge team strengths
        df = df.merge(team_strength, on='team_id', how='left', suffixes=('', '_team'))
        
        # Enhanced difficulty metrics
        df['opponent_defensive_strength'] = 0  # Placeholder - would get from opponent data
        df['opponent_attacking_strength'] = 0  # Placeholder - would get from opponent data
        
        # Difficulty-weighted recent performance
        df['difficulty_weighted_form_5'] = (
            df['form_5_avg_points'] * (6 - df['difficulty']) / 5  # Boost harder fixtures
        )
        
        # Next 3 fixtures difficulty (for transfer planning)
        df['next_3_fixtures_difficulty'] = (
            df.groupby('player_id')['difficulty']
            .shift(-1).fillna(3) +
            df.groupby('player_id')['difficulty']
            .shift(-2).fillna(3) +
            df.groupby('player_id')['difficulty']
            .shift(-3).fillna(3)
        ) / 3
        
        # Next 5 fixtures difficulty (season planning)
        next_5_difficulties = []
        for i in range(1, 6):
            next_5_difficulties.append(
                df.groupby('player_id')['difficulty']
                .shift(-i).fillna(3)
            )
        df['next_5_fixtures_difficulty'] = sum(next_5_difficulties) / 5
        
        # Fixture difficulty swing
        df['difficulty_improvement'] = (
            df['difficulty'] - df['next_3_fixtures_difficulty']
        )
        
        # Historical performance vs similar difficulty
        for diff_level in [1, 2, 3, 4, 5]:
            df[f'vs_difficulty_{diff_level}_avg'] = (
                df[df['difficulty'] == diff_level]
                .groupby('player_id')['total_points']
                .transform('mean')
            )
        
        return df
    
    def create_fatigue_metrics(self, df):
        """
        Fatigue metrics based on minutes played recently
        """
        df = df.copy()
        df = df.sort_values(['player_id', 'gameweek'])
        
        # Minutes played in last 3, 5, and 10 games
        for window in [3, 5, 10]:
            df[f'minutes_last_{window}'] = (
                df.groupby('player_id')['minutes']
                .rolling(window=window, min_periods=1)
                .sum()
                .reset_index(0, drop=True)
            )
            
            # Average minutes per game
            df[f'avg_minutes_last_{window}'] = (
                df[f'minutes_last_{window}'] / 
                df.groupby('player_id').cumcount().clip(upper=window-1).add(1)
            )
        
        # Fatigue indicators
        df['high_minutes_workload'] = (df['minutes_last_5'] > 400).astype(int)  # 80+ min avg
        df['rotation_risk'] = (df['minutes_last_3'] < 180).astype(int)  # Less than 60 min avg
        df['minutes_secure'] = (df['avg_minutes_last_5'] > 75).astype(int)  # Consistent starter
        
        # International break recovery (placeholder)
        df['post_international_break'] = 0  # Would be set for specific gameweeks
        
        # Fixture congestion (midweek games, cup competitions)
        df['fixture_congestion_risk'] = 0  # Placeholder for European competition analysis
        
        # Minutes trend
        df['minutes_trend'] = (
            df.groupby('player_id')['minutes']
            .rolling(window=5, min_periods=2)
            .apply(lambda x: np.polyfit(range(len(x)), x, 1)[0] if len(x) >= 2 else 0)
            .reset_index(0, drop=True)
        )
        
        # Rest advantage (games with 0 minutes indicating rest)
        df['games_rested_last_5'] = (
            df.groupby('player_id')['minutes']
            .rolling(window=5, min_periods=1)
            .apply(lambda x: (x == 0).sum())
            .reset_index(0, drop=True)
        )
        
        # Overplay risk (multiple 90+ minute games)
        df['overplay_risk_last_5'] = (
            df.groupby('player_id')['minutes']
            .rolling(window=5, min_periods=1)
            .apply(lambda x: (x >= 90).sum())
            .reset_index(0, drop=True)
        )
        
        return df
    
    def create_home_advantage_factors(self, df):
        """
        Home advantage factors and venue-specific performance
        """
        df = df.copy()
        
        # Basic home/away split (enhance existing)
        home_mask = df['home_away'] == 'home'
        away_mask = df['home_away'] == 'away'
        
        # Home vs away performance differentials
        df['home_points_avg'] = (
            df[home_mask].groupby('player_id')['total_points']
            .transform('mean')
        )
        
        df['away_points_avg'] = (
            df[away_mask].groupby('player_id')['total_points']
            .transform('mean')
        )
        
        df['home_advantage'] = (
            df['home_points_avg'] - df['away_points_avg']
        ).fillna(0)
        
        # Position-specific home advantage
        df['home_advantage_gkp'] = 0  # GKPs often better at home (familiar goal)
        df['home_advantage_def'] = 0  # DEFs benefit from crowd/familiar backline
        df['home_advantage_mid'] = 0  # MIDs benefit from home crowd energy
        df['home_advantage_fwd'] = 0  # FWDs benefit from home crowd support
        
        # Update based on position
        for position, advantage in [(1, 0.2), (2, 0.3), (3, 0.15), (4, 0.25)]:
            position_mask = df['element_type'] == position
            home_position = home_mask & position_mask
            
            if position == 1:
                df.loc[home_position, 'home_advantage_gkp'] = advantage
            elif position == 2:
                df.loc[home_position, 'home_advantage_def'] = advantage
            elif position == 3:
                df.loc[home_position, 'home_advantage_mid'] = advantage
            elif position == 4:
                df.loc[home_position, 'home_advantage_fwd'] = advantage
        
        # Home fixture run (next N home games)
        df['next_3_home_games'] = 0  # Count of home games in next 3
        df['next_5_home_games'] = 0  # Count of home games in next 5
        
        # Away fixture run
        df['next_3_away_games'] = 0  # Count of away games in next 3
        df['next_5_away_games'] = 0  # Count of away games in next 5
        
        # Home/away streaks
        df['current_home_streak'] = 0  # Consecutive home games
        df['current_away_streak'] = 0  # Consecutive away games
        
        # Venue-specific form
        for venue in ['home', 'away']:
            venue_mask = df['home_away'] == venue
            df[f'{venue}_form_last_3'] = (
                df[venue_mask].groupby('player_id')['total_points']
                .rolling(window=3, min_periods=1)
                .mean()
                .reindex(df.index)
                .fillna(df['total_points'])  # Fill with current game if no history
            )
        
        # Venue comfort rating
        df['venue_comfort'] = np.where(
            home_mask, 
            df['home_form_last_3'],
            df['away_form_last_3']
        )
        
        return df

In [None]:
# Practical Implementation Example - Load Real 2025/26 Data
def load_and_process_current_season():
    """
    Load the current 2025/26 season data and run the full feature pipeline.

    Progress is reported on stdout: record counts for the three loaded
    tables, the number of distinct gameweeks, and the per-category feature
    summary produced by the engineer.

    Returns:
        tuple: ``(engineered_df, feature_engineer)`` - the engineered data
        and the ``FootballFeatureEngineer`` instance that produced it.
    """
    engineer = FootballFeatureEngineer()

    # The engineer owns the database access and hands back three tables.
    players, teams, matches = engineer.load_data_from_db()

    print(f"📊 Loaded data:")
    print(f"   • Players records: {len(players)}")
    print(f"   • Teams records: {len(teams)}")
    print(f"   • Matches records: {len(matches)}")
    print(f"   • Gameweeks available: {players['gameweek'].nunique()}")

    # Run every feature-engineering step in one go.
    engineered = engineer.create_all_features(players, teams, matches)

    # Per-category feature counts, as reported by the engineer.
    feature_summary = engineer.get_feature_summary(engineered)
    print("\n📈 Feature Engineering Summary:")
    for category in feature_summary:
        count = feature_summary[category]
        print(f"   • {category}: {count}")

    return engineered, engineer

# Execute the feature engineering
# engineered_data, feature_eng = load_and_process_current_season()

In [None]:
def analyze_position_specific_performance(df, position='FWD'):
    """
    Print a summary of one position and its top performers per key metric.

    Args:
        df: DataFrame with at least ``position_name``, ``player_id`` and
            ``web_name`` columns; metric columns are optional and are
            skipped when absent.
        position: Position label to filter on (``'FWD'``, ``'MID'``,
            ``'DEF'`` or ``'GKP'`` have predefined key metrics).

    Returns:
        A copy of the rows of ``df`` whose ``position_name`` equals
        ``position``. For a position without predefined metrics only the
        headline counts are printed and the filtered rows are still returned.
    """
    # Key metrics reported for each known position.
    metrics_by_position = {
        'FWD': ['goals_per_90', 'shot_conversion_rate', 'attacking_threat',
                'clinical_finishing', 'fwd_differential_score'],
        'MID': ['creativity_index', 'box_to_box_rating', 'attacking_mid_potential',
                'key_passes_per_90', 'work_rate_index'],
        'DEF': ['defensive_reliability', 'attacking_threat_def', 'def_bonus_potential',
                'defensive_actions_per_90', 'pass_accuracy'],
        'GKP': ['save_percentage', 'gkp_value_rating', 'clean_sheet_rate',
                'saves_per_90', 'distribution_accuracy'],
    }

    position_data = df[df['position_name'] == position].copy()

    print(f"🎯 {position} Performance Analysis:")
    print(f"   • Total players: {position_data['player_id'].nunique()}")
    print(f"   • Data points: {len(position_data)}")

    key_metrics = metrics_by_position.get(position)
    if key_metrics is None:
        # Previously an unknown position crashed with UnboundLocalError.
        print(f"   • No key metrics defined for position '{position}'")
        return position_data

    print(f"\n📊 Key {position} Metrics (Top 5 players):")
    for metric in key_metrics:
        if metric not in position_data.columns:
            continue
        # Average each player's metric over all their rows, then rank.
        top_performers = (position_data.groupby('web_name')[metric]
                          .mean()
                          .sort_values(ascending=False)
                          .head(5))
        print(f"\n{metric}:")
        for player, value in top_performers.items():
            print(f"   • {player}: {value:.2f}")

    return position_data

def create_feature_correlation_analysis(df):
    """
    Rank numeric features by absolute correlation with ``total_points``.

    Identifier-like columns are excluded. A column counts as an identifier
    when one of its underscore-separated name parts is ``id`` or ``index``
    (e.g. ``player_id``, ``index``). Matching on whole name parts rather
    than substrings keeps genuine features such as
    ``attacking_mid_potential`` or ``home_advantage_mid`` in the analysis.

    Args:
        df: DataFrame containing the numeric feature columns and, ideally,
            a numeric ``total_points`` column.

    Returns:
        A Series of absolute correlations with ``total_points`` sorted in
        descending order (the target itself is included), or ``None`` when
        ``total_points`` is not among the numeric columns.
    """
    def _is_identifier(col):
        # Compare whole name parts so 'mid' or 'width' are not treated as ids.
        parts = str(col).lower().split('_')
        return 'id' in parts or 'index' in parts

    numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
    feature_cols = [col for col in numeric_features if not _is_identifier(col)]

    if 'total_points' not in feature_cols:
        return None

    correlations = (df[feature_cols].corr()['total_points']
                    .abs()
                    .sort_values(ascending=False))

    print("🔗 Top 20 Features Correlated with Total Points:")
    # Drop the target before ranking so numbering starts at 1 and a full
    # 20 features are listed.
    ranked = correlations.drop('total_points').head(20)
    for rank, (feature, corr) in enumerate(ranked.items(), start=1):
        print(f"   {rank:2d}. {feature}: {corr:.3f}")

    return correlations

def prepare_features_for_modeling(df, target_col='total_points'):
    """
    Build the final feature matrix and target for machine learning models.

    Identifier and descriptive columns, plus the target itself, are removed
    from the feature set. Missing values in numeric columns are replaced by
    the column median; non-numeric columns are left untouched.

    Args:
        df: Engineered DataFrame containing ``target_col``.
        target_col: Name of the column to predict.

    Returns:
        tuple: ``(features_df, target, feature_cols, categorical_cols)``
        where ``features_df`` is a copy of the selected columns,
        ``target`` is ``df[target_col]``, ``feature_cols`` lists the
        selected column names in their original order and
        ``categorical_cols`` lists those of ``object`` dtype.
    """
    # Columns that identify or describe a row rather than predict the target.
    exclude_cols = {
        'player_id', 'web_name', 'team_id', 'gameweek', 'match_id',
        'first_name', 'second_name', 'position_name', 'season_phase',
        target_col,  # Remove target from features
    }

    feature_cols = [col for col in df.columns if col not in exclude_cols]

    # Object-typed columns are reported as categorical (not encoded here).
    categorical_cols = df[feature_cols].select_dtypes(include=['object']).columns.tolist()

    features_df = df[feature_cols].copy()

    # numeric_only=True: median() over object columns raises on pandas >= 2.0,
    # and a median is only meaningful for the numeric columns anyway.
    features_df = features_df.fillna(features_df.median(numeric_only=True))

    print(f"🎯 Final Feature Set Prepared:")
    print(f"   • Total features: {len(feature_cols)}")
    print(f"   • Categorical features: {len(categorical_cols)}")
    print(f"   • Numeric features: {len(feature_cols) - len(categorical_cols)}")
    print(f"   • Target variable: {target_col}")
    print(f"   • Data shape: {features_df.shape}")

    return features_df, df[target_col], feature_cols, categorical_cols

# Example usage for the current 2025/26 season analysis
def run_feature_engineering_demo():
    """
    Run the whole feature-engineering pipeline end to end as a demo.

    Steps, each separated by a dashed rule on stdout: load and engineer the
    season data, print the per-position analysis for every position, print
    the correlation ranking, and prepare the modelling matrices.

    Returns:
        tuple: ``(X, y, feature_names, categorical_features, engineered_data)``.
    """
    print("🚀 Running Feature Engineering Demo for 2025/26 Season")
    print("=" * 60)

    # The engineer instance is returned by the loader but not needed here.
    engineered_data, _engineer = load_and_process_current_season()

    # One report per position, each preceded by a separator line.
    positions = ('FWD', 'MID', 'DEF', 'GKP')
    for position in positions:
        print(f"\n{'-'*40}")
        analyze_position_specific_performance(engineered_data, position)

    # Correlation ranking is printed by the callee; its result is not used here.
    print(f"\n{'-'*40}")
    create_feature_correlation_analysis(engineered_data)

    # Final modelling inputs.
    print(f"\n{'-'*40}")
    modelling_inputs = prepare_features_for_modeling(engineered_data)
    X, y, feature_names, categorical_features = modelling_inputs

    print(f"\n✅ Feature engineering complete and ready for modeling!")

    return X, y, feature_names, categorical_features, engineered_data

# Uncomment to run the demo
# X, y, feature_names, categorical_features, full_data = run_feature_engineering_demo()

In [None]:
# Enhanced Feature Demonstration for Your Specific Requirements
def demonstrate_enhanced_features():
    """
    Demonstrate the specific enhanced features requested:
    1. Form indicators (last 5 games performance)
    2. Opposition difficulty ratings
    3. Fatigue metrics (minutes played recently)
    4. Home advantage factors

    Reads ``sample_players_data.csv`` from the working directory, builds
    random mock team-strength data for 4 gameweeks x 20 teams, applies the
    feature-engineering steps one by one and prints a report on stdout.

    Returns:
        The player DataFrame after all enhanced feature steps were applied.
    """

    print("🎯 Demonstrating Enhanced Features for 2025/26 Season")
    print("=" * 60)

    # Load sample data
    players_df = pd.read_csv('sample_players_data.csv')

    # Create mock teams and matches data
    teams_data = []
    matches_data = []

    for gw in range(1, 5):  # 4 gameweeks
        for team_id in range(1, 21):  # 20 teams
            teams_data.append({
                'gameweek': gw,
                'team_id': team_id,
                'strength_overall_home': np.random.uniform(1000, 1400),
                'strength_overall_away': np.random.uniform(900, 1300),
                'strength_attack_home': np.random.uniform(1000, 1400),
                'strength_attack_away': np.random.uniform(900, 1300),
                'strength_defence_home': np.random.uniform(1000, 1400),
                'strength_defence_away': np.random.uniform(900, 1300)
            })

            matches_data.append({
                'gameweek': gw,
                'team_id': team_id,
                'difficulty': np.random.randint(1, 6),
                'home_away': np.random.choice(['home', 'away']),
                'opponent': f'Team_{np.random.randint(1, 21)}'
            })

    teams_df = pd.DataFrame(teams_data)
    # NOTE: the mock matches table is built but none of the steps below
    # consume it.
    matches_df = pd.DataFrame(matches_data)

    # Apply enhanced feature engineering
    engineer = FootballFeatureEngineer()

    # "\n" (not "\\n") so a real blank line is printed instead of the two
    # literal characters backslash + n.
    print("\n🔧 Applying Enhanced Feature Engineering...")

    # Test each new feature set
    df_enhanced = engineer.create_position_specific_features(players_df)
    print("✅ Base position features applied")

    df_enhanced = engineer.create_enhanced_form_indicators(df_enhanced)
    print("✅ Enhanced 5-game form indicators created")

    df_enhanced = engineer.create_opposition_difficulty_ratings(df_enhanced, teams_df)
    print("✅ Opposition difficulty ratings created")

    df_enhanced = engineer.create_fatigue_metrics(df_enhanced)
    print("✅ Fatigue metrics created")

    df_enhanced = engineer.create_home_advantage_factors(df_enhanced)
    print("✅ Home advantage factors created")

    print(f"\n📊 Enhanced Features Summary:")
    print(f"   • Original columns: {len(players_df.columns)}")
    print(f"   • Enhanced columns: {len(df_enhanced.columns)}")
    print(f"   • New features added: {len(df_enhanced.columns) - len(players_df.columns)}")

    def _columns_containing(*fragments):
        # Columns of the enhanced frame whose name contains any fragment.
        return [col for col in df_enhanced.columns
                if any(fragment in col for fragment in fragments)]

    def _report_feature_group(heading, noun, features, limit=None):
        # Print the group heading, the feature count and each feature's mean
        # (only the first `limit` features when a limit is given).
        print(heading)
        print(f"   • Created {len(features)} {noun}:")
        for feature in features[:limit]:
            avg_val = df_enhanced[feature].mean()
            print(f"     - {feature}: avg {avg_val:.2f}")

    # Demonstrate specific features
    _report_feature_group(
        "\n🎯 1. Form Indicators (Last 5 Games):",
        "form indicators",
        _columns_containing('form_5'),
        limit=5,  # Show first 5
    )
    _report_feature_group(
        "\n🎯 2. Opposition Difficulty Ratings:",
        "difficulty features",
        _columns_containing('next_3_fixtures', 'next_5_fixtures', 'difficulty_weighted'),
    )
    _report_feature_group(
        "\n🎯 3. Fatigue Metrics:",
        "fatigue features",
        _columns_containing('minutes_last', 'workload', 'rotation', 'overplay'),
    )
    _report_feature_group(
        "\n🎯 4. Home Advantage Factors:",
        "home advantage features",
        _columns_containing('home_advantage', 'venue', 'home_form', 'away_form'),
    )

    # Show some practical insights
    print("\n💡 Practical Insights from Enhanced Features:")

    # Only the most recent gameweek is used for the player rankings below.
    latest_gw = df_enhanced['gameweek'].max()
    latest_data = df_enhanced[df_enhanced['gameweek'] == latest_gw]

    # Players with best 5-game form
    if 'form_5_avg_points' in latest_data.columns:
        top_form = latest_data.nlargest(5, 'form_5_avg_points')[['web_name', 'position_name', 'form_5_avg_points']]
        print("\n🔥 Top 5-Game Form Players:")
        for _, player in top_form.iterrows():
            print(f"   • {player['web_name']} ({player['position_name']}): {player['form_5_avg_points']:.2f} pts/game")

    # Players with easiest upcoming fixtures
    if 'next_3_fixtures_difficulty' in latest_data.columns:
        easy_fixtures = latest_data.nsmallest(5, 'next_3_fixtures_difficulty')[['web_name', 'position_name', 'next_3_fixtures_difficulty']]
        print("\n🎯 Easiest Next 3 Fixtures:")
        for _, player in easy_fixtures.iterrows():
            print(f"   • {player['web_name']} ({player['position_name']}): {player['next_3_fixtures_difficulty']:.2f} difficulty")

    # Players with rotation risk
    if 'rotation_risk' in latest_data.columns:
        rotation_risk = latest_data[latest_data['rotation_risk'] == 1][['web_name', 'position_name', 'avg_minutes_last_3']]
        print(f"\n⚠️  Rotation Risk Players: {len(rotation_risk)}")
        if len(rotation_risk) > 0:
            for _, player in rotation_risk.head(3).iterrows():
                print(f"   • {player['web_name']} ({player['position_name']}): {player['avg_minutes_last_3']:.1f} min/game")

    print("\n✅ Enhanced Feature Demonstration Complete!")
    return df_enhanced

# Uncomment to run the demonstration
# enhanced_data = demonstrate_enhanced_features()

In [None]:
class FootballDataPreprocessor:
    """
    Preprocess football data for machine learning.

    Holds the fitted ``StandardScaler`` in ``self.scaler`` and one fitted
    ``LabelEncoder`` per encoded column in ``self.label_encoders`` so the
    same transformations can be reused later.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def prepare_data(self, df, target_col, categorical_cols=None, date_col='date'):
        """
        Encode categoricals, separate features from target and fill gaps.

        Args:
            df: Input DataFrame; it is copied, never modified.
            target_col: Name of the column to predict.
            categorical_cols: Optional column names to label-encode. Names
                missing from ``df`` are ignored.
            date_col: Date column name; excluded from the features.

        Returns:
            tuple: ``(X, y, feature_cols)``. ``X`` contains every column
            except the target, the date column, ``player_id`` and
            ``match_id``; its numeric columns have missing values replaced
            by the column median.
        """
        processed_df = df.copy()

        # Handle categorical variables
        if categorical_cols:
            for col in categorical_cols:
                if col in processed_df.columns:
                    le = LabelEncoder()
                    processed_df[col] = le.fit_transform(processed_df[col].astype(str))
                    self.label_encoders[col] = le

        # Separate features and target
        excluded = {target_col, date_col, 'player_id', 'match_id'}
        feature_cols = [col for col in processed_df.columns if col not in excluded]

        X = processed_df[feature_cols]
        y = processed_df[target_col]

        # numeric_only=True: columns that were not label-encoded may still be
        # non-numeric, and median() over those raises on pandas >= 2.0.
        X = X.fillna(X.median(numeric_only=True))

        return X, y, feature_cols

    def split_data_temporal(self, X, y, df, date_col='date', test_size=0.2, val_size=0.2):
        """
        Split data chronologically into train / validation / test.

        The test set holds the rows dated on or after the ``1 - test_size``
        quantile of all dates. The validation set is cut the same way from
        the remaining rows using ``val_size``, so validation rows are never
        older than training rows (a random validation split would let the
        model train on matches played after the ones it is validated on).

        Args:
            X: Features, row-aligned with ``df``.
            y: Target, row-aligned with ``df``.
            df: DataFrame providing ``date_col``.
            date_col: Name of the date column in ``df``.
            test_size: Fraction of the date range reserved for testing.
            val_size: Fraction of the pre-test date range reserved for
                validation.

        Returns:
            tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        """
        dates = pd.to_datetime(df[date_col])

        # Cut-off between the (train + validation) period and the test period.
        test_cutoff = dates.quantile(1 - test_size)
        before_test = dates < test_cutoff
        test_mask = dates >= test_cutoff

        # Second cut-off, computed only over the pre-test dates.
        val_cutoff = dates[before_test].quantile(1 - val_size)
        train_mask = before_test & (dates < val_cutoff)
        val_mask = before_test & (dates >= val_cutoff)

        X_train, X_val, X_test = X[train_mask], X[val_mask], X[test_mask]
        y_train, y_val, y_test = y[train_mask], y[val_mask], y[test_mask]

        return X_train, X_val, X_test, y_train, y_val, y_test

    def scale_features(self, X_train, X_val, X_test):
        """
        Scale features using training data statistics.

        The scaler is fitted on ``X_train`` only and then applied to the
        validation and test sets.

        Returns:
            tuple: ``(X_train_scaled, X_val_scaled, X_test_scaled)``.
        """
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_val_scaled = self.scaler.transform(X_val)
        X_test_scaled = self.scaler.transform(X_test)

        return X_train_scaled, X_val_scaled, X_test_scaled

# TODO: Load and prepare your data
# preprocessor = FootballDataPreprocessor()
# X, y, feature_cols = preprocessor.prepare_data(data, 'next_match_rating')
# X_train, X_val, X_test, y_train, y_val, y_test = preprocessor.split_data_temporal(X, y, data)

## 2. Baseline Models

In [None]:
class BaselineModels:
    """
    Baseline regressors used as a reference point for later models.

    Every fitted estimator is stored in ``self.models`` and its validation
    metrics (MSE, MAE, R2) in ``self.scores``, both keyed by display name.
    """

    def __init__(self):
        self.models = {}
        self.scores = {}

    def _fit_and_report(self, candidates, X_train, y_train, X_val, y_val):
        """
        Fit each candidate, score it on the validation set and record it.

        ``candidates`` maps a display name to an unfitted estimator.
        """
        for name in candidates:
            model = candidates[name]
            model.fit(X_train, y_train)
            val_predictions = model.predict(X_val)

            mse = mean_squared_error(y_val, val_predictions)
            mae = mean_absolute_error(y_val, val_predictions)
            r2 = r2_score(y_val, val_predictions)

            self.models[name] = model
            self.scores[name] = {'MSE': mse, 'MAE': mae, 'R2': r2}

            print(f"{name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

    def train_linear_models(self, X_train, y_train, X_val, y_val):
        """
        Fit plain, Ridge and Lasso linear regressions and report their scores.
        """
        linear_candidates = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(alpha=1.0),
            'Lasso Regression': Lasso(alpha=0.1)
        }
        self._fit_and_report(linear_candidates, X_train, y_train, X_val, y_val)

    def train_tree_models(self, X_train, y_train, X_val, y_val):
        """
        Fit Random Forest and Gradient Boosting regressors and report their scores.
        """
        tree_candidates = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
        }
        self._fit_and_report(tree_candidates, X_train, y_train, X_val, y_val)

# TODO: Train baseline models
# baseline = BaselineModels()
# baseline.train_linear_models(X_train, y_train, X_val, y_val)
# baseline.train_tree_models(X_train, y_train, X_val, y_val)

## 3. Advanced Models - XGBoost

In [None]:
class XGBoostModel:
    """
    XGBoost regressor wrapper with early stopping and importance helpers.

    ``self.model`` holds the fitted ``xgb.XGBRegressor`` after ``train``.
    ``self.best_params`` is initialised to ``None`` and is not set by any
    method of this class.
    """

    def __init__(self):
        self.model = None
        self.best_params = None

    def train(self, X_train, y_train, X_val, y_val, params=None):
        """
        Train the XGBoost model with early stopping on the validation set.

        Args:
            X_train, y_train: Training features and target.
            X_val, y_val: Validation data monitored for early stopping.
            params: Optional ``XGBRegressor`` keyword arguments. When it
                contains ``early_stopping_rounds`` that value is used,
                otherwise 50 rounds are applied. The dict is not modified.

        Returns:
            The fitted ``xgb.XGBRegressor``.
        """
        if params is None:
            params = {
                'n_estimators': 1000,
                'max_depth': 6,
                'learning_rate': 0.1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'random_state': 42
            }

        # Work on a copy so the caller's dict is left untouched.
        model_params = dict(params)
        # Early stopping is configured on the estimator: passing
        # early_stopping_rounds to fit() was deprecated in XGBoost 1.6 and
        # removed in 2.0, where it raises a TypeError.
        model_params.setdefault('early_stopping_rounds', 50)

        self.model = xgb.XGBRegressor(**model_params)

        self.model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        return self.model

    def predict(self, X):
        """
        Return predictions of the trained model for ``X``.
        """
        return self.model.predict(X)

    def get_feature_importance(self, feature_names):
        """
        Return feature importances as a DataFrame sorted in descending order.

        Args:
            feature_names: Names matching the model's features, in order.

        Returns:
            DataFrame with columns ``feature`` and ``importance``.
        """
        importance = self.model.feature_importances_
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)

        return feature_importance

    def plot_feature_importance(self, feature_names, top_n=20):
        """
        Plot the ``top_n`` most important features as a horizontal bar chart.

        Returns:
            The full (not truncated) importance DataFrame.
        """
        feature_importance = self.get_feature_importance(feature_names)

        plt.figure(figsize=(10, 8))
        sns.barplot(data=feature_importance.head(top_n),
                   x='importance', y='feature')
        plt.title('XGBoost Feature Importance')
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.show()

        return feature_importance

# TODO: Train XGBoost model
# xgb_model = XGBoostModel()
# xgb_model.train(X_train, y_train, X_val, y_val)
# xgb_predictions = xgb_model.predict(X_test)
# xgb_importance = xgb_model.plot_feature_importance(feature_cols)

## 4. Neural Network Models

In [None]:
class NeuralNetworkModel:
    """
    Feed-forward Keras network for player performance prediction.

    ``self.model`` holds the compiled network after ``build_model`` and
    ``self.history`` the Keras ``History`` object after ``train``.
    """

    def __init__(self, input_dim):
        self.input_dim = input_dim
        self.model = None
        self.history = None

    def build_model(self, hidden_layers=(128, 64, 32), dropout_rate=0.3):
        """
        Build and compile the network.

        Args:
            hidden_layers: Sequence with the unit count of each hidden Dense
                layer; every hidden layer is followed by Dropout. An empty
                sequence yields a single linear output layer.
            dropout_rate: Dropout rate applied after each hidden layer.

        Returns:
            The compiled ``keras.Sequential`` model (also stored on
            ``self.model``). Loss is MSE, the tracked metric is MAE.
        """
        model = keras.Sequential()

        # Declare the input explicitly instead of passing input_dim to the
        # first Dense layer; all hidden layers can then be built uniformly.
        model.add(keras.Input(shape=(self.input_dim,)))

        # Hidden layers
        for units in hidden_layers:
            model.add(keras.layers.Dense(units, activation='relu'))
            model.add(keras.layers.Dropout(dropout_rate))

        # Output layer
        model.add(keras.layers.Dense(1, activation='linear'))

        # Compile model
        model.compile(optimizer='adam',
                     loss='mse',
                     metrics=['mae'])

        self.model = model
        return model

    def train(self, X_train, y_train, X_val, y_val, epochs=100, batch_size=32):
        """
        Train the network with early stopping and learning-rate reduction.

        If ``build_model`` has not been called yet, the default architecture
        is built first.

        Returns:
            The Keras ``History`` object (also stored on ``self.history``).
        """
        if self.model is None:
            self.build_model()

        # Stop once validation loss stalls and keep the best weights seen.
        early_stopping = keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=15, restore_best_weights=True
        )

        # Halve the learning rate when validation loss plateaus.
        reduce_lr = keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6
        )

        # Train model
        self.history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        return self.history

    def predict(self, X):
        """
        Return predictions for ``X`` flattened to a 1-D array.
        """
        return self.model.predict(X).flatten()

    def plot_training_history(self):
        """
        Plot training vs validation loss and MAE per epoch, side by side.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Loss
        ax1.plot(self.history.history['loss'], label='Training Loss')
        ax1.plot(self.history.history['val_loss'], label='Validation Loss')
        ax1.set_title('Model Loss')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.legend()

        # MAE
        ax2.plot(self.history.history['mae'], label='Training MAE')
        ax2.plot(self.history.history['val_mae'], label='Validation MAE')
        ax2.set_title('Model MAE')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('MAE')
        ax2.legend()

        plt.tight_layout()
        plt.show()

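# TODO: Train neural network
# The *_scaled arrays are not created by the earlier examples; produce them first:
# X_train_scaled, X_val_scaled, X_test_scaled = preprocessor.scale_features(X_train, X_val, X_test)
# nn_model = NeuralNetworkModel(input_dim=X_train.shape[1])
# nn_model.build_model()
# nn_history = nn_model.train(X_train_scaled, y_train, X_val_scaled, y_val)
# nn_predictions = nn_model.predict(X_test_scaled)
# nn_model.plot_training_history()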

## 5. Time Series Models

In [None]:
class TimeSeriesModels:
    """
    Time series models for player performance forecasting.

    ``self.prophet_models`` maps a player identifier to its fitted Prophet
    model; ``self.lstm_model`` holds the compiled LSTM after
    ``build_lstm_model``.
    """

    def __init__(self):
        self.prophet_models = {}
        self.lstm_model = None

    def train_prophet_per_player(self, df, player_col='player_id',
                                date_col='date', target_col='rating',
                                max_players=10, min_observations=10):
        """
        Train one Prophet model per player.

        Args:
            df: Long-format DataFrame with one row per player and date.
            player_col: Column identifying the player.
            date_col: Column with the observation date.
            target_col: Column with the value to forecast.
            max_players: Only the first ``max_players`` distinct players (in
                order of appearance) are considered; ``None`` means all.
                The default of 10 keeps the demo fast.
            min_observations: Players with fewer rows than this are skipped.

        Returns:
            ``self.prophet_models``, updated with the newly fitted models.
        """
        players = df[player_col].unique()

        for player in players[:max_players]:
            player_data = df[df[player_col] == player].copy()

            if len(player_data) < min_observations:  # Need minimum data points
                continue

            # Prophet expects the columns to be named 'ds' (date) and 'y' (value).
            prophet_data = player_data[[date_col, target_col]].copy()
            prophet_data.columns = ['ds', 'y']
            prophet_data['ds'] = pd.to_datetime(prophet_data['ds'])

            # Train Prophet model
            model = Prophet(
                daily_seasonality=False,
                weekly_seasonality=True,
                yearly_seasonality=False,
                changepoint_prior_scale=0.05
            )

            model.fit(prophet_data)
            self.prophet_models[player] = model

            print(f"Trained Prophet model for player {player}")

        return self.prophet_models

    def predict_prophet(self, player_id, future_dates):
        """
        Forecast with a player's Prophet model.

        Args:
            player_id: Key used when the model was trained.
            future_dates: Dates to forecast; converted with ``pd.to_datetime``.

        Returns:
            DataFrame with columns ``ds``, ``yhat``, ``yhat_lower`` and
            ``yhat_upper``, or ``None`` when no model exists for the player.
        """
        if player_id not in self.prophet_models:
            return None

        model = self.prophet_models[player_id]

        # Create future dataframe
        future_df = pd.DataFrame({'ds': pd.to_datetime(future_dates)})

        # Make predictions
        forecast = model.predict(future_df)

        return forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

    def create_lstm_sequences(self, data, sequence_length=10):
        """
        Create sliding-window sequences for LSTM training.

        Each sample is ``sequence_length`` consecutive items of ``data`` and
        its label is the item that immediately follows the window.

        Returns:
            tuple: ``(X, y)`` as numpy arrays with
            ``len(data) - sequence_length`` samples (none when ``data`` is
            not longer than the window).
        """
        # NOTE(review): for 1-D input X is 2-D (samples, sequence_length);
        # presumably the caller adds the trailing feature axis expected by
        # the LSTM input shape - confirm.
        X, y = [], []

        for i in range(len(data) - sequence_length):
            X.append(data[i:(i + sequence_length)])
            y.append(data[i + sequence_length])

        return np.array(X), np.array(y)

    def build_lstm_model(self, input_shape):
        """
        Build and compile a two-layer LSTM model for time series prediction.

        Args:
            input_shape: Shape of one input sample, passed to ``keras.Input``.

        Returns:
            The compiled ``keras.Sequential`` model (also stored on
            ``self.lstm_model``). Loss is MSE, the tracked metric is MAE.
        """
        model = keras.Sequential([
            # Explicit Input layer instead of input_shape on the first LSTM.
            keras.Input(shape=input_shape),
            keras.layers.LSTM(50, return_sequences=True),
            keras.layers.Dropout(0.2),
            keras.layers.LSTM(50, return_sequences=False),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(25),
            keras.layers.Dense(1)
        ])

        model.compile(optimizer='adam', loss='mse', metrics=['mae'])

        self.lstm_model = model
        return model

# TODO: Train time series models
# ts_models = TimeSeriesModels()
# prophet_models = ts_models.train_prophet_per_player(player_time_series_data)
# 
# # For LSTM
# sequence_length = 10
# X_lstm, y_lstm = ts_models.create_lstm_sequences(player_ratings, sequence_length)
# lstm_model = ts_models.build_lstm_model((sequence_length, 1))

## 6. Model Evaluation and Comparison

In [None]:
def evaluate_models(models, X_test, y_test, model_names):
    """
    Score several models on the same test set and tabulate the results.

    Each entry of ``models`` is either an object with a ``predict`` method
    or a plain callable taking ``X_test``. Models and names are paired
    positionally. One summary line per model is printed.

    Returns:
        DataFrame with one row per model and the columns ``Model``,
        ``MSE``, ``MAE``, ``R2`` and ``RMSE``.
    """
    rows = []

    for model, name in zip(models, model_names):
        # Estimator objects expose .predict; anything else is called directly.
        predictor = model.predict if hasattr(model, 'predict') else model
        y_pred = predictor(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        row = {'Model': name, 'MSE': mse, 'MAE': mae, 'R2': r2}
        row['RMSE'] = np.sqrt(mse)
        rows.append(row)

        print(f"{name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

    return pd.DataFrame(rows)

def plot_model_comparison(results_df):
    """
    Draw a 2x2 grid of bar charts, one per metric, comparing the models.

    ``results_df`` needs a ``Model`` column plus ``MSE``, ``MAE``, ``R2``
    and ``RMSE`` columns. The figure is shown; nothing is returned.
    """
    _fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # axes.flat walks the grid row by row: MSE, MAE on top, R2, RMSE below.
    for ax, metric in zip(axes.flat, ('MSE', 'MAE', 'R2', 'RMSE')):
        sns.barplot(data=results_df, x='Model', y=metric, ax=ax)
        ax.set_title(f'{metric} Comparison')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

def plot_predictions_vs_actual(y_true, y_pred, title="Predictions vs Actual"):
    """
    Scatter predicted against actual values with a dashed identity line.

    ``y_true`` must provide ``.min()`` and ``.max()``; the red reference
    line spans that range. The figure is shown; nothing is returned.
    """
    lowest = y_true.min()
    highest = y_true.max()
    diagonal = [lowest, highest]

    plt.figure(figsize=(10, 8))
    plt.scatter(y_true, y_pred, alpha=0.5)
    # Points on this line are perfect predictions.
    plt.plot(diagonal, diagonal, 'r--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.show()

# TODO: Evaluate all models
# all_models = [baseline.models['Random Forest'], xgb_model.model, nn_model.model]
# model_names = ['Random Forest', 'XGBoost', 'Neural Network']
# results = evaluate_models(all_models, X_test, y_test, model_names)
# plot_model_comparison(results)

## 7. Ensemble Methods

In [None]:
class EnsembleModel:
    """
    Weighted-average ensemble over several fitted models.

    Each model is either an object with a ``predict`` method or a plain
    callable taking the feature matrix. ``self.weights`` holds one weight
    per model, in the same order as ``self.models``.
    """

    def __init__(self, models, weights=None):
        """
        Args:
            models: Sequence of fitted models or prediction callables.
            weights: Optional sequence (list or numpy array) with one weight
                per model. ``None`` or an empty sequence means equal weights.
        """
        self.models = models
        # Explicit None/length check: `if weights` raises for numpy arrays,
        # which is what optimize_weights() returns.
        if weights is None or len(weights) == 0:
            weights = [1/len(models)] * len(models)
        self.weights = weights

    def _collect_predictions(self, X):
        """
        Return one prediction array per model for ``X``.

        Column vectors of shape (n, 1) are reduced to shape (n,) so outputs
        of different libraries can be averaged together.
        """
        predictions = []
        for model in self.models:
            predictor = model.predict if hasattr(model, 'predict') else model
            pred = np.asarray(predictor(X))
            if pred.ndim == 2 and pred.shape[1] == 1:
                pred = pred[:, 0]
            predictions.append(pred)
        return predictions

    def predict(self, X):
        """
        Return the weighted average of all model predictions for ``X``.
        """
        predictions = self._collect_predictions(X)

        # Weighted average across models (axis 0 indexes the models).
        ensemble_pred = np.average(predictions, axis=0, weights=self.weights)

        return ensemble_pred

    def optimize_weights(self, X_val, y_val):
        """
        Find the weights that minimise validation MSE and store them.

        Weights are constrained to lie in [0, 1] and to sum to 1; the
        search (SLSQP) starts from equal weights.

        Returns:
            The optimised weights as a numpy array (also stored on
            ``self.weights``).
        """
        from scipy.optimize import minimize

        # The models do not change during the search, so predict only once
        # instead of on every objective evaluation.
        predictions = self._collect_predictions(X_val)

        def objective(weights):
            # Normalize weights
            weights = weights / np.sum(weights)

            ensemble_pred = np.average(predictions, axis=0, weights=weights)
            mse = mean_squared_error(y_val, ensemble_pred)

            return mse

        # Initial weights
        initial_weights = np.array([1/len(self.models)] * len(self.models))

        # Constraints (weights sum to 1)
        constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
        bounds = [(0, 1) for _ in range(len(self.models))]

        # Optimize
        result = minimize(objective, initial_weights, method='SLSQP',
                         bounds=bounds, constraints=constraints)

        self.weights = result.x
        print(f"Optimized weights: {self.weights}")

        return self.weights

# TODO: Create ensemble model
# ensemble = EnsembleModel([baseline.models['Random Forest'], xgb_model.model, nn_model.model])
# optimized_weights = ensemble.optimize_weights(X_val, y_val)
# ensemble_predictions = ensemble.predict(X_test)