# Feature Engineering for Football Analytics

This notebook focuses on creating position-specific features and advanced metrics for player performance prediction.

## Contents
1. Position-Specific Feature Creation
2. Form Indicators and Rolling Statistics
3. Opposition Difficulty Ratings
4. Fatigue and Load Management Metrics
5. Home Advantage and Context Features
6. Feature Selection and Importance

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Position-Specific Feature Engineering

In [3]:
# Load historical FPL data for feature engineering
import sys
import sqlite3
from pathlib import Path

sys.path.append('../../')

def load_historical_data():
    """
    Load 2024/25 historical data for 2025/26 season prediction model training
    """
    print("🔄 Loading historical FPL data for feature engineering...")
    
    db_path = '../../data/fpl_data.db'
    
    if not Path(db_path).exists():
        raise FileNotFoundError(f"Database not found: {db_path}")
    
    conn = sqlite3.connect(db_path)
    
    # Load gameweek performance data
    players_df = pd.read_sql_query("""
        SELECT * FROM player_gameweeks 
        ORDER BY gameweek, player_id
    """, conn)
    
    # Load player metadata
    players_meta = pd.read_sql_query("""
        SELECT * FROM players_current
    """, conn)
    
    conn.close()
    
    print(f"✅ Loaded {len(players_df)} performance records")
    print(f"✅ Loaded {len(players_meta)} player profiles")
    print(f"📊 Gameweeks: {sorted(players_df['gameweek'].unique())}")
    print(f"🎯 Players: {players_df['player_id'].nunique()}")
    
    return players_df, players_meta

# Load the data
players_df, players_meta = load_historical_data()

# Show basic data structure
print(f"\n📋 Performance Data Structure:")
print(f"Columns: {list(players_df.columns)}")
print(f"\n📋 Sample Records:")
print(players_df[['web_name', 'element_type', 'gameweek', 'total_points', 'minutes']].head())

class PositionFeatureEngineer:
    """
    Create position-specific features for football players using real FPL data
    """
    
    def __init__(self):
        self.position_mapping = {1: 'GKP', 2: 'DEF', 3: 'MID', 4: 'FWD'}
        
        # Position-specific feature weights and importance
        self.position_weights = {
            'GKP': {
                'saves': 1.0,
                'clean_sheets': 1.0,
                'goals_conceded': -1.0,
                'minutes': 0.8
            },
            'DEF': {
                'tackles': 1.0,
                'interceptions': 1.0,
                'clearances': 0.9,
                'clean_sheets': 0.8,
                'goals_scored': 0.7
            },
            'MID': {
                'key_passes': 1.0,
                'passes_completed': 0.8,
                'assists': 1.0,
                'dribbles_completed': 0.7,
                'goals_scored': 0.9
            },
            'FWD': {
                'goals_scored': 1.0,
                'assists': 0.8,
                'shots_on_target': 0.9,
                'shots': 0.8,
                'minutes': 0.7
            }
        }
    
    def create_goalkeeper_features(self, df):
        """
        Create goalkeeper-specific features
        """
        print("🥅 Engineering goalkeeper features...")
        gk_df = df[df['element_type'] == 1].copy()
        
        if len(gk_df) == 0:
            return pd.DataFrame()
        
        # Save percentage (handles division by zero)
        gk_df['save_percentage'] = np.where(
            (gk_df['saves'] + gk_df['goals_conceded']) > 0,
            gk_df['saves'] / (gk_df['saves'] + gk_df['goals_conceded']),
            0
        )
        
        # Clean sheet probability based on recent form
        gk_df['clean_sheet_rate'] = gk_df.groupby('player_id')['clean_sheets'].transform(
            lambda x: x.rolling(window=3, min_periods=1).mean()
        )
        
        # Goals conceded per 90 minutes
        gk_df['goals_conceded_per_90'] = np.where(
            gk_df['minutes'] > 0,
            (gk_df['goals_conceded'] * 90) / gk_df['minutes'],
            0
        )
        
        # Save points efficiency
        gk_df['save_efficiency'] = np.where(
            gk_df['saves'] > 0,
            gk_df['total_points'] / gk_df['saves'],
            0
        )
        
        print(f"✅ Created features for {len(gk_df)} goalkeeper records")
        return gk_df
    
    def create_defender_features(self, df):
        """
        Create defender-specific features
        """
        print("🛡️ Engineering defender features...")
        def_df = df[df['element_type'] == 2].copy()
        
        if len(def_df) == 0:
            return pd.DataFrame()
        
        # Defensive actions per 90 minutes
        def_df['defensive_actions_per_90'] = np.where(
            def_df['minutes'] > 0,
            ((def_df['tackles'] + def_df['interceptions'] + def_df['clearances']) * 90) / def_df['minutes'],
            0
        )
        
        # Clean sheet contribution
        def_df['clean_sheet_rate'] = def_df.groupby('player_id')['clean_sheets'].transform(
            lambda x: x.rolling(window=4, min_periods=1).mean()
        )
        
        # Aerial duel success rate
        def_df['aerial_success_rate'] = np.where(
            def_df['aerial_duels_attempted'] > 0,
            def_df['aerial_duels_won'] / def_df['aerial_duels_attempted'],
            0
        )
        
        # Attack contribution (goals + assists)
        def_df['attacking_returns'] = def_df['goals_scored'] + def_df['assists']
        
        # Pass completion rate
        def_df['pass_completion_rate'] = np.where(
            def_df['passes_attempted'] > 0,
            def_df['passes_completed'] / def_df['passes_attempted'],
            0
        )
        
        print(f"✅ Created features for {len(def_df)} defender records")
        return def_df
    
    def create_midfielder_features(self, df):
        """
        Create midfielder-specific features
        """
        print("⚽ Engineering midfielder features...")
        mid_df = df[df['element_type'] == 3].copy()
        
        if len(mid_df) == 0:
            return pd.DataFrame()
        
        # Creativity metrics
        mid_df['creativity_index'] = (
            mid_df['key_passes'] * 2 + 
            mid_df['assists'] * 3 + 
            mid_df['dribbles_completed']
        )
        
        # Pass completion and volume
        mid_df['pass_completion_rate'] = np.where(
            mid_df['passes_attempted'] > 0,
            mid_df['passes_completed'] / mid_df['passes_attempted'],
            0
        )
        
        mid_df['passes_per_90'] = np.where(
            mid_df['minutes'] > 0,
            (mid_df['passes_completed'] * 90) / mid_df['minutes'],
            0
        )
        
        # Dribble success rate
        mid_df['dribble_success_rate'] = np.where(
            mid_df['dribbles_attempted'] > 0,
            mid_df['dribbles_completed'] / mid_df['dribbles_attempted'],
            0
        )
        
        # Goal involvement (goals + assists)
        mid_df['goal_involvement'] = mid_df['goals_scored'] + mid_df['assists']
        
        # Shots per 90 (attacking threat)
        mid_df['shots_per_90'] = np.where(
            mid_df['minutes'] > 0,
            (mid_df['shots'] * 90) / mid_df['minutes'],
            0
        )
        
        print(f"✅ Created features for {len(mid_df)} midfielder records")
        return mid_df
    
    def create_forward_features(self, df):
        """
        Create forward-specific features
        """
        print("⚡ Engineering forward features...")
        fwd_df = df[df['element_type'] == 4].copy()
        
        if len(fwd_df) == 0:
            return pd.DataFrame()
        
        # Shot conversion rate
        fwd_df['shot_conversion_rate'] = np.where(
            fwd_df['shots'] > 0,
            fwd_df['goals_scored'] / fwd_df['shots'],
            0
        )
        
        # Shots on target rate
        fwd_df['shots_on_target_rate'] = np.where(
            fwd_df['shots'] > 0,
            fwd_df['shots_on_target'] / fwd_df['shots'],
            0
        )
        
        # Goals per 90 minutes
        fwd_df['goals_per_90'] = np.where(
            fwd_df['minutes'] > 0,
            (fwd_df['goals_scored'] * 90) / fwd_df['minutes'],
            0
        )
        
        # Total attacking returns
        fwd_df['attacking_returns'] = fwd_df['goals_scored'] + fwd_df['assists']
        
        # Minutes per goal (lower is better)
        fwd_df['minutes_per_goal'] = np.where(
            fwd_df['goals_scored'] > 0,
            fwd_df['minutes'] / fwd_df['goals_scored'],
            999  # High value for players with no goals
        )
        
        # Expected vs actual (using shots as proxy for xG)
        fwd_df['finishing_ability'] = np.where(
            fwd_df['shots_on_target'] > 0,
            fwd_df['goals_scored'] / fwd_df['shots_on_target'],
            0
        )
        
        print(f"✅ Created features for {len(fwd_df)} forward records")
        return fwd_df

# Initialize the feature engineer
feature_engineer = PositionFeatureEngineer()

print(f"\n🔧 Position-specific feature engineering initialized!")
print(f"📊 Ready to process {len(players_df)} records across 4 positions")

🔄 Loading historical FPL data for feature engineering...
✅ Loaded 2960 performance records
✅ Loaded 740 player profiles
📊 Gameweeks: [np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
🎯 Players: 740

📋 Performance Data Structure:
Columns: ['gameweek', 'player_id', 'web_name', 'element_type', 'team_id', 'total_points', 'goals_scored', 'assists', 'saves', 'clean_sheets', 'minutes', 'now_cost', 'selected_by_percent', 'transfers_in', 'transfers_out', 'bonus', 'yellow_cards', 'red_cards', 'shots', 'shots_on_target', 'key_passes', 'tackles', 'interceptions', 'clearances', 'aerial_duels_won', 'aerial_duels_attempted', 'dribbles_completed', 'dribbles_attempted', 'passes_completed', 'passes_attempted', 'penalties_scored', 'penalties_attempted', 'penalties_saved', 'penalties_faced', 'goals_conceded', 'team', 'form']

📋 Sample Records:
       web_name  element_type  gameweek  total_points  minutes
0          Raya             1         1             5       90
1  Arrizabalaga             1      

## 2. Form Indicators and Rolling Statistics

In [5]:
# Create position-specific features for all player types
print("🔧 Creating position-specific features...")

# Create features for each position
gk_features = feature_engineer.create_goalkeeper_features(players_df)
def_features = feature_engineer.create_defender_features(players_df)
mid_features = feature_engineer.create_midfielder_features(players_df)
fwd_features = feature_engineer.create_forward_features(players_df)

# Combine all position-specific features
enhanced_df = pd.concat([gk_features, def_features, mid_features, fwd_features], ignore_index=True)

print(f"\n📊 Position-specific feature creation complete!")
print(f"✅ Total enhanced records: {len(enhanced_df)}")
print(f"📋 New features added per position:")

# Show sample of new features for each position
positions = {1: 'GKP', 2: 'DEF', 3: 'MID', 4: 'FWD'}

for pos_code, pos_name in positions.items():
    pos_data = enhanced_df[enhanced_df['element_type'] == pos_code]
    if len(pos_data) > 0:
        # Get columns that aren't in the original dataframe (new features)
        original_cols = set(players_df.columns)
        new_cols = [col for col in pos_data.columns if col not in original_cols]
        
        print(f"\n{pos_name} ({len(pos_data)} records):")
        print(f"   New features: {new_cols}")
        
        if len(new_cols) > 0:
            # Show sample values for the first few new features
            sample_player = pos_data.iloc[0]
            for col in new_cols[:3]:  # Show first 3 new features
                value = sample_player[col]
                print(f"   • {col}: {value:.3f}")

print(f"\n✅ Ready for next step: Form and momentum indicators!")

🔧 Creating position-specific features...
🥅 Engineering goalkeeper features...
✅ Created features for 344 goalkeeper records
🛡️ Engineering defender features...
✅ Created features for 980 defender records
⚽ Engineering midfielder features...
✅ Created features for 1312 midfielder records
⚡ Engineering forward features...
✅ Created features for 324 forward records

📊 Position-specific feature creation complete!
✅ Total enhanced records: 2960
📋 New features added per position:

GKP (344 records):
   New features: ['save_percentage', 'clean_sheet_rate', 'goals_conceded_per_90', 'save_efficiency', 'defensive_actions_per_90', 'aerial_success_rate', 'attacking_returns', 'pass_completion_rate', 'creativity_index', 'passes_per_90', 'dribble_success_rate', 'goal_involvement', 'shots_per_90', 'shot_conversion_rate', 'shots_on_target_rate', 'goals_per_90', 'minutes_per_goal', 'finishing_ability']
   • save_percentage: 1.000
   • clean_sheet_rate: 0.000
   • goals_conceded_per_90: 0.000

DEF (980 r

## 3. Opposition Difficulty Rating

In [7]:
# Create form and momentum indicators
print("📈 Creating form and momentum indicators...")

def create_form_features(df):
    """
    Create rolling form and momentum features for FPL players
    """
    form_df = df.copy()
    
    # Sort by player and gameweek to ensure proper ordering
    form_df = form_df.sort_values(['player_id', 'gameweek'])
    
    print("🔄 Calculating rolling statistics...")
    
    # Rolling averages for different windows
    windows = [3, 4, 6]  # 3, 4, and 6 gameweek windows
    
    for window in windows:
        # Points form
        form_df[f'points_form_{window}gw'] = form_df.groupby('player_id')['total_points'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        
        # Minutes consistency
        form_df[f'minutes_avg_{window}gw'] = form_df.groupby('player_id')['minutes'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        
        # Goals form (for attacking players)
        form_df[f'goals_form_{window}gw'] = form_df.groupby('player_id')['goals_scored'].transform(
            lambda x: x.rolling(window=window, min_periods=1).sum()
        )
        
        # Assists form
        form_df[f'assists_form_{window}gw'] = form_df.groupby('player_id')['assists'].transform(
            lambda x: x.rolling(window=window, min_periods=1).sum()
        )
    
    print("📊 Calculating consistency metrics...")
    
    # Consistency scores (standard deviation - lower is more consistent)
    form_df['points_consistency'] = form_df.groupby('player_id')['total_points'].transform(
        lambda x: x.rolling(window=4, min_periods=2).std().fillna(0)
    )
    
    form_df['minutes_consistency'] = form_df.groupby('player_id')['minutes'].transform(
        lambda x: x.rolling(window=4, min_periods=2).std().fillna(0)
    )
    
    print("🚀 Calculating momentum indicators...")
    
    # Momentum indicators (trend over recent games)
    form_df['points_momentum'] = form_df.groupby('player_id')['total_points'].transform(
        lambda x: x.rolling(window=3, min_periods=2).apply(
            lambda y: np.polyfit(range(len(y)), y, 1)[0] if len(y) >= 2 else 0
        ).fillna(0)
    )
    
    # Recent vs historical performance
    form_df['recent_vs_avg'] = (
        form_df['points_form_3gw'] / (form_df.groupby('player_id')['total_points'].transform('mean') + 0.01)
    )
    
    # Clean sheet momentum for defenders and goalkeepers
    def_gk_mask = form_df['element_type'].isin([1, 2])
    form_df.loc[def_gk_mask, 'clean_sheet_momentum'] = form_df.loc[def_gk_mask].groupby('player_id')['clean_sheets'].transform(
        lambda x: x.rolling(window=3, min_periods=1).sum()
    )
    
    # Attacking momentum for midfielders and forwards
    att_mask = form_df['element_type'].isin([3, 4])
    form_df.loc[att_mask, 'attacking_momentum'] = form_df.loc[att_mask].groupby('player_id').apply(
        lambda group: (group['goals_scored'] + group['assists']).rolling(window=3, min_periods=1).sum()
    ).values
    
    print("💰 Calculating value metrics...")
    
    # Value metrics
    form_df['points_per_million'] = form_df['total_points'] / form_df['now_cost']
    form_df['points_per_minute'] = np.where(
        form_df['minutes'] > 0,
        form_df['total_points'] / form_df['minutes'],
        0
    )
    
    # Form-based value
    form_df['form_value_3gw'] = form_df['points_form_3gw'] / form_df['now_cost']
    
    return form_df

# Apply form feature engineering
enhanced_df = create_form_features(enhanced_df)

print(f"\n✅ Form and momentum features created!")
print(f"📊 Enhanced dataset now has {len(enhanced_df.columns)} columns")

# Show sample of new form features
form_features = [col for col in enhanced_df.columns if any(keyword in col for keyword in ['form', 'momentum', 'consistency', 'value'])]
print(f"\n📋 Form features created: {len(form_features)}")
for feature in form_features[:10]:  # Show first 10
    print(f"   • {feature}")

# Quick validation - show a sample player's form progression
sample_player = enhanced_df[enhanced_df['minutes'] > 0].iloc[0]
print(f"\n📈 Sample player form progression:")
print(f"Player: {sample_player['web_name']} (Position: {sample_player['element_type']})")
print(f"   • 3GW Points Form: {sample_player['points_form_3gw']:.2f}")
print(f"   • Points Momentum: {sample_player['points_momentum']:.2f}")
print(f"   • Points Consistency: {sample_player['points_consistency']:.2f}")
print(f"   • Form Value: {sample_player['form_value_3gw']:.2f}")

📈 Creating form and momentum indicators...
🔄 Calculating rolling statistics...
📊 Calculating consistency metrics...
🚀 Calculating momentum indicators...
💰 Calculating value metrics...

✅ Form and momentum features created!
📊 Enhanced dataset now has 76 columns

📋 Form features created: 16
   • form
   • points_form_3gw
   • goals_form_3gw
   • assists_form_3gw
   • points_form_4gw
   • goals_form_4gw
   • assists_form_4gw
   • points_form_6gw
   • goals_form_6gw
   • assists_form_6gw

📈 Sample player form progression:
Player: Raya (Position: 1)
   • 3GW Points Form: 5.00
   • Points Momentum: 0.00
   • Points Consistency: 0.00
   • Form Value: 0.91


  form_df.loc[att_mask, 'attacking_momentum'] = form_df.loc[att_mask].groupby('player_id').apply(


## 4. Fatigue and Load Management

In [9]:
# Create comprehensive efficiency and advanced metrics
print("⚡ Creating advanced efficiency metrics...")

def create_efficiency_metrics(df):
    """
    Create comprehensive efficiency and advanced performance metrics
    """
    eff_df = df.copy()
    
    print("🎯 Calculating shooting efficiency...")
    
    # Shooting efficiency (for attacking players)
    eff_df['shot_accuracy'] = np.where(
        eff_df['shots'] > 0,
        eff_df['shots_on_target'] / eff_df['shots'],
        0
    )
    
    eff_df['big_chance_conversion'] = np.where(
        eff_df['shots_on_target'] > 0,
        eff_df['goals_scored'] / eff_df['shots_on_target'],
        0
    )
    
    print("🏃 Calculating per-90 minute metrics...")
    
    # Per 90-minute stats (normalized performance)
    stats_to_normalize = ['goals_scored', 'assists', 'shots', 'key_passes', 'tackles', 'interceptions']
    
    for stat in stats_to_normalize:
        if stat in eff_df.columns:
            eff_df[f'{stat}_per_90'] = np.where(
                eff_df['minutes'] > 0,
                (eff_df[stat] * 90) / eff_df['minutes'],
                0
            )
    
    print("🎲 Calculating involvement metrics...")
    
    # Team involvement metrics
    eff_df['goal_involvement'] = eff_df['goals_scored'] + eff_df['assists']
    eff_df['attacking_actions'] = eff_df['shots'] + eff_df['key_passes'] + eff_df['dribbles_completed']
    eff_df['defensive_actions'] = eff_df['tackles'] + eff_df['interceptions'] + eff_df['clearances']
    
    print("📊 Calculating advanced ratios...")
    
    # Advanced performance ratios
    eff_df['pass_accuracy'] = np.where(
        eff_df['passes_attempted'] > 0,
        eff_df['passes_completed'] / eff_df['passes_attempted'],
        0
    )
    
    eff_df['dribble_success_rate'] = np.where(
        eff_df['dribbles_attempted'] > 0,
        eff_df['dribbles_completed'] / eff_df['dribbles_attempted'],
        0
    )
    
    eff_df['aerial_win_rate'] = np.where(
        eff_df['aerial_duels_attempted'] > 0,
        eff_df['aerial_duels_won'] / eff_df['aerial_duels_attempted'],
        0
    )
    
    print("💎 Calculating quality metrics...")
    
    # Quality indicators
    eff_df['minutes_per_point'] = np.where(
        eff_df['total_points'] > 0,
        eff_df['minutes'] / eff_df['total_points'],
        999  # High value for players with no points
    )
    
    # Bonus point efficiency (indicator of impactful performance)
    eff_df['bonus_efficiency'] = np.where(
        eff_df['minutes'] > 0,
        eff_df['bonus'] / (eff_df['minutes'] / 90),
        0
    )
    
    # Price performance ratio
    eff_df['price_performance_ratio'] = eff_df['total_points'] / eff_df['now_cost']
    
    print("🎪 Calculating position-specific efficiency...")
    
    # Position-specific efficiency metrics
    # Goalkeeper efficiency
    gk_mask = eff_df['element_type'] == 1
    eff_df.loc[gk_mask, 'gk_efficiency'] = np.where(
        eff_df.loc[gk_mask, 'minutes'] > 0,
        (eff_df.loc[gk_mask, 'saves'] * 0.3 + 
         eff_df.loc[gk_mask, 'clean_sheets'] * 4 - 
         eff_df.loc[gk_mask, 'goals_conceded'] * 1) / (eff_df.loc[gk_mask, 'minutes'] / 90),
        0
    )
    
    # Defender efficiency  
    def_mask = eff_df['element_type'] == 2
    eff_df.loc[def_mask, 'def_efficiency'] = np.where(
        eff_df.loc[def_mask, 'minutes'] > 0,
        (eff_df.loc[def_mask, 'tackles'] * 0.5 + 
         eff_df.loc[def_mask, 'interceptions'] * 0.5 + 
         eff_df.loc[def_mask, 'clearances'] * 0.3 +
         eff_df.loc[def_mask, 'clean_sheets'] * 2) / (eff_df.loc[def_mask, 'minutes'] / 90),
        0
    )
    
    # Midfielder efficiency
    mid_mask = eff_df['element_type'] == 3
    eff_df.loc[mid_mask, 'mid_efficiency'] = np.where(
        eff_df.loc[mid_mask, 'minutes'] > 0,
        (eff_df.loc[mid_mask, 'key_passes'] * 1.0 + 
         eff_df.loc[mid_mask, 'assists'] * 3 + 
         eff_df.loc[mid_mask, 'goals_scored'] * 4 +
         eff_df.loc[mid_mask, 'dribbles_completed'] * 0.5) / (eff_df.loc[mid_mask, 'minutes'] / 90),
        0
    )
    
    # Forward efficiency
    fwd_mask = eff_df['element_type'] == 4
    eff_df.loc[fwd_mask, 'fwd_efficiency'] = np.where(
        eff_df.loc[fwd_mask, 'minutes'] > 0,
        (eff_df.loc[fwd_mask, 'goals_scored'] * 4 + 
         eff_df.loc[fwd_mask, 'assists'] * 3 + 
         eff_df.loc[fwd_mask, 'shots_on_target'] * 0.5) / (eff_df.loc[fwd_mask, 'minutes'] / 90),
        0
    )
    
    return eff_df

# Apply efficiency metrics
enhanced_df = create_efficiency_metrics(enhanced_df)

print(f"\n✅ Advanced efficiency metrics created!")
print(f"📊 Dataset now has {len(enhanced_df.columns)} total columns")

# Show newly created efficiency features
efficiency_features = [col for col in enhanced_df.columns if any(keyword in col for keyword in 
                      ['efficiency', 'per_90', 'accuracy', 'ratio', 'rate', 'involvement'])]

print(f"\n📋 Efficiency features created: {len(efficiency_features)}")
for feature in efficiency_features[:12]:  # Show first 12
    print(f"   • {feature}")

# Sample efficiency metrics for different positions
positions = {1: 'GKP', 2: 'DEF', 3: 'MID', 4: 'FWD'}
print(f"\n📈 Sample efficiency metrics by position:")

for pos_code, pos_name in positions.items():
    pos_data = enhanced_df[enhanced_df['element_type'] == pos_code]
    if len(pos_data) > 0:
        sample = pos_data[pos_data['minutes'] > 0].iloc[0]
        print(f"\n{pos_name} - {sample['web_name']}:")
        print(f"   • Price Performance: {sample['price_performance_ratio']:.2f}")
        print(f"   • Minutes per Point: {sample['minutes_per_point']:.1f}")
        if pos_code == 1 and 'gk_efficiency' in sample.index:
            print(f"   • GK Efficiency: {sample['gk_efficiency']:.2f}")
        elif pos_code == 2 and 'def_efficiency' in sample.index:
            print(f"   • DEF Efficiency: {sample['def_efficiency']:.2f}")
        elif pos_code == 3 and 'mid_efficiency' in sample.index:
            print(f"   • MID Efficiency: {sample['mid_efficiency']:.2f}")
        elif pos_code == 4 and 'fwd_efficiency' in sample.index:
            print(f"   • FWD Efficiency: {sample['fwd_efficiency']:.2f}")

⚡ Creating advanced efficiency metrics...
🎯 Calculating shooting efficiency...
🏃 Calculating per-90 minute metrics...
🎲 Calculating involvement metrics...
📊 Calculating advanced ratios...
💎 Calculating quality metrics...
🎪 Calculating position-specific efficiency...

✅ Advanced efficiency metrics created!
📊 Dataset now has 94 total columns

📋 Efficiency features created: 27
   • clean_sheet_rate
   • goals_conceded_per_90
   • save_efficiency
   • defensive_actions_per_90
   • aerial_success_rate
   • pass_completion_rate
   • passes_per_90
   • dribble_success_rate
   • goal_involvement
   • shots_per_90
   • shot_conversion_rate
   • shots_on_target_rate

📈 Sample efficiency metrics by position:

GKP - Raya:
   • Price Performance: 0.91
   • Minutes per Point: 18.0
   • GK Efficiency: 0.60

DEF - Gabriel:
   • Price Performance: 0.66
   • Minutes per Point: 22.5
   • DEF Efficiency: 2.50

MID - Saka:
   • Price Performance: 0.10
   • Minutes per Point: 35.0
   • MID Efficiency: 9.00


## 5. Contextual Features

In [11]:
# Feature Selection and Validation for 2025/26 Modeling
print("🔍 Performing feature selection and validation...")

def analyze_feature_importance(df, target_col='total_points'):
    """
    Analyze feature importance for predicting player performance
    """
    print(f"🎯 Analyzing features for predicting: {target_col}")
    
    # Prepare data for feature selection
    feature_df = df.copy()
    
    # Remove non-numeric and identifier columns
    exclude_cols = ['web_name', 'team', 'player_id', 'gameweek']
    numeric_cols = feature_df.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col not in exclude_cols and col != target_col]
    
    print(f"📊 Total features to analyze: {len(feature_cols)}")
    
    # Prepare features and target
    X = feature_df[feature_cols].fillna(0)
    y = feature_df[target_col].fillna(0)
    
    # Remove features with zero variance
    from sklearn.feature_selection import VarianceThreshold
    variance_selector = VarianceThreshold(threshold=0.01)
    X_filtered = variance_selector.fit_transform(X)
    selected_features = [feature_cols[i] for i in range(len(feature_cols)) if variance_selector.variances_[i] > 0.01]
    
    print(f"📈 Features after variance filter: {len(selected_features)}")
    
    # Feature importance using mutual information
    from sklearn.feature_selection import mutual_info_regression
    mi_scores = mutual_info_regression(X_filtered, y, random_state=42)
    mi_results = pd.DataFrame({
        'feature': selected_features,
        'importance': mi_scores
    }).sort_values('importance', ascending=False)
    
    print(f"\n🏆 Top 15 most important features:")
    for i, (_, row) in enumerate(mi_results.head(15).iterrows(), 1):
        print(f"   {i:2d}. {row['feature']:<25} - {row['importance']:.4f}")
    
    return mi_results, selected_features, X_filtered, y

# Analyze feature importance
feature_importance, selected_features, X_processed, y_target = analyze_feature_importance(enhanced_df)

print(f"\n🔬 Feature correlation analysis...")

# Analyze correlation between top features
top_features = feature_importance.head(20)['feature'].tolist()
correlation_matrix = enhanced_df[top_features + ['total_points']].corr()

# Show highest correlations with target
target_correlations = correlation_matrix['total_points'].drop('total_points').sort_values(key=abs, ascending=False)

print(f"📊 Features most correlated with total_points:")
for i, (feature, corr) in enumerate(target_correlations.head(10).items(), 1):
    print(f"   {i:2d}. {feature:<25} - {corr:+.4f}")

print(f"\n🎯 Position-specific feature analysis...")

# Analyze feature importance by position
positions = {1: 'GKP', 2: 'DEF', 3: 'MID', 4: 'FWD'}

position_insights = {}
for pos_code, pos_name in positions.items():
    pos_data = enhanced_df[enhanced_df['element_type'] == pos_code]
    if len(pos_data) > 50:  # Minimum sample size
        pos_importance, _, _, _ = analyze_feature_importance(pos_data)
        position_insights[pos_name] = pos_importance.head(5)
        
        print(f"\n{pos_name} - Top 5 features:")
        for i, (_, row) in enumerate(pos_importance.head(5).iterrows(), 1):
            print(f"   {i}. {row['feature']:<20} - {row['importance']:.4f}")

print(f"\n📊 Feature engineering summary:")
print(f"✅ Original columns: {len(players_df.columns)}")
print(f"✅ Enhanced columns: {len(enhanced_df.columns)}")
print(f"✅ New features created: {len(enhanced_df.columns) - len(players_df.columns)}")

# Feature categories summary
categories = {
    'Position-specific': [col for col in enhanced_df.columns if any(keyword in col for keyword in 
                         ['gk_', 'def_', 'mid_', 'fwd_', 'save_', 'defensive_actions', 'creativity_', 'shot_conversion'])],
    'Form indicators': [col for col in enhanced_df.columns if any(keyword in col for keyword in 
                       ['form', 'momentum', 'consistency'])],
    'Efficiency metrics': [col for col in enhanced_df.columns if any(keyword in col for keyword in 
                          ['per_90', 'efficiency', 'rate', 'accuracy', 'ratio'])],
    'Value metrics': [col for col in enhanced_df.columns if any(keyword in col for keyword in 
                     ['per_million', 'value', 'price_performance'])]
}

print(f"\n📋 Feature categories created:")
for category, features in categories.items():
    print(f"   • {category}: {len(features)} features")

print(f"\n💾 Saving enhanced dataset...")

# Save the enhanced dataset for modeling
enhanced_df.to_csv('../../data/enhanced_fpl_features.csv', index=False)
print(f"✅ Enhanced dataset saved to: data/enhanced_fpl_features.csv")

print(f"\n🎯 Ready for 2025/26 season modeling!")
print(f"📊 Dataset prepared with {len(enhanced_df)} records and {len(enhanced_df.columns)} features")
print(f"🚀 Next step: Build predictive models using these features")

🔍 Performing feature selection and validation...
🎯 Analyzing features for predicting: total_points
📊 Total features to analyze: 89
📈 Features after variance filter: 82

🏆 Top 15 most important features:
    1. price_performance_ratio   - 1.7294
    2. points_per_million        - 1.7148
    3. points_form_3gw           - 1.3735
    4. points_form_6gw           - 1.3731
    5. points_form_4gw           - 1.3675
    6. form_value_3gw            - 1.2141
    7. minutes_per_point         - 1.0559
    8. recent_vs_avg             - 0.9842
    9. points_per_minute         - 0.8455
   10. points_consistency        - 0.8393
   11. points_momentum           - 0.7580
   12. minutes                   - 0.6439
   13. minutes_avg_3gw           - 0.6413
   14. minutes_avg_4gw           - 0.6334
   15. minutes_avg_6gw           - 0.6256

🔬 Feature correlation analysis...
📊 Features most correlated with total_points:
    1. points_form_3gw           - +0.9891
    2. points_form_6gw           - +0.9850


## 6. Feature Selection and Importance

In [12]:
def select_important_features(X, y, method='mutual_info', k=20):
    """
    Select most important features for prediction
    """
    if method == 'mutual_info':
        selector = SelectKBest(score_func=mutual_info_regression, k=k)
    else:
        selector = SelectKBest(score_func=f_regression, k=k)
    
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    feature_scores = selector.scores_[selector.get_support()]
    
    return X_selected, selected_features, feature_scores

def plot_feature_importance(features, scores, title="Feature Importance"):
    """
    Plot feature importance scores
    """
    feature_df = pd.DataFrame({
        'feature': features,
        'importance': scores
    }).sort_values('importance', ascending=True)
    
    plt.figure(figsize=(10, 8))
    plt.barh(feature_df['feature'], feature_df['importance'])
    plt.title(title)
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()
    
    return feature_df

# TODO: Apply feature selection
# numeric_features = player_data.select_dtypes(include=[np.number]).columns
# X = player_data[numeric_features].fillna(0)
# y = player_data['next_match_rating']  # Target variable
# 
# X_selected, selected_features, scores = select_important_features(X, y)
# feature_importance_df = plot_feature_importance(selected_features, scores)