In [42]:
import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
import joblib

def create_db_connection():
    username = "root"
    password = "Sp1d3rman"
    host = "localhost"
    port = "3306"
    database = "nhl_optimizer"
    return create_engine(f"mysql+pymysql://{username}:{password}@{host}:{port}/{database}")

def load_data():
    engine = create_db_connection()
    query = "SELECT * FROM player_data"
    return pd.read_sql(query, engine)

def engineer_features(df):
    """Create advanced features with position-specific ratios and interactions"""
    df_eng = df.copy()
    
    # Basic efficiency metrics
    df_eng['Shot_Quality'] = df_eng['ixG/60'] / df_eng['Shots/60'].clip(lower=0.1)
    df_eng['High_Danger_Ratio'] = df_eng['iHDCF/60'] / df_eng['iFF/60'].clip(lower=0.1)
    df_eng['Scoring_Chances_per_Shot'] = df_eng['iSCF/60'] / df_eng['Shots/60'].clip(lower=0.1)
    
    # Position-specific metrics
    for pos in df_eng['Position'].unique():
        pos_mask = df_eng['Position'] == pos
        pos_mean_shots = df_eng.loc[pos_mask, 'Shots/60'].mean()
        pos_mean_ixg = df_eng.loc[pos_mask, 'ixG/60'].mean()
        
        # Relative to position metrics
        df_eng.loc[pos_mask, 'Shots_vs_Pos'] = df_eng.loc[pos_mask, 'Shots/60'] / pos_mean_shots
        df_eng.loc[pos_mask, 'ixG_vs_Pos'] = df_eng.loc[pos_mask, 'ixG/60'] / pos_mean_ixg
    
    # Advanced opportunity metrics
    df_eng['Offensive_Zone_Time'] = df_eng['iFF/60'] / df_eng['TOI/GP']
    df_eng['Rush_to_Cycle_Ratio'] = df_eng['Rush Attempts/60'] / df_eng['iSCF/60'].clip(lower=0.1)
    df_eng['Rebounds_per_Shot'] = df_eng['Rebounds Created/60'] / df_eng['Shots/60'].clip(lower=0.1)
    
    # Physical play and possession
    df_eng['Physical_Impact'] = df_eng['Hits/60'] + df_eng['Shots Blocked/60']
    df_eng['Net_Takeaways'] = df_eng['Takeaways/60'] - df_eng['Giveaways/60']
    df_eng['Possession_Score'] = df_eng['iFF/60'] + (2 * df_eng['Net_Takeaways'])
    
    # Ice time quality metrics
    df_eng['PP_Involvement'] = df_eng['IPP'].fillna(df_eng['IPP'].mean())
    df_eng['TOI_Quality'] = df_eng['TOI/GP'] * df_eng['PP_Involvement']
    
    # Interaction terms between top features
    df_eng['Shot_Quality_IPP'] = df_eng['Shot_Quality'] * df_eng['PP_Involvement']
    df_eng['Danger_IPP'] = df_eng['High_Danger_Ratio'] * df_eng['PP_Involvement']
    
    return df_eng

def prepare_data(df):
    """Prepare data with feature selection"""
    df_processed = df.copy()
    
    # Engineer features first
    df_processed = engineer_features(df_processed)
    
    # Drop non-predictive columns
    columns_to_drop = [
        'Player', 'Team', 'Date',
        'Goals/60', 'Total Points/60', 'Total Assists/60',
        'First Assists/60', 'Second Assists/60',
        'SH%', 'Faceoffs %'
    ]
    
    df_processed = df_processed.drop(columns=columns_to_drop)
    
    # Handle Position
    df_processed['Position'] = df_processed['Position'].astype('category')
    
    # Replace infinities
    df_processed = df_processed.replace([np.inf, -np.inf], np.nan)
    
    return df_processed

def create_model():
    """Create XGBoost model with optimized parameters"""
    return xgb.XGBRegressor(
        n_estimators=1000,  # More trees
        learning_rate=0.01,  # Lower learning rate
        max_depth=6,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1,
        random_state=42,
        early_stopping_rounds=50
    )

def main():
    print("Loading data...")
    df = load_data()
    
    print("\nPreparing data...")
    X = prepare_data(df)
    y = df['Goals/60']
    
    print("\nSplitting data...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define features
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = ['Position']
    
    print(f"\nFeatures:")
    print(f"Numeric: {len(numeric_features)}")
    print(f"Categorical: {len(categorical_features)}")
    
    # Create preprocessing pipeline
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler(quantile_range=(5, 95)))
    ])
    
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False))
    ])
    
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
    
    # Process data
    print("\nPreprocessing data...")
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    
    # Feature selection
    print("\nPerforming feature selection...")
    selector = SelectFromModel(
        xgb.XGBRegressor(n_estimators=100, random_state=42),
        prefit=False,
        threshold='median'
    )
    X_train_selected = selector.fit_transform(X_train_processed, y_train)
    X_test_selected = selector.transform(X_test_processed)
    
    # Train model
    print("\nTraining model...")
    model = create_model()
    
    model.fit(
        X_train_selected, 
        y_train,
        eval_set=[(X_test_selected, y_test)],
        verbose=True
    )
    
    # Predictions
    print("\nMaking predictions...")
    y_pred = model.predict(X_test_selected)
    
    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\nTest MAE: {mae:.4f}")
    print(f"Test R2: {r2:.4f}")
    
    # Save components
    print("\nSaving model components...")
    joblib.dump(preprocessor, 'goals_preprocessor_v3.pkl')
    joblib.dump(selector, 'goals_selector_v3.pkl')
    joblib.dump(model, 'goals_model_v3.pkl')
    
    # Feature importance
    print("\nCalculating feature importance...")
    feature_names = (numeric_features + 
                    [f"{categorical_features[0]}_{cat}" for cat in 
                     preprocessor.named_transformers_['cat'].named_steps['encoder'].categories_[0][1:]])
    
    selected_mask = selector.get_support()
    selected_features = np.array(feature_names)[selected_mask]
    importance_dict = dict(zip(selected_features, model.feature_importances_))
    sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
    
    print("\nTop 20 Most Important Features:")
    for feature, importance in sorted_importance[:20]:
        print(f"{feature}: {importance:.4f}")

if __name__ == "__main__":
    main()

Loading data...

Preparing data...

Splitting data...

Features:
Numeric: 40
Categorical: 1

Preprocessing data...

Performing feature selection...

Training model...
[0]	validation_0-rmse:1.44360
[1]	validation_0-rmse:1.43655
[2]	validation_0-rmse:1.42965
[3]	validation_0-rmse:1.42390
[4]	validation_0-rmse:1.41714
[5]	validation_0-rmse:1.41053
[6]	validation_0-rmse:1.40420
[7]	validation_0-rmse:1.39795
[8]	validation_0-rmse:1.39237
[9]	validation_0-rmse:1.38602
[10]	validation_0-rmse:1.37959
[11]	validation_0-rmse:1.37422
[12]	validation_0-rmse:1.36841
[13]	validation_0-rmse:1.36330
[14]	validation_0-rmse:1.35741
[15]	validation_0-rmse:1.35155
[16]	validation_0-rmse:1.34665
[17]	validation_0-rmse:1.34168
[18]	validation_0-rmse:1.33632
[19]	validation_0-rmse:1.33094
[20]	validation_0-rmse:1.32557
[21]	validation_0-rmse:1.32030
[22]	validation_0-rmse:1.31495
[23]	validation_0-rmse:1.30955
[24]	validation_0-rmse:1.30512
[25]	validation_0-rmse:1.30018
[26]	validation_0-rmse:1.29541
[27]	v

In [46]:
import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, RegressorMixin
import joblib

def create_db_connection():
    username = "root"
    password = "Sp1d3rman"
    host = "localhost"
    port = "3306"
    database = "nhl_optimizer"
    return create_engine(f"mysql+pymysql://{username}:{password}@{host}:{port}/{database}")

def load_data():
    engine = create_db_connection()
    query = "SELECT * FROM player_data"
    return pd.read_sql(query, engine)

def engineer_features(df):
    """Create advanced features with position-specific ratios and interactions"""
    df_eng = df.copy()
    
    # Basic efficiency metrics
    df_eng['Shot_Quality'] = df_eng['ixG/60'] / df_eng['Shots/60'].clip(lower=0.1)
    df_eng['High_Danger_Ratio'] = df_eng['iHDCF/60'] / df_eng['iFF/60'].clip(lower=0.1)
    df_eng['Scoring_Chances_per_Shot'] = df_eng['iSCF/60'] / df_eng['Shots/60'].clip(lower=0.1)
    
    # Position-specific metrics
    for pos in df_eng['Position'].unique():
        pos_mask = df_eng['Position'] == pos
        pos_mean_shots = df_eng.loc[pos_mask, 'Shots/60'].mean()
        pos_mean_ixg = df_eng.loc[pos_mask, 'ixG/60'].mean()
        
        # Relative to position metrics
        df_eng.loc[pos_mask, 'Shots_vs_Pos'] = df_eng.loc[pos_mask, 'Shots/60'] / pos_mean_shots
        df_eng.loc[pos_mask, 'ixG_vs_Pos'] = df_eng.loc[pos_mask, 'ixG/60'] / pos_mean_ixg
    
    # Advanced opportunity metrics
    df_eng['Offensive_Zone_Time'] = df_eng['iFF/60'] / df_eng['TOI/GP']
    df_eng['Rush_to_Cycle_Ratio'] = df_eng['Rush Attempts/60'] / df_eng['iSCF/60'].clip(lower=0.1)
    df_eng['Rebounds_per_Shot'] = df_eng['Rebounds Created/60'] / df_eng['Shots/60'].clip(lower=0.1)
    
    # Physical play and possession
    df_eng['Physical_Impact'] = df_eng['Hits/60'] + df_eng['Shots Blocked/60']
    df_eng['Net_Takeaways'] = df_eng['Takeaways/60'] - df_eng['Giveaways/60']
    df_eng['Possession_Score'] = df_eng['iFF/60'] + (2 * df_eng['Net_Takeaways'])
    
    # Ice time quality metrics
    df_eng['PP_Involvement'] = df_eng['IPP'].fillna(df_eng['IPP'].mean())
    df_eng['TOI_Quality'] = df_eng['TOI/GP'] * df_eng['PP_Involvement']
    
    return df_eng

def prepare_data(df):
    """Prepare data with enhanced feature engineering"""
    df_processed = df.copy()
    
    # Engineer features first
    df_processed = engineer_features(df_processed)
    
    # Drop non-predictive columns
    columns_to_drop = [
        'Player', 'Team', 'Date',
        'Goals/60', 'Total Points/60', 'Total Assists/60',
        'First Assists/60', 'Second Assists/60',
        'SH%', 'Faceoffs %'
    ]
    
    df_processed = df_processed.drop(columns=columns_to_drop)
    
    # Replace infinities
    df_processed = df_processed.replace([np.inf, -np.inf], np.nan)
    
    return df_processed

def create_preprocessor(numeric_features, categorical_features):
    """Create preprocessing pipeline"""
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler(quantile_range=(5, 95)))
    ])
    
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False))
    ])
    
    return ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

def create_base_model():
    """Create XGBoost model with optimized parameters"""
    return xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=6,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1,
        random_state=42
    )

class StackedHockeyModel(BaseEstimator, RegressorMixin):
    """Stacked model combining specialized predictors for different player types"""
    def __init__(self, ipp_threshold=50):
        self.ipp_threshold = ipp_threshold
        self.high_ipp_model = None
        self.low_ipp_model = None
        self.preprocessor = None
        self.numeric_features = None
        self.categorical_features = None
    
    def fit(self, X, y):
        # Store feature names
        self.numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        self.categorical_features = ['Position']
        
        # Create preprocessor
        self.preprocessor = create_preprocessor(self.numeric_features, self.categorical_features)
        
        # Transform data
        X_processed = self.preprocessor.fit_transform(X)
        
        # Split data based on IPP
        high_ipp_mask = X['IPP'] >= self.ipp_threshold
        
        # Create and fit models
        self.high_ipp_model = create_base_model()
        self.low_ipp_model = create_base_model()
        
        # Convert X_processed to numpy array if it isn't already
        X_processed = np.asarray(X_processed)
        
        # Fit models using transformed data and boolean indexing
        if high_ipp_mask.any():
            self.high_ipp_model.fit(
                X_processed[high_ipp_mask], 
                y.iloc[high_ipp_mask]
            )
        
        if (~high_ipp_mask).any():
            self.low_ipp_model.fit(
                X_processed[~high_ipp_mask], 
                y.iloc[~high_ipp_mask]
            )
        
        return self
    
    def predict(self, X):
        # Transform data
        X_processed = self.preprocessor.transform(X)
        X_processed = np.asarray(X_processed)
        
        # Split based on IPP
        high_ipp_mask = X['IPP'] >= self.ipp_threshold
        
        # Initialize predictions array
        predictions = np.zeros(len(X))
        
        # Make predictions using boolean indexing
        if high_ipp_mask.any():
            predictions[high_ipp_mask] = self.high_ipp_model.predict(X_processed[high_ipp_mask])
        if (~high_ipp_mask).any():
            predictions[~high_ipp_mask] = self.low_ipp_model.predict(X_processed[~high_ipp_mask])
        
        return predictions

def main():
    print("Loading data...")
    df = load_data()
    
    print("\nPreparing data...")
    X = prepare_data(df)
    y = df['Goals/60']
    
    print("\nSplitting data...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train model
    print("\nTraining model...")
    model = StackedHockeyModel(ipp_threshold=50)
    model.fit(X_train, y_train)
    
    # Make predictions
    print("\nMaking predictions...")
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"\nTest MAE: {mae:.4f}")
    print(f"Test R2: {r2:.4f}")
    
    # Calculate metrics by position
    for pos in X_test['Position'].unique():
        pos_mask = X_test['Position'] == pos
        pos_mae = mean_absolute_error(y_test[pos_mask], y_pred[pos_mask])
        pos_r2 = r2_score(y_test[pos_mask], y_pred[pos_mask])
        print(f"\n{pos} Test MAE: {pos_mae:.4f}")
        print(f"{pos} Test R2: {pos_r2:.4f}")
    
    # Save model components
    print("\nSaving model...")
    joblib.dump(model, 'goals_model_v4.pkl')
    
    # Feature importance
    print("\nFeature Importance:")
    
    # Get feature names after preprocessing
    feature_names = (model.numeric_features + 
                    [f"Position_{cat}" for cat in 
                     model.preprocessor.named_transformers_['cat'].named_steps['onehot'].categories_[0][1:]])
    
    print("\nHigh IPP Model - Top 15 Features:")
    high_importance = dict(zip(feature_names, model.high_ipp_model.feature_importances_))
    for feat, imp in sorted(high_importance.items(), key=lambda x: x[1], reverse=True)[:15]:
        print(f"{feat}: {imp:.4f}")
    
    print("\nLow IPP Model - Top 15 Features:")
    low_importance = dict(zip(feature_names, model.low_ipp_model.feature_importances_))
    for feat, imp in sorted(low_importance.items(), key=lambda x: x[1], reverse=True)[:15]:
        print(f"{feat}: {imp:.4f}")

if __name__ == "__main__":
    main()

Loading data...

Preparing data...

Splitting data...

Training model...


NotImplementedError: iLocation based boolean indexing on an integer type is not available