In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import joblib
import pandas as pd
import pymysql
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sqlalchemy import create_engine

In [None]:
# Database connection settings.
# Values come from environment variables so that no secret lives in the
# notebook; the non-secret settings fall back to the local development defaults.
username = os.environ.get("NHL_DB_USER", "root")
host = os.environ.get("NHL_DB_HOST", "localhost")
port = os.environ.get("NHL_DB_PORT", "3306")
database = os.environ.get("NHL_DB_NAME", "nhl_optimizer")

# The password has no default: read it from the environment, otherwise prompt
# for it interactively so it is never written to the notebook or its outputs.
password = os.environ.get("NHL_DB_PASSWORD")
if password is None:
    password = getpass("MySQL password: ")

# User name and password are URL-quoted because characters such as '@', ':'
# or '/' would otherwise be parsed as part of the URL structure.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}"
)

In [None]:
# Read the whole player_data table into a DataFrame through the SQLAlchemy engine.
query = "SELECT * FROM player_data"
df = pd.read_sql(sql=query, con=engine)

# Show the column index so the available fields are visible before feature selection.
print(df.columns)

In [None]:
# Columns deliberately kept out of the model's feature matrix.
# A shorter alternative list was kept here, commented out:
#     'SH%', 'Rebounds Created/60', 'Faceoffs %'
features_to_exclude = [
    'SH%',
    'iCF/60',
    'iSCF/60',
    'Rebounds Created/60',
    'Faceoffs %',
]

In [None]:
# Feature matrix: every column except the player identifier, the per-60
# scoring columns listed below, and the manually excluded features.
X = df.drop(
    columns=[
        'Player',
        'Goals/60',
        'Total Assists/60',
        'First Assists/60',
        'Second Assists/60',
        'Total Points/60',
        *features_to_exclude,
    ]
)

# Regression target.
y = df['Goals/60']

# 'Position' is the only column routed to the one-hot encoder; numeric columns
# are picked up by dtype. Columns that land in neither list are not passed on
# by the ColumnTransformer (its default is to drop the remainder).
categorical_features = ['Position']
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numeric_features),
        # handle_unknown='ignore': categories unseen during fit encode as all zeros.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Preprocessing + XGBoost regressor bundled into one estimator, so the scaler
# and encoder are re-fit on the training portion of every CV fold.
#
# n_jobs=1 on the regressor is deliberate: the randomized search that tunes
# this pipeline already runs with n_jobs=-1, and letting XGBoost also use all
# cores in each worker creates thread contention that slows both down.
goals_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=1,
    )),
])

# Search space for the randomized search. Keys use the '<step>__<param>'
# convention so each value is routed to the pipeline's 'regressor' step.
# Each list is a discrete set of candidate values.
param_dist = {
    # Ensemble size and step size
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    # Tree complexity
    'regressor__max_depth': [3, 5, 7],
    # Row / column subsampling per tree
    'regressor__subsample': [0.7, 0.8, 0.9],
    'regressor__colsample_bytree': [0.7, 0.8, 0.9],
    # L1 / L2 regularisation strength
    'regressor__reg_alpha': [0, 0.1, 1],
    'regressor__reg_lambda': [1, 1.5, 2],
}

# Sample 50 configurations from the search space and score each with 5-fold
# cross-validation on the training split. The seed makes the sampled
# configurations repeatable.
random_search_xgb = RandomizedSearchCV(
    estimator=goals_pipeline,
    param_distributions=param_dist,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search_xgb.fit(X_train, y_train)

# The scorer is the negated MAE (higher is better), so flip the sign back
# when reporting the best cross-validated score.
print(f"Best parameters: {random_search_xgb.best_params_}")
print(f"Best MAE: {-random_search_xgb.best_score_}")

# Score the held-out test split. Calling predict on the fitted search object
# delegates to the best pipeline found by the search.
y_pred_xgb = random_search_xgb.predict(X_test)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print(f"XGBoost Test Set MAE: {mae_xgb}")

r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost Test Set R^2: {r2_xgb}")

In [None]:
# Persist the best pipeline found by the search (preprocessing and regressor
# together) to 'goals_model.pkl' in the working directory.
joblib.dump(
    value=random_search_xgb.best_estimator_,
    filename='goals_model.pkl',
)