In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import joblib
import os

In [3]:
# Load the defense linear-regression CSV into `defense`.
# NOTE(review): the path is absolute and machine-specific, so the notebook only runs
# where this exact directory exists — consider a configurable data directory.
# NOTE(review): the cells that follow operate on `goalies`, which is not assigned in
# any visible cell — confirm which frame this analysis is meant to load.
defense = pd.read_csv('/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/linear_reg_csv_files/defense_lr.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/linear_reg_csv_files/defense_lr.csv'

In [None]:
# Preview the first 50 rows of the loaded frame.
defense.head(50)

In [None]:
# Display basic information about the DataFrame
# NOTE(review): `goalies` is not assigned in any visible cell above (only `defense`
# is loaded) — confirm it is defined when the notebook runs on a fresh kernel.
goalies.info()

# Display summary statistics
goalies.describe()

In [None]:
# Keep only the rows that have a cap hit; rows without one are meant to be
# predicted later, once the model is built.
goalies = goalies.dropna(axis='index', how='any', subset=['CAP HIT'])

In [None]:
# Rename CAP HIT to `salary` and convert it from a currency string to a float.
goalies = goalies.rename(columns={'CAP HIT': 'salary'})

# Strip the '$' and the thousands separators.
# regex=False makes both patterns literal text: read as a regular expression, '$' is
# the end-of-string anchor, and the default value of `regex` differs between pandas
# versions, so being explicit keeps the result the same everywhere.
goalies['salary'] = (
    goalies['salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)


In [None]:
# Summary statistics again, now that `salary` is numeric.
goalies.describe()

In [None]:
# Drop the 'Unnamed: 0' column (presumably an index written out by an earlier CSV export).
# errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError
# because the column is already gone.
goalies = goalies.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Aggregate every other column by summing within each (GP, name) pair.
games_played = goalies.groupby(by=['GP', 'name']).sum()

# Sort by the 'GP' index level in descending order (most games played first).
games_played_sorted = games_played.sort_values(by='GP', ascending=False)

# Show the sorted frame; the unsorted one used to be displayed, so the sort had no effect.
games_played_sorted.head()

In [None]:
# Define the function to assign salary based on games played
def assign_salary(gp):
    """Map a games-played count to a flat salary tier.

    Parameters
    ----------
    gp : int or float
        Number of games played.

    Returns
    -------
    int
        The salary of the tier `gp` falls in. Every upper bound is inclusive
        (<=50, <=100, <=150, <=200, <=300, <=400) and anything above 400 gets the
        top tier. Returns 0 when no tier matches, which happens for NaN because
        every comparison against NaN is False.
    """
    # (inclusive upper bound on GP, salary), in ascending order of the bound
    tiers = (
        (50, 900000),
        (100, 2000000),
        (150, 3000000),
        (200, 5000000),
        (300, 8000000),
        (400, 9000000),
    )
    for upper_bound, salary in tiers:
        if gp <= upper_bound:
            return salary

    # The top tier starts right after 400. This used to test `gp > 401`, which sent
    # gp == 401 (and any value in (400, 401]) to the 0 default below.
    if gp > 400:
        return 10000000

    return 0  # No tier matched (e.g. NaN)

# Build the 'salary' column from games played: one tier lookup per row.
# NOTE(review): this replaces whatever 'salary' already holds with GP-derived tiers —
# confirm that is intended, since 'GP' also stays in the frame as a model feature.
salary_from_gp = goalies['GP'].apply(assign_salary)
goalies['salary'] = salary_from_gp
goalies.head(50)

In [None]:
# Histogram of every numerical column, 30 bins each, to eyeball the distributions.
hist_options = {'bins': 30, 'figsize': (15, 10)}
goalies.hist(**hist_options)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of salary against each of the other numerical features.
# 'salary' itself is kept out of the x-axis list: plotting the target against itself
# only draws a diagonal line and says nothing about the data.
target_col = 'salary'
features = ['GP', 'W', 'L', 'GA', 'SOG', 'SO', 'TIME']

# Grid layout: three plots per row, with as many rows as needed (ceiling division).
num_features = len(features)
num_cols = 3
num_rows = (num_features + num_cols - 1) // num_cols

# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid size.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5), squeeze=False)
axes = axes.flatten()

# One panel per feature
for ax, feature in zip(axes, features):
    sns.scatterplot(x=goalies[feature], y=goalies[target_col], ax=ax)
    ax.set_title(f'Salary vs {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Salary')

# Remove the unused panels. Slicing by num_features avoids relying on the loop
# variable still being defined after the loop.
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Rows whose TIME value is above 35000.
above_35000 = goalies['TIME'] > 35000
time = goalies[above_35000]
time

In [None]:
# Features to plot: every numeric column except the target variable 'salary'.
# Restricting to numeric dtypes matters because the frame still carries the 'name'
# column at this point, and a box plot of a text column is meaningless.
features = goalies.select_dtypes(include='number').columns.drop('salary', errors='ignore')

# Grid layout: three plots per row, at least one row so plt.subplots gets a valid grid.
num_features = len(features)
num_cols = 3
num_rows = max(1, (num_features + num_cols - 1) // num_cols)

# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid size.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5), squeeze=False)
axes = axes.flatten()

# One box plot per feature
for ax, feature in zip(axes, features):
    sns.boxplot(y=goalies[feature], ax=ax)
    ax.set_title(f'Box Plot of {feature}')
    ax.set_xlabel(feature)

# Remove the unused panels. Slicing by num_features avoids relying on the loop
# variable still being defined after the loop.
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Drop the 'name' column so that only model inputs and the target remain.
# errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError
# because 'name' is already gone.
goalies = goalies.drop(columns=['name'], errors='ignore')

# Define the features and target: every remaining column except 'salary' is a feature.
features = goalies.columns.drop('salary')
X = goalies[features]
y = goalies['salary']

# Scale the features to zero mean and unit variance.
# NOTE(review): in the visible cells the train/test split is built from X, not
# X_scaled — confirm whether the scaled matrix is still needed.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# # Perform PCA
# pca = PCA(n_components=0.95)  # Retain 95% of the variance
# X_pca = pca.fit_transform(X_scaled)

# # Explained variance ratio
# explained_variance = pca.explained_variance_ratio_
# print("Explained variance ratio:", explained_variance)

# # Cumulative explained variance
# cumulative_explained_variance = np.cumsum(explained_variance)
# print("Cumulative explained variance:", cumulative_explained_variance)

In [None]:
# # Plot explained variance ratio
# plt.figure(figsize=(10, 6))
# plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.5, align='center', label='Individual explained variance')
# plt.step(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, where='mid', label='Cumulative explained variance')
# plt.ylabel('Explained variance ratio')
# plt.xlabel('Principal components')
# plt.legend(loc='best')
# plt.title('Explained Variance Ratio by Principal Components')
# plt.show()

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Fit the three candidate regressors on the training split ---
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)

lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# --- Score each model on the held-out split ---

# Linear Regression
y_pred_lr = lr_model.predict(X_test)
mse_lr, r2_lr = mean_squared_error(y_test, y_pred_lr), r2_score(y_test, y_pred_lr)
print(f'Linear Regression MSE: {mse_lr}')
print(f'Linear Regression R2: {r2_lr}')

# Random Forest
y_pred_rf = rf_model.predict(X_test)
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)
print(f'Random Forest MSE: {mse_rf}')
print(f'Random Forest R2: {r2_rf}')

# XGBoost
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)
print(f'XGBoost MSE: {mse_xgb}')
print(f'XGBoost R2: {r2_xgb}')

# --- Side-by-side summary, one row per model ---
models = ['Linear Regression', 'Random Forest', 'XGBoost']
mse_scores = [mse_lr, mse_rf, mse_xgb]
r2_scores = [r2_lr, r2_rf, r2_xgb]
performance_df = pd.DataFrame(
    {
        'Model': models,
        'MSE': mse_scores,
        'R2': r2_scores,
    }
)
print(performance_df)

In [None]:
# Candidate hyper-parameters: three values for each of four XGBoost settings.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0]
}

# Base estimator that the search clones for every candidate.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Exhaustive search with 3-fold cross-validation, scored by negative MSE,
# using all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

In [None]:
# Build a fresh XGBoost regressor from the tuned hyper-parameters and fit it on the
# training split.
best_xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **best_params,
)
best_xgb_model.fit(X_train, y_train)

In [None]:
# Score the tuned model on the held-out split.
y_pred_xgb = best_xgb_model.predict(X_test)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)

# Report both metrics.
print(f'XGBoost MSE: {mse_xgb}')
print(f'XGBoost R2: {r2_xgb}')

In [None]:
# Where the serialized model goes.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
output_dir = '/Users/blairjdaniel/lighthouse/lighthouse/NHL/NHL_points_projection/models'
model_path = os.path.join(output_dir, 'best_xgb_model.pkl')

# Make sure the target directory exists (no error if it already does), then
# persist the tuned model with joblib.
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_xgb_model, model_path)
print(f"Model saved to {model_path}")

In [None]:
# Read the defense master copy and the cap-hit table, in that order.
defense_rec, defense_salary = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/master_copies/defense_rec.csv',
        '/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/salary/cap_all.csv',
    )
)

In [None]:
# Define the bins and corresponding salary values (one label per bin, so there is
# one fewer label than there are edges).
bins = [0.0, 0.01, 0.03, 0.05, 0.1, 0.2, 0.3, float('inf')]
labels = [950000, 2000000, 4000000, 6000000, 8000000, 10000000, 12000000]

# Use pd.cut to bin the 'goals' values and map each bin to its salary.
# right=False closes every bin on the left: [0.0, 0.01), [0.01, 0.03), ...
# Values that fall in no bin (negative or missing) come out as NaN.
defense_rec['salary'] = pd.cut(defense_rec['goals'], bins=bins, labels=labels, right=False)

# Display the first few rows of the updated DataFrame.
# This cell used to write the defense frame to 'forwards_rec_two.csv' instead, which
# overwrote a forwards file with defense data.
defense_rec.head()

In [None]:
# Rows for players with fewer than 246 games played.
under_246_games = defense_rec['games_played'] < 246
over_8000000 = defense_rec[under_246_games]
over_8000000

In [None]:
# Players with fewer than 246 games played (note: the variable name says otherwise).
low_games_mask = defense_rec['games_played'] < 246
over_8000000 = defense_rec[low_games_mask]

# Change the salary of all those players to $950,000 in the defense_rec DataFrame.
# Assigning through the boolean mask touches exactly the selected rows, even if the
# index ever contains duplicate labels.
defense_rec.loc[low_games_mask, 'salary'] = 950000

# errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError
# because 'Unnamed: 0' is already gone.
defense_rec = defense_rec.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Render the updated frame as the cell output.
defense_rec

In [None]:
# Write the updated defense frame to disk.
# NOTE(review): the row index is written too (pandas default), which comes back as an
# 'Unnamed: 0' column when the file is read again — confirm downstream readers expect it.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
defense_rec.to_csv('/Users/blairjdaniel/lighthouse/lighthouse/NHL/files/master_copies/defense_rec_two.csv')