In [31]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

from PFR_Merger_fantasy_teams import MergeAndProcess
from PFR_Processor_fantasy_teams import DataPreprocessor
from PFR_Scraper_fantasy_teams import DataScraper

In [2]:
# Seasons to process: 2010 through 2022 (the range's upper bound is exclusive).
years = [*range(2010, 2023)]

In [3]:
# Build a scraper for the configured seasons, then fetch the player-level and
# team-level tables. DataScraper is project-local, so what it downloads and the
# exact shape of what it returns are not visible from this notebook.
scraper = DataScraper(years)
player_data = scraper.scrape_player_data()
team_data = scraper.scrape_team_data()

In [4]:
# Run each scraped table through its own DataPreprocessor and rebind the name
# to the processed result.
# NOTE(review): player_data/team_data are overwritten here, so re-running this
# cell feeds already-processed data back in — confirm preprocess_data() is safe
# to apply twice. The recorded output also shows pandas copy-of-a-slice
# warnings raised from the preprocessor's fillna/dropna calls on self.data.
player_preprocessor = DataPreprocessor(player_data)
player_data = player_preprocessor.preprocess_data()
team_preprocessor = DataPreprocessor(team_data)
team_data = team_preprocessor.preprocess_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[cols_to_fill] = self.data[cols_to_fill].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.dropna(axis=0, thresh=len(self.data.columns) * thresh, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.dropna(subset=['PPR'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [32]:
# Create the MergeAndProcess instance from the processed player and team tables
merger = MergeAndProcess(player_data, team_data)

# process() returns the combined table that the modelling cells below read from
merged_data = merger.process()

  merged_data['Player'] = merged_data['Player'].str.replace(r'[^\w\s]+', '')


In [52]:
# Positions to model, taken from the 'FantPos' column. Missing values are
# dropped: `merged_data['FantPos'] == NaN` matches no rows, so a NaN "position"
# would hand an empty training set to the per-position fit.
positions = merged_data['FantPos'].dropna().unique()

# Fitted model for each position, keyed by the position label
models = {}

# Hyperparameter values explored by the grid search
param_grid = {
    'batch_size': [32, 64, 128],
    'epochs': [50, 100, 200],
    'optimizer': ['adam', 'rmsprop'],
    'dropout_rate': [0.1, 0.2, 0.3]
}

# Number of features RFE is asked to keep
n_features = 20

# Number of cross-validation folds used by the grid search
n_folds = 5

In [53]:
def create_model(neurons=64, dropout_rate=0.1, optimizer='adam', meta=None):
    """Build and compile a small feed-forward regression network.

    Args:
        neurons: Width of the first hidden layer; the second hidden layer uses
            ``neurons // 2`` units.
        dropout_rate: Dropout fraction applied after each hidden layer.
        optimizer: Optimizer passed to ``model.compile``.
        meta: Optional metadata dict that scikeras supplies to model-building
            functions declaring a ``meta`` parameter. When it contains
            ``n_features_in_`` the input width is taken from it, so the model
            matches the data actually being fitted. Without it the width falls
            back to the notebook-level ``X_train``.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one output unit, MSE loss
        and an MAE metric.
    """
    if meta is not None and "n_features_in_" in meta:
        n_inputs = meta["n_features_in_"]
    else:
        # Direct calls (no scikeras wrapper) keep the original behaviour of
        # sizing the input layer from the current global training matrix.
        n_inputs = X_train.shape[1]

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(neurons, activation='relu', input_shape=(n_inputs,)),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(neurons // 2, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

In [55]:
# Per-position preprocessing and hold-out data, keyed like `models`. The loop
# variables below are overwritten on every iteration, so anything needed to
# score or reuse a stored model later has to be kept here.
position_artifacts = {}

# Train, tune and evaluate one network per position
for position in positions:
    print('Creating model for position:', position)

    # Rows belonging to the current position
    position_mask = merged_data['FantPos'] == position

    # Training rows: seasons before 2020 for the current position
    train_data = merged_data[(merged_data['Year'] < 2020) & position_mask]

    # Test rows: seasons before 2022 that are not training rows, same position
    test_data = merged_data[(merged_data['Year'] < 2022) & (~merged_data.index.isin(train_data.index)) & position_mask]

    # Cross-validation needs at least one row per fold, and scoring needs a
    # non-empty test set; otherwise the fit below would raise.
    if len(train_data) < n_folds or test_data.empty:
        print('Skipping position (not enough training or test rows):', position)
        continue

    # Features are the numeric columns other than the target
    X_train = train_data.drop("next_year_PPR", axis=1)
    X_train = X_train.select_dtypes(include=[np.number])
    y_train = train_data["next_year_PPR"]

    X_test = test_data.drop("next_year_PPR", axis=1)
    X_test = X_test.select_dtypes(include=[np.number])
    y_test = test_data["next_year_PPR"]

    # Recursive feature elimination driven by a linear regression, fitted on
    # the training rows only
    lin_reg = LinearRegression()
    rfe = RFE(lin_reg, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)

    # Column positions kept by RFE, plus their names for later reuse
    feature_indices = rfe.get_support(indices=True)
    selected_features = list(X_train.columns[feature_indices])

    # Apply the same column subset to both splits
    X_train = X_train.iloc[:, feature_indices]
    X_test = X_test.iloc[:, feature_indices]

    # Convert to float32 NumPy arrays for Keras
    X_train = np.array(X_train, dtype=np.float32)
    y_train = np.array(y_train, dtype=np.float32)
    X_test = np.array(X_test, dtype=np.float32)
    y_test = np.array(y_test, dtype=np.float32)

    # Standardize using statistics from the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Wrap the model builder so scikit-learn can tune it
    model = KerasRegressor(model=create_model, dropout_rate=0.1, verbose=0)

    # NOTE(review): create_model compiles the network itself, so confirm that the
    # grid's 'optimizer' value actually reaches it during the search — scikeras
    # may treat a bare `optimizer` as its own compile setting rather than a
    # create_model argument (in which case `model__optimizer` would be needed).
    #
    # refit=False: the best configuration is retrained explicitly below, so the
    # automatic refit on the full training set would be discarded work.
    # best_params_ and best_score_ are still populated for a single metric.
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=n_folds, n_jobs=-1, scoring='neg_mean_squared_error', refit=False)

    # Run the search on the scaled training data
    grid_result = grid.fit(X_train_scaled, y_train)

    # The scorer is negated MSE, so flip the sign to report plain MSE
    best_params = grid_result.best_params_
    best_score = -grid_result.best_score_

    print("Best Parameters:", best_params)
    print("Best MSE:", best_score)

    # Train the final model with the best hyperparameters on all training rows
    final_model = create_model(dropout_rate=best_params['dropout_rate'], optimizer=best_params['optimizer'])
    final_model.fit(X_train_scaled, y_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)

    # Evaluate on the held-out rows; evaluate() returns the loss (MSE) and MAE
    mse, mae = final_model.evaluate(X_test_scaled, y_test, verbose=0)
    print("Mean Squared Error for test data:", mse)
    print("Mean Absolute Error:", mae)

    # Keep the model together with everything needed to feed it matching inputs
    models[position] = final_model
    position_artifacts[position] = {
        'features': selected_features,
        'scaler': scaler,
        'X_test_scaled': X_test_scaled,
        'y_test': y_test,
        'best_params': best_params,
    }

Creating model for position: RB


2023-05-01 14:02:37.628232: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-01 14:02:37.628232: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-01 14:02:37.628232: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the ap

Best Parameters: {'batch_size': 32, 'dropout_rate': 0.3, 'epochs': 100, 'optimizer': 'rmsprop'}
Best MSE: 5621.99384765625
Mean Squared Error for test data: 5658.3740234375
Mean Absolute Error: 59.46014404296875
Creating model for position: WR
Best Parameters: {'batch_size': 128, 'dropout_rate': 0.3, 'epochs': 200, 'optimizer': 'rmsprop'}
Best MSE: 4938.28134765625
Mean Squared Error for test data: 4518.283203125
Mean Absolute Error: 50.95571517944336
Creating model for position: QB
Best Parameters: {'batch_size': 64, 'dropout_rate': 0.1, 'epochs': 200, 'optimizer': 'rmsprop'}
Best MSE: 8304.86943359375
Mean Squared Error for test data: 8176.53759765625
Mean Absolute Error: 76.74531555175781
Creating model for position: TE


KeyboardInterrupt: 

In [None]:
def _rebuild_position_test_set(position):
    """Recreate the scaled test features and targets for one position.

    The training loop keeps only the last position's ``X_test_scaled`` and
    ``y_test``, so scoring every model against those globals evaluates most
    models on another position's data. This repeats the loop's preprocessing
    for the requested position: the same train/test filters, the same RFE
    column selection fitted on the training rows, and a StandardScaler fitted
    on the training rows.

    Assumes ``merged_data`` and ``n_features`` are unchanged since the models
    were trained — TODO confirm when running cells out of order.

    Args:
        position: Position label, i.e. a key of ``models``.

    Returns:
        Tuple ``(scaled_test_features, test_targets)`` as float32-derived
        NumPy arrays.
    """
    train_rows = merged_data[(merged_data['Year'] < 2020) & (merged_data['FantPos'] == position)]
    test_rows = merged_data[(merged_data['Year'] < 2022) & (~merged_data.index.isin(train_rows.index)) & (merged_data['FantPos'] == position)]

    train_features = train_rows.drop("next_year_PPR", axis=1).select_dtypes(include=[np.number])
    test_features = test_rows.drop("next_year_PPR", axis=1).select_dtypes(include=[np.number])

    # Same feature selection as in training: RFE over a linear regression
    selector = RFE(LinearRegression(), n_features_to_select=n_features)
    selector.fit(train_features, train_rows["next_year_PPR"])
    kept_columns = selector.get_support(indices=True)

    train_matrix = np.array(train_features.iloc[:, kept_columns], dtype=np.float32)
    test_matrix = np.array(test_features.iloc[:, kept_columns], dtype=np.float32)
    test_targets = np.array(test_rows["next_year_PPR"], dtype=np.float32)

    # Scaling statistics come from the training rows only
    position_scaler = StandardScaler().fit(train_matrix)
    return position_scaler.transform(test_matrix), test_targets


# Score each stored model on the test rows of its own position
for model_name, model in models.items():
    position_X_test, position_y_test = _rebuild_position_test_set(model_name)

    # Make predictions on that position's test data
    y_pred = model.predict(position_X_test)

    # Calculate R-squared
    r2 = r2_score(position_y_test, y_pred)

    # Print the r2 score for each model
    print(f"R-squared for {model_name}: {r2}")

In [20]:
# Hold-out split over all positions: seasons before 2020 are training rows;
# the remaining seasons before 2022 are test rows.
train_data = merged_data.loc[merged_data['Year'] < 2020]
test_data = merged_data.loc[(merged_data['Year'] < 2022) & ~merged_data.index.isin(train_data.index)]

# Target column is next_year_PPR; features are the other numeric columns
y_train = train_data["next_year_PPR"]
X_train = train_data.drop(columns="next_year_PPR").select_dtypes(include=[np.number])

y_test = test_data["next_year_PPR"]
X_test = test_data.drop(columns="next_year_PPR").select_dtypes(include=[np.number])

# The inputs are deliberately left as pandas objects (no float32 array
# conversion) so that X_train.columns stays available to later cells.

# Learn the scaling statistics from the training rows, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
def create_model(neurons=128, dropout_rate=0.2, optimizer='adam', meta=None):
    """Build and compile the baseline feed-forward regression network.

    Called with no arguments this produces the same network as before:
    Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2) -> Dense(1),
    compiled with Adam, MSE loss and an MAE metric. Because running this cell
    rebinds the name ``create_model``, it also accepts the ``neurons``,
    ``dropout_rate`` and ``optimizer`` keywords used by the grid-search cell,
    so that cell no longer fails with a TypeError if it is re-run afterwards.

    Args:
        neurons: Width of the first hidden layer; the second hidden layer uses
            ``neurons // 2`` units.
        dropout_rate: Dropout fraction applied after each hidden layer.
        optimizer: Optimizer passed to ``model.compile``.
        meta: Optional metadata dict that scikeras supplies to model-building
            functions declaring a ``meta`` parameter. When it contains
            ``n_features_in_`` the input width is taken from it; otherwise the
            width falls back to the notebook-level ``X_train``.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one output unit.
    """
    if meta is not None and "n_features_in_" in meta:
        n_inputs = meta["n_features_in_"]
    else:
        # Direct calls keep the original behaviour of reading the global
        # training matrix for the input width.
        n_inputs = X_train.shape[1]

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(neurons, activation='relu', input_shape=(n_inputs,)),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(neurons // 2, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model


# Wrap the builder so scikit-learn's cross-validation helpers can fit and score it
model = KerasRegressor(create_model, epochs=100, batch_size=32, verbose=0)

# Shuffled 5-fold split with a fixed seed
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer reports negated MSE; negate it back to get plain MSE per fold
mse_scores = np.negative(
    cross_val_score(model, X_train_scaled, y_train, scoring='neg_mean_squared_error', cv=kfold)
)

print("Mean Squared Error for each fold:", mse_scores)
print("Mean Squared Error (avg):", np.mean(mse_scores))


# Train a fresh network on all training rows with the same settings
final_model = create_model()
final_model.fit(X_train_scaled, y_train, batch_size=32, epochs=100, verbose=0)


# evaluate() returns the compiled loss (MSE) followed by the MAE metric
test_metrics = final_model.evaluate(X_test_scaled, y_test, verbose=0)
mse, mae = test_metrics
print("Mean Squared Error on test data:", mse)
print("Mean Absolute Error on test data:", mae)


In [17]:
# r2_score is already imported in the notebook's top import cell, so it is not
# re-imported here.

# Make predictions on the test data with the final network
y_pred = final_model.predict(X_test_scaled)

# Coefficient of determination of the predictions against the true targets
r2 = r2_score(y_test, y_pred)

print("R-squared:", r2)


R-squared: 0.3884727734783716


In [21]:
# pandas and variance_inflation_factor come from the notebook's top import cell.

# Variance inflation factor for each column of the standardized training
# matrix, labelled with the matching column name from X_train. Both inputs must
# come from the same split cell so that positions and names line up.
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(X_train_scaled, column_index)
        for column_index in range(X_train_scaled.shape[1])
    ],
    "feature": X_train.columns,
})

print(vif)

    VIF Factor       feature
0     1.047158          Year
1   382.393829           Cmp
2   357.898316   Passing_Att
3   235.165943   Passing_Yds
4    26.368423    Passing_TD
5     9.599703           Int
6     1.050647           Y/A
7     2.341179    Rushing_TD
8     1.800879           Y/R
9     2.497346  Receiving_TD
10    5.272256           Fmb
11    3.466759            FL
12    1.059803           2PM
13    1.477389           2PP
14    2.597641           VBD
15    1.300854        OvRank
16    1.018231        Team_T
