In [17]:
from PFR_Scraper_fantasy_teams import DataScraper 
from PFR_Processor_fantasy_teams import DataPreprocessor
from PFR_Merger_fantasy_teams import MergeAndProcess
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from scikeras.wrappers import KerasRegressor
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from joblib import Parallel, delayed

In [2]:
years = list(range(2010, 2023))

In [3]:
scraper = DataScraper(years)
player_data = scraper.scrape_player_data()
team_data = scraper.scrape_team_data()

In [4]:
player_preprocessor = DataPreprocessor(player_data)
player_data = player_preprocessor.preprocess_data()
team_preprocessor = DataPreprocessor(team_data)
team_data = team_preprocessor.preprocess_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[cols_to_fill] = self.data[cols_to_fill].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.dropna(axis=0, thresh=len(self.data.columns) * thresh, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data.dropna(subset=['PPR'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [5]:
# create instance of MergeAndProcess class
merger = MergeAndProcess(player_data, team_data)

# call process to merge and process data
merged_data = merger.process()

  merged_data['Player'] = merged_data['Player'].str.replace(r'[^\w\s]+', '')


In [6]:
# Standardize numerical data
scaler = StandardScaler()
num_data = merged_data.select_dtypes(include=[np.number])
scaled = scaler.fit_transform(num_data)
scaled_df = pd.DataFrame(scaled, columns=num_data.columns)

# Copy original to keep str data
merged_data_scaled = merged_data.copy()

# Replace num columns with standardized data, except for 'Year'
for column in scaled_df.columns:
    if column != 'Year':
        merged_data_scaled[column] = scaled_df[column]

In [7]:
merged_data_scaled

Unnamed: 0,Year,Tm,Rk,Player,FantPos,Age,G,GS,Cmp,Passing_Att,...,Team_PA,Team_PD,Team_MoV,Team_SoS,Team_SRS,Team_OSRS,Team_DSRS,Team_T,next_year_PPR,PPR_per_game
0,2010.0,HOU,1.0,Arian Foster,RB,24.0,16.0,13.0,-0.416551,-0.420348,...,427.0,-37.0,-2.3,0.5,-1.8,1.8,-3.7,-0.247170,305.1,24.5
1,2010.0,MIN,2.0,Adrian Peterson,RB,25.0,15.0,15.0,-0.416551,-0.420348,...,348.0,-67.0,-4.2,2.6,-1.6,-3.3,1.7,-0.247170,206.9,18.393333
2,2010.0,KAN,3.0,Jamaal Charles,RB,24.0,16.0,6.0,-0.416551,-0.420348,...,326.0,40.0,2.5,-3.2,-0.7,-1.5,0.8,-0.247170,239.5,17.65625
3,2010.0,CLE,4.0,Peyton Hillis,RB,24.0,16.0,14.0,-0.407504,-0.408783,...,332.0,-61.0,-3.8,2.3,-1.5,-3.4,2.0,-0.247170,109.7,18.43125
4,2010.0,TEN,5.0,Chris Johnson,RB,25.0,16.0,16.0,-0.416551,-0.420348,...,339.0,17.0,1.1,0.0,1.0,-0.9,1.9,-0.247170,225.5,17.05625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,2022.0,PHI,271.0,Gardner Minshew II,QB,26.0,5.0,2.0,-0.018468,0.019142,...,344.0,133.0,7.8,-1.3,6.5,5.8,0.7,-0.247170,,7.36
2636,2022.0,IND,272.0,Sam Ehlinger,QB,24.0,4.0,3.0,0.162479,0.163712,...,427.0,-138.0,-8.1,-0.5,-8.6,-4.9,-3.7,4.045791,,8.9
2637,2022.0,IND,273.0,Mo AlieCox,TE,29.0,17.0,11.0,-0.416551,-0.420348,...,427.0,-138.0,-8.1,-0.5,-8.6,-4.9,-3.7,4.045791,,3.170588
2638,2022.0,SFO,274.0,Kyle Juszczyk,RB,31.0,16.0,12.0,-0.416551,-0.420348,...,277.0,173.0,10.2,-2.3,7.9,3.3,4.6,-0.247170,,3.35


In [30]:
# Get pos from the 'FantPos' column
positions = merged_data_scaled['FantPos'].unique()

# Create empty dict to store models for each pos
models = {}

# Define hyperparameters to tune
param_grid = {
    'batch_size': [32, 64, 128],
    'epochs': [100, 200, 300],
    'optimizer': ['adam', 'rmsprop'],
    'dropout_rate': [0.1, 0.4, 0.6]
}

# Define num features to select w RFE
n_features = 15

# Define num folds for cv
n_folds = 5

In [31]:
#create_model() creates a Keras model with specified hyperparameters
def create_model(input_shape, neurons=64, dropout_rate=0.1, optimizer='adam'):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(neurons, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(neurons // 2, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

In [32]:
def create_and_evaluate_model(args):
    pos, data, param_grid, n_folds, n_features, kwargs = args
    print('Creating model for position:', pos)

    train_data = merged_data_scaled[(merged_data_scaled['Year'] < 2020) & (merged_data_scaled['FantPos'] == pos)]
    test_data = merged_data_scaled[(merged_data_scaled['Year'] < 2022) & (~merged_data_scaled.index.isin(train_data.index)) & (merged_data_scaled['FantPos'] == pos)]

    X_train = train_data.drop(['next_year_PPR', 'Year'], axis=1).select_dtypes(include=[np.number])
    y_train = train_data['next_year_PPR']
    X_test = test_data.drop(['next_year_PPR', 'Year'], axis=1).select_dtypes(include=[np.number])
    y_test = test_data['next_year_PPR']

    lin_reg = LinearRegression()
    rfe = RFE(lin_reg, n_features_to_select=n_features)
    rfe.fit(X_train, y_train)

    feature_index = rfe.get_support(indices=True)
    print(f'Selected features for {pos}: {feature_index}')

    X_train = X_train.iloc[:, feature_index]
    X_test = X_test.iloc[:, feature_index]

    model = KerasRegressor(
        model=create_model,
        input_shape=(n_features,),
        epochs=100,
        batch_size=10,
        verbose=0,
        **kwargs
    )
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=n_folds, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_result = grid.fit(X_train, y_train)

    best_params = grid_result.best_params_
    best_score = -grid_result.best_score_

    print("Best Parameters:", best_params)
    print("Best MSE:", best_score)

    final_model = create_model(input_shape=(X_train.shape[1],), dropout_rate=best_params['dropout_rate'], optimizer=best_params['optimizer'])
    final_model.fit(X_train, y_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)

    mse, mae = final_model.evaluate(X_test, y_test, verbose=0)
    print("Mean Squared Error for test data:", mse)
    print("Mean Absolute Error:", mae)

    return pos, final_model

In [36]:
from multiprocessing import Pool, cpu_count

# Parallelize the loop
n_jobs = 2  # -1 to use all cores

# Create a wrapper function to pass multiple arguments
def wrapper(args):
    return create_and_evaluate_model(*args)

# Prepare the arguments for the pool.map function
args_list = [(pos, merged_data_scaled, param_grid, n_folds, n_features, {'dropout_rate': 0.1}) for pos in positions]

# Use the multiprocessing Pool
with Pool(processes=n_jobs) as pool:
    results = pool.map(wrapper, args_list)

# Create a dictionary of models
models = dict(results)

Process SpawnPoolWorker-42:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'wrapper' on <module '__main__' (built-in)>
Process SpawnPoolWorker-41:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versi

KeyboardInterrupt: 

In [None]:
# Loop through positions and create model for each
for pos in positions:
    print('Creating model for position:', pos)
    
    # Filter for training (< 2020) and for current pos
    train_data = merged_data_scaled[(merged_data_scaled['Year'] < 2020) & (merged_data_scaled['FantPos'] == pos)]

    # Filter for testing (< 2022 and not in training) and for current pos
    test_data = merged_data_scaled[(merged_data_scaled['Year'] < 2022) & (~merged_data_scaled.index.isin(train_data.index)) & (merged_data_scaled['FantPos'] == pos)]

    # Select input and outputs
    X_train = train_data.drop(['next_year_PPR', 'Year'], axis=1)
    X_train = X_train.select_dtypes(include=[np.number])  
    y_train = train_data['next_year_PPR']

    X_test = test_data.drop(['next_year_PPR', 'Year'], axis=1)
    X_test = X_test.select_dtypes(include=[np.number]) 
    y_test = test_data['next_year_PPR']

    # linear regression object
    lin_reg = LinearRegression()

    # Create RFE and specify the num of features to select
    rfe = RFE(lin_reg, n_features_to_select=n_features)

    # Fit RFE to training data
    rfe.fit(X_train, y_train)

    # Get selected features
    feature_index = rfe.get_support(indices=True)
    print(f'Selected features for {pos}: {feature_index}')

    # Filter training and testing data
    X_train = X_train.iloc[:, feature_index]
    X_test = X_test.iloc[:, feature_index]

    # Convert w np.array
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # Create a KerasRegressor with the current model architecture
    model = KerasRegressor(model=create_model, dropout_rate=0.1, verbose=0)

    # Create a GridSearchCV object with the hyperparameters to tune and the cross-validation folds
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=n_folds, n_jobs=-1, scoring='neg_mean_squared_error')

    # Fit the GridSearchCV object to the training data
    grid_result = grid.fit(X_train, y_train)

    # Get the best hyperparameters and the corresponding mean squared error score
    best_params = grid_result.best_params_
    best_score = -grid_result.best_score_

    print("Best Parameters:", best_params)
    print("Best MSE:", best_score)

    # Create the final model with the best hyperparameters and fit it to the training data
    final_model = create_model(dropout_rate=best_params['dropout_rate'], optimizer=best_params['optimizer'])
    final_model.fit(X_train, y_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)

    # Evaluate the final model on the test data
    mse, mae = final_model.evaluate(X_test, y_test, verbose=0)
    print("Mean Squared Error for test data:", mse)
    print("Mean Absolute Error:", mae)

    # Add the final model to the dictionary of models
    models[pos] = final_model

In [None]:
# iterate over the models dictionary
for model_name, model in models.items():
    # Make predictions on the test data
    y_pred = model.predict(X_test_scaled)

    # Calculate R-squared
    r2 = r2_score(y_test, y_pred)

    # Print the r2 score for each model
    print(f"R-squared for {model_name}: {r2}")

In [None]:
import matplotlib.pyplot as plt

# Define the figure and axes
fig, axs = plt.subplots(2, 2, figsize=(10, 8))
axs = axs.flatten()

# Loop through each position and make predictions on the test data
for i, (position, model) in enumerate(models.items()):

    # Make predictions on the test data
    y_pred = model.predict(X_test_scaled)

    # Plot the predicted values against the actual values
    axs[i].scatter(y_test, y_pred)
    axs[i].set_xlabel("Actual")
    axs[i].set_ylabel("Predicted")
    axs[i].set_title("Predicted vs Actual for " + position)

# Adjust the spacing and layout of the subplots
plt.tight_layout()

# Show the figure
plt.show()