In [None]:
import warnings
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tensorflow as tf
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential



In [None]:
class DataScraper:
    """Download season tables from pro-football-reference.com.

    Attributes:
        years: List of seasons to scrape.
        player_urls: One fantasy-table URL per season.
        team_url: URL template for a season's overview page; ``{}`` is
            replaced by the year.
    """

    def __init__(self, years):
        # Materialise the iterable: it is walked once here and again in
        # scrape_team_data(), which would silently see nothing if a
        # generator were passed.
        self.years = list(years)
        self.player_urls = [f'https://www.pro-football-reference.com/years/{year}/fantasy.htm' for year in self.years]
        self.team_url = 'https://www.pro-football-reference.com/years/{}/'

    @staticmethod
    def _year_from_url(url):
        """Return the season encoded as the second-to-last path segment of `url`."""
        return float(url.split('/')[-2])

    def scrape_data(self, urls, timeout=30, max_tables=2):
        """Fetch every URL and stack the first `max_tables` HTML tables of each page.

        Args:
            urls: Iterable of page URLs whose second-to-last path segment is
                the season year.
            timeout: Seconds to wait for each HTTP response.
            max_tables: How many leading ``<table>`` elements to keep per page.

        Returns:
            A single DataFrame with a ``Year`` column added to every page's rows.

        Raises:
            requests.HTTPError: If a page answers with an error status.
            ValueError: If no table could be collected from any URL.
        """
        data_frames = []
        for url in urls:
            year = self._year_from_url(url)
            response = requests.get(url, timeout=timeout)
            # Fail loudly instead of parsing an error page and silently
            # ending up with a missing season.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            tables = soup.find_all('table')[:max_tables]

            # read_html expects a file-like object; passing literal HTML is deprecated.
            df_list = [pd.read_html(StringIO(str(table)))[0] for table in tables]
            if df_list:
                df = pd.concat(df_list)
                df['Year'] = year
                data_frames.append(df)

        if not data_frames:
            raise ValueError('No tables were scraped from the given URLs.')
        return pd.concat(data_frames, ignore_index=True)

    def scrape_player_data(self):
        """Scrape the per-player fantasy tables for every configured season."""
        return self.scrape_data(self.player_urls)

    def scrape_team_data(self):
        """Scrape the season overview (team) tables for every configured season."""
        team_urls = [self.team_url.format(year) for year in self.years]
        return self.scrape_data(team_urls)

In [None]:
class DataPreprocessor:
    """Clean a scraped table held in ``self.data``.

    The individual steps mutate ``self.data``; ``preprocess_data`` runs them
    in order and returns the cleaned frame.
    """

    # Full team name -> abbreviation. Used as a prefix match on the 'Tm' column.
    dictionary = {'New York Giants': 'NYG',
 'Las Vegas Raiders': 'LVR',
 'Los Angeles Chargers': 'LAC',
 'Denver Broncos': 'DEN',
 'Green Bay Packers': 'GNB',
 'Jacksonville Jaguars': 'JAX',
 'Washington Redskins': 'WAS',
 'Los Angeles Rams': 'LAR',
 'Arizona Cardinals': 'ARI',
 'Carolina Panthers': 'CAR',
 'Baltimore Ravens': 'BAL',
 'New York Jets': 'NYJ',
 'Miami Dolphins': 'MIA',
 'Minnesota Vikings': 'MIN',
 'Oakland Raiders': 'OAK',
 'Chicago Bears': 'CHI',
 'New England Patriots': 'NWE',
 'Tennessee Titans': 'TEN',
 'New Orleans Saints': 'NOR',
 'Cleveland Browns': 'CLE',
 'Tampa Bay Buccaneers': 'TAM',
 'Buffalo Bills': 'BUF',
 'Cincinnati Bengals': 'CIN',
 'Houston Texans': 'HOU',
 'San Francisco 49ers': 'SFO',
 'Atlanta Falcons': 'ATL',
 'Washington Football Team': 'WAS',
 'Indianapolis Colts': 'IND',
 'Seattle Seahawks': 'SEA',
 'Pittsburgh Steelers': 'PIT',
 'Dallas Cowboys': 'DAL',
 'Detroit Lions': 'DET',
 'Philadelphia Eagles': 'PHI',
 'Kansas City Chiefs': 'KAN'}

    def __init__(self, data):
        self.data = data

    def try_convert_to_float(self, x):
        """Return ``float(x)`` when possible, NaN for empty/missing, else `x` unchanged."""
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            if x == '' or pd.isna(x):
                return np.nan
            return x

    def convert_columns_to_float(self):
        """Convert every cell, then every column, to float where possible."""
        # Column-wise Series.map is the version-independent spelling of the
        # deprecated DataFrame.applymap.
        self.data = self.data.apply(lambda col: col.map(self.try_convert_to_float))
        self.data = self.data.astype(float, errors='ignore')

    def flatten_multiindex_header(self):
        """Collapse a two-level column header into single-level names.

        A second-level name that occurs more than once is prefixed with its
        first-level name (``'<level1>_<level2>'``); an empty second-level name
        falls back to the first level; otherwise the second level is used.

        Returns:
            ``self.data`` (unchanged when the header is not a MultiIndex).
        """
        if isinstance(self.data.columns, pd.MultiIndex):
            level1 = self.data.columns.get_level_values(0)
            level2 = self.data.columns.get_level_values(1)

            # True for second-level names that appear more than once
            duplicates = level2.value_counts() > 1

            new_columns = []
            for col_level1, col_level2 in zip(level1, level2):
                if duplicates[col_level2]:
                    new_columns.append(f'{col_level1}_{col_level2}')
                elif col_level2 == '':
                    new_columns.append(col_level1)
                else:
                    new_columns.append(col_level2)

            self.data.columns = new_columns
        return self.data

    def _add_ratio_column(self, numerator, denominator, target):
        """Write numerator/denominator into `target`; 0 where the denominator is missing or zero."""
        if numerator in self.data.columns and denominator in self.data.columns:
            self.data[target] = [
                0 if (pd.isna(den) or den == 0) else num / den
                for num, den in zip(self.data[numerator], self.data[denominator])
            ]

    def calculate_yards_per_attempt(self):
        """Add 'Y/A' = Rushing_Yds / Rushing_Att when both columns exist."""
        self._add_ratio_column('Rushing_Yds', 'Rushing_Att', 'Y/A')

    def calculate_yards_per_reception(self):
        """Add 'Y/R' = Receiving_Yds / Rec when both columns exist."""
        self._add_ratio_column('Receiving_Yds', 'Rec', 'Y/R')

    def handle_missing_values(self, thresh=0.5, max_rank=400):
        """Drop text-only rows, sparse rows/columns and low-ranked rows.

        Args:
            thresh: Fraction of a row/column used as the "mostly" cut-off.
            max_rank: Rows whose 'Rk' exceeds this value are dropped (only
                when an 'Rk' column exists).
        """
        if self.data.empty:
            # Nothing to clean; the ratio computations below would divide by zero.
            return

        n_cols = len(self.data.columns)

        # Drop rows where most of the columns hold string data
        is_text = self.data.apply(lambda col: col.map(lambda v: isinstance(v, str))).astype(bool)
        mostly_text = is_text.sum(axis=1) > thresh * n_cols
        # .copy() so later column assignments act on an independent frame
        self.data = self.data[~mostly_text].copy()
        if self.data.empty:
            return

        def should_fill(col):
            na_and_zero_count = (col.apply(lambda x: pd.isna(x) or x == 0)).sum()
            return na_and_zero_count / len(col) >= 0.7

        # In columns that are >= 70% missing-or-zero, treat missing as 0.0
        should_fill_mask = self.data.apply(should_fill)
        fill_mask = self.data.apply(lambda col: col.isna() & should_fill_mask[col.name])
        self.data = self.data.mask(fill_mask, 0.0)

        # Drop columns where most of the rows are null
        self.data = self.data.drop(columns=self.data.columns[self.data.isnull().mean() > thresh], errors='ignore')

        # Drop rows where most of the columns are null
        min_non_null = int(np.ceil(thresh * len(self.data.columns)))
        self.data = self.data.dropna(thresh=min_non_null)

        # Drop rows where PPR data is null (if present in dataframe)
        if 'PPR' in self.data.columns:
            self.data = self.data.dropna(subset=['PPR'])

        # Drop rows ranked below max_rank (if Rk is a column); a rank that
        # is not numeric is treated as missing and dropped as well.
        if 'Rk' in self.data.columns:
            rank = pd.to_numeric(self.data['Rk'], errors='coerce')
            self.data = self.data[rank <= max_rank].copy()

    def replace_team_names(self, dictionary):
        """Replace 'Tm' values that start with a key of `dictionary` by its value.

        Non-string entries (e.g. NaN) are left untouched, and a frame without
        a 'Tm' column is returned unchanged. The first matching key wins.

        Returns:
            ``self.data``.
        """
        if 'Tm' not in self.data.columns:
            return self.data

        def abbreviate(name):
            if isinstance(name, str):
                for full_name, abbreviation in dictionary.items():
                    if name.startswith(full_name):
                        return abbreviation
            return name

        self.data['Tm'] = self.data['Tm'].map(abbreviate)
        return self.data

    def feature_engineering(self):
        # Create new features or modify existing ones based on domain knowledge
        pass

    def feature_scaling(self):
        # Scale numerical features to a standard range, e.g., using Min-Max scaling or StandardScaler from sklearn
        pass

    def normalize_data(self):
        # Normalize data to reduce the impact of outliers or skewed distributions, e.g., using log transformation or Box-Cox transformation
        pass

    def preprocess_data(self):
        """Run the cleaning steps in order and return the cleaned frame."""
        self.convert_columns_to_float()
        self.flatten_multiindex_header()
        self.handle_missing_values()
        self.replace_team_names(dictionary=self.dictionary)
        self.calculate_yards_per_attempt()
        self.calculate_yards_per_reception()
        return self.data

In [None]:
class MergeAndProcess:
    """Attach team-level columns to player rows and derive modelling columns.

    Neither input frame is modified, so the cell that builds the merged data
    can be re-run without re-creating its inputs.
    """

    def __init__(self, player_data, team_data):
        self.player_data = player_data
        self.team_data = team_data

    def merge(self):
        """Left-join team columns onto player rows by ('Year', 'Tm').

        Every team column other than the keys is added as ``Team_<col>``, so
        a team column that shares its name with a player column can no
        longer overwrite the player's values.

        Returns:
            A new DataFrame with 'Year' and 'Tm' as the first two columns,
            followed by the player columns and then the ``Team_*`` columns.

        Raises:
            pandas.errors.MergeError: If team data holds more than one row
                for the same ('Year', 'Tm').
        """
        keys = ['Year', 'Tm']
        team_columns = {col: f'Team_{col}' for col in self.team_data.columns if col not in keys}
        team_renamed = self.team_data.rename(columns=team_columns)

        merged = self.player_data.merge(team_renamed, on=keys, how='left', validate='many_to_one')

        # Keep the key columns in front, as the index-based merge used to return them.
        ordered = keys + [col for col in merged.columns if col not in keys]
        return merged[ordered]

    def process(self):
        """Merge, add the target and per-game columns, fix dtypes, drop NaN rows.

        Returns:
            A DataFrame with a fresh RangeIndex containing only complete rows.
            Object columns are converted to float where every value parses as
            a number and to pandas 'string' dtype otherwise.
        """
        merged_data = self.merge()

        # Strip punctuation from player names BEFORE grouping, so that name
        # decorations which differ between seasons do not split one player
        # into several groups. regex=True is required for the pattern to be
        # treated as a regular expression.
        merged_data['Player'] = (
            merged_data['Player'].str.replace(r'[^\w\s]+', '', regex=True).str.strip()
        )

        # Next PPR of the same player, in season order regardless of row
        # order. The result is aligned back on the original index.
        # NOTE(review): this is the player's next *available* season; a
        # player who skipped a season gets the value from a later year.
        by_season = merged_data.sort_values('Year', kind='stable')
        merged_data['next_year_PPR'] = by_season.groupby('Player')['PPR'].shift(-1)

        # PPR per game; zero games gives NaN rather than inf
        merged_data['PPR_per_game'] = np.nan
        if 'G' in merged_data.columns:
            games = pd.to_numeric(merged_data['G'], errors='coerce').replace(0, np.nan)
            points = pd.to_numeric(merged_data['PPR'], errors='coerce')
            merged_data['PPR_per_game'] = points / games
        else:
            warnings.warn(
                "Column 'G' not found: PPR_per_game is all-NaN and dropna() will remove every row.",
                stacklevel=2,
            )

        # Convert text-like columns: numeric where possible, 'string' otherwise
        for col in merged_data.columns:
            series = merged_data[col]
            if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
                try:
                    merged_data[col] = pd.to_numeric(series, errors='raise').astype('float')
                except (ValueError, TypeError):
                    merged_data[col] = series.astype('string')

        # Drop incomplete rows (this includes each player's last season,
        # which has no next_year_PPR) and renumber.
        merged_data = merged_data.dropna().reset_index(drop=True)

        return merged_data

In [None]:
# Seasons to scrape: 2013 through 2022 inclusive (range's stop is exclusive).
years = [*range(2013, 2023)]

In [None]:
# One scraper serves both the player and the team tables.
scraper = DataScraper(years)

# Player tables: download, then clean.
player_data = scraper.scrape_player_data()
player_preprocessor = DataPreprocessor(player_data)
player_data = player_preprocessor.preprocess_data()

# Team tables: download, then clean with the same pipeline.
team_data = scraper.scrape_team_data()
team_preprocessor = DataPreprocessor(team_data)
team_data = team_preprocessor.preprocess_data()

In [None]:
# Build the merger from the cleaned player and team frames
merger = MergeAndProcess(player_data=player_data, team_data=team_data)

# process() merges the two frames and derives the modelling columns
merged_data = merger.process()

In [None]:
# Hold out the most recent season present in merged_data as the test set;
# every earlier season is used for training/validation.
latest_year = merged_data['Year'].max()
train_val_data = merged_data[merged_data['Year'] < latest_year]
test_data = merged_data[merged_data['Year'] == latest_year]

In [None]:
# Display the training/validation frame as this cell's output.
train_val_data

In [None]:
# Display the held-out test frame as this cell's output.
test_data

In [None]:
def preprocess_autoencoder_data(data, scaler=None):
    """Scale the numeric columns of `data` for use as autoencoder input.

    Args:
        data: DataFrame; non-numeric columns are ignored.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given, it is applied as-is, so held-out data can be scaled with
            the statistics learned on the training data. When omitted, a new
            ``MinMaxScaler`` is fitted on `data`.

    Returns:
        Tuple ``(normalized_data, scaler)``: the scaled values and the scaler
        that produced them.
    """
    # Remove non-numeric columns
    numeric_data = data.select_dtypes(include=[np.number])

    if scaler is None:
        # Fit a fresh Min-Max scaler on this data
        scaler = MinMaxScaler()
        normalized_data = scaler.fit_transform(numeric_data)
    else:
        # Reuse the caller's fitted scaler without refitting
        normalized_data = scaler.transform(numeric_data)

    return normalized_data, scaler

In [None]:
def autoencoder(input_dim, encoding_dim=64):
    """Build a single-hidden-layer autoencoder together with its encoder half.

    Args:
        input_dim: Number of input (and reconstructed output) features.
        encoding_dim: Width of the hidden encoding layer.

    Returns:
        Tuple ``(autoencoder_model, encoder_model)``. Both models share the
        same input and encoding layers; only the autoencoder is compiled
        (Adam optimizer, MSE loss).
    """
    inputs = Input(shape=(input_dim,))

    # Encoder: inputs -> ReLU code; decoder: code -> sigmoid reconstruction
    code = Dense(encoding_dim, activation='relu')(inputs)
    reconstruction = Dense(input_dim, activation='sigmoid')(code)

    # Full model used for training
    autoencoder_model = Model(inputs, reconstruction)

    # Encoder-only view, used later to produce encodings
    encoder_model = Model(inputs, code)

    autoencoder_model.compile(optimizer='adam', loss='mse')

    return autoencoder_model, encoder_model

In [None]:
# Get the unique positions in the dataset
positions = merged_data['FantPos'].unique()

# Create a dictionary to store data for each position
position_dfs = {position: train_val_data[train_val_data['FantPos'] == position] for position in positions}

# Initialize dictionaries to store models and performance metrics
autoencoders = {}
regression_models = {}
mae_scores = {}

for position in positions:
    # Get the position-specific data
    train_val_data_pos = position_dfs[position]
    test_data_pos = test_data[test_data['FantPos'] == position]

    ## Use RFE to select the top 10 features for the position-specific data
    X = train_val_data_pos.drop(['next_year_PPR'], axis=1)
    X = X.select_dtypes(exclude=['string']) # exclude columns with string data
    y = train_val_data_pos['next_year_PPR']

    rfe = RFE(RandomForestRegressor(), n_features_to_select=20)
    rfe.fit(X, y)
    top_20_features = X.columns[rfe.support_].tolist()
    X = X[top_20_features]
    # Modify the train_val_data_pos and test_data_pos to include only the top features
    train_val_data_pos = train_val_data_pos[['PPR', 'next_year_PPR'] + top_20_features]
    test_data_pos = test_data_pos[['PPR', 'next_year_PPR'] + top_20_features]

    # Preprocess the data for the autoencoder
    train_val_normalized_pos, train_val_scaler_pos = preprocess_autoencoder_data(train_val_data_pos)
    test_normalized_pos, test_scaler_pos = preprocess_autoencoder_data(test_data_pos)
    
    # Define the autoencoder architecture for the position
    autoencoder_pos, encoder_pos = autoencoder(train_val_normalized_pos.shape[1]) 
    
    # Train the autoencoder using the position-specific train_val_normalized dataset
    autoencoder_pos.fit(train_val_normalized_pos, train_val_normalized_pos, epochs=100, batch_size=32)

    # Encode the position-specific train_val_normalized and test_normalized datasets
    train_val_encoded_pos = encoder_pos.predict(train_val_normalized_pos)
    test_encoded_pos = encoder_pos.predict(test_normalized_pos)

    # Train a regression model using the encoded representation of the position-specific train_val_normalized dataset
    regression_model_pos = RandomForestRegressor()
    regression_model_pos.fit(train_val_encoded_pos, train_val_data_pos['next_year_PPR'])

    # Evaluate the model's performance on the encoded position-specific test_normalized dataset
    predictions_pos = regression_model_pos.predict(test_encoded_pos)
    actual_values_pos = test_data_pos['next_year_PPR']

    # Calculate evaluation metrics, e.g., mean_absolute_error
    mae_pos = mean_absolute_error(actual_values_pos, predictions_pos)

    # Save the autoencoder, regression model, and performance metrics for the position
    autoencoders[position] = autoencoder_pos
    regression_models[position] = regression_model_pos
    mae_scores[position] = mae_pos

# Print the evaluation metrics for each position
for position, mae_pos in mae_scores.items():
    print(f'Mean Absolute Error for {position}: {mae_pos}')

In [None]:
# Create a new dataframe to store the test data and predictions
all_test_data = pd.DataFrame()

# Iterate over each position in the positions list
for position in positions:
    # Get the position-specific test_normalized dataset
    test_normalized_pos = test_normalized[position]

    # Encode the position-specific test_normalized dataset
    encoder_pos = autoencoders[position].get_layer(index=1) # Get the encoder from the autoencoder
    test_encoded_pos = encoder_pos(test_normalized_pos)
    
    # Use the trained regression model to predict the next_year_PPR values
    predictions_pos = regression_models[position].predict(test_encoded_pos)

    # Add the predictions to the position-specific test_data dataframe
    test_data_pos = test_data[position]
    test_data_pos['predictions'] = predictions_pos

    # Append the position-specific test_data dataframe to the combined test_data dataframe
    all_test_data = all_test_data.append(test_data_pos)

# Merge the all_test_data dataframe with the original merged_data dataframe
merged_data_with_predictions = pd.merge(merged_data, all_test_data[['predictions']], left_index=True, right_index=True, how='left')

# Visualize the predictions against the actual values
plt.figure(figsize=(12, 6))
sns.scatterplot(x='next_year_PPR', y='predictions', data=merged_data_with_predictions, hue='FantPos', palette='viridis')
plt.xlabel('Actual Next Year PPR')
plt.ylabel('Predicted Next Year PPR')
plt.title('Predicted vs. Actual Next Year PPR')
plt.show()