In [None]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestRegressor

In [None]:
class DataScraper:
    """Download HTML tables from pro-football-reference.com for a set of seasons.

    Attributes:
        years: List of the seasons to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.
        player_urls: One ``fantasy.htm`` URL per season.
        team_url: URL template for a season's landing page (``{}`` is the year).
    """

    def __init__(self, years, timeout=30):
        """Store the seasons and build the per-season player URLs.

        Args:
            years: Iterable of seasons (e.g. ``range(2017, 2022)``).
            timeout: Per-request timeout in seconds.
        """
        # Materialise the iterable: it is walked once here and again in
        # scrape_team_data, which would silently yield nothing for a generator.
        self.years = list(years)
        self.timeout = timeout
        self.player_urls = [f'https://www.pro-football-reference.com/years/{year}/fantasy.htm' for year in self.years]
        self.team_url = 'https://www.pro-football-reference.com/years/{}/'

    def scrape_data(self, urls):
        """Fetch each URL and stack the first two HTML tables found on every page.

        Args:
            urls: Iterable of page URLs whose second-to-last path segment is the
                season (both URL shapes built by this class satisfy that).

        Returns:
            A single DataFrame with a float ``Year`` column added, or an empty
            DataFrame when no page yielded a table.

        Raises:
            requests.HTTPError: If a page answers with a 4xx/5xx status, so that a
                blocked or rate-limited request is not mistaken for "no data".
            requests.Timeout: If a page does not answer within ``self.timeout``.
        """
        data_frames = []
        for url in urls:
            year = url.split('/')[-2]  # Extract the year from the URL
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            tables = soup.find_all('table')

            # read_html expects a file-like object; passing the markup as a bare
            # string is deprecated in recent pandas releases.
            df_list = [pd.read_html(StringIO(str(table)))[0] for table in tables[:2]]
            if df_list:
                df = pd.concat(df_list)
                df['Year'] = float(year)
                data_frames.append(df)

        # pd.concat raises on an empty list; return an empty frame instead.
        if not data_frames:
            return pd.DataFrame()
        return pd.concat(data_frames, ignore_index=True)

    def scrape_player_data(self):
        """Scrape the per-season player pages listed in ``self.player_urls``."""
        return self.scrape_data(self.player_urls)

    def scrape_team_data(self):
        """Scrape the per-season landing pages built from ``self.team_url``."""
        team_urls = [self.team_url.format(year) for year in self.years]
        return self.scrape_data(team_urls)

In [None]:
class DataPreprocessor:
    """Clean a scraped table held in ``self.data``.

    Each step replaces or mutates ``self.data``; ``preprocess_data`` runs the
    steps in the intended order and returns the result.
    """

    # Full franchise name -> abbreviation written into the 'Tm' column by
    # replace_team_names.
    dictionary = {
        'New York Giants': 'NYG',
        'Las Vegas Raiders': 'LVR',
        'Los Angeles Chargers': 'LAC',
        'Denver Broncos': 'DEN',
        'Green Bay Packers': 'GNB',
        'Jacksonville Jaguars': 'JAX',
        'Washington Redskins': 'WAS',
        'Los Angeles Rams': 'LAR',
        'Arizona Cardinals': 'ARI',
        'Carolina Panthers': 'CAR',
        'Baltimore Ravens': 'BAL',
        'New York Jets': 'NYJ',
        'Miami Dolphins': 'MIA',
        'Minnesota Vikings': 'MIN',
        'Oakland Raiders': 'OAK',
        'Chicago Bears': 'CHI',
        'New England Patriots': 'NWE',
        'Tennessee Titans': 'TEN',
        'New Orleans Saints': 'NOR',
        'Cleveland Browns': 'CLE',
        'Tampa Bay Buccaneers': 'TAM',
        'Buffalo Bills': 'BUF',
        'Cincinnati Bengals': 'CIN',
        'Houston Texans': 'HOU',
        'San Francisco 49ers': 'SFO',
        'Atlanta Falcons': 'ATL',
        'Washington Football Team': 'WAS',
        'Indianapolis Colts': 'IND',
        'Seattle Seahawks': 'SEA',
        'Pittsburgh Steelers': 'PIT',
        'Dallas Cowboys': 'DAL',
        'Detroit Lions': 'DET',
        'Philadelphia Eagles': 'PHI',
        'Kansas City Chiefs': 'KAN',
    }

    def __init__(self, data):
        """Keep a reference to the DataFrame to clean (it is not copied)."""
        self.data = data

    def try_convert_to_float(self, x):
        """Return ``x`` as a float when possible.

        Empty strings and missing values become NaN; anything else that cannot
        be converted is returned unchanged.
        """
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            if x == '' or pd.isna(x):
                return np.nan
            return x

    def convert_columns_to_float(self):
        """Convert every cell, then every column, to float where possible."""
        # Column-wise Series.map replaces the deprecated DataFrame.applymap and
        # works on every pandas version.
        self.data = self.data.apply(lambda column: column.map(self.try_convert_to_float))
        self.data = self.data.astype(float, errors='ignore')

    def flatten_multiindex_header(self):
        """Collapse a two-level column header into plain string column names.

        A column keeps its second-level name unless that name occurs more than
        once, in which case it becomes ``"<level0>_<level1>"``. A column whose
        second level is empty always takes its first-level name.

        Returns:
            ``self.data`` (unchanged if the header is already single-level).
        """
        if isinstance(self.data.columns, pd.MultiIndex):
            upper = self.data.columns.get_level_values(0)
            lower = self.data.columns.get_level_values(1)
            repeated = lower.duplicated(keep=False)

            new_columns = []
            for upper_name, lower_name, is_repeated in zip(upper, lower, repeated):
                if lower_name == '':
                    # Checked first so several empty second levels do not turn
                    # into names with a dangling underscore.
                    new_columns.append(upper_name)
                elif is_repeated:
                    new_columns.append(f'{upper_name}_{lower_name}')
                else:
                    new_columns.append(lower_name)

            self.data.columns = new_columns
        return self.data

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, with 0 where the denominator is NaN or 0."""
        top = pd.to_numeric(numerator, errors='coerce')
        bottom = pd.to_numeric(denominator, errors='coerce')
        usable = bottom.notna() & (bottom != 0)
        return (top / bottom).where(usable, 0.0)

    def calculate_yards_per_attempt(self):
        """Add ``Y/A`` = ``Rushing_Yds`` / ``Rushing_Att`` when both columns exist."""
        if 'Rushing_Att' in self.data.columns and 'Rushing_Yds' in self.data.columns:
            self.data['Y/A'] = self._safe_ratio(self.data['Rushing_Yds'], self.data['Rushing_Att'])

    def calculate_yards_per_reception(self):
        """Add ``Y/R`` = ``Receiving_Yds`` / ``Rec`` when both columns exist."""
        if 'Rec' in self.data.columns and 'Receiving_Yds' in self.data.columns:
            self.data['Y/R'] = self._safe_ratio(self.data['Receiving_Yds'], self.data['Rec'])

    def handle_missing_values(self, thresh=0.5):
        """Drop sparse columns, sparse rows, rows without PPR and mostly-text rows.

        Args:
            thresh: Fraction in [0, 1]. Columns with a larger share of nulls are
                dropped; rows need at least this share of non-null cells; rows
                with a larger share of string cells are dropped.
        """
        # Drop columns where most of the rows are null
        null_share = self.data.isnull().mean()
        sparse_columns = self.data.columns[(null_share > thresh).to_numpy()]
        self.data = self.data.drop(columns=sparse_columns, errors='ignore')

        # Drop rows where most of the columns are null. dropna keeps rows with
        # count >= thresh, so rounding up keeps the same rows while passing an int.
        min_non_null = int(np.ceil(thresh * len(self.data.columns)))
        self.data = self.data.dropna(thresh=min_non_null)

        # Drop rows where PPR data is null (if present in dataframe)
        if 'PPR' in self.data.columns:
            self.data = self.data.dropna(subset=['PPR'])

        # Drop rows where most of the columns have string data. otypes is set
        # explicitly because np.vectorize cannot infer it from an empty input.
        is_string = np.vectorize(isinstance, otypes=[bool])(self.data.to_numpy(), str)
        mostly_strings = is_string.sum(axis=1) > thresh * len(self.data.columns)
        self.data = self.data[~mostly_strings]

    def replace_team_names(self, dictionary):
        """Replace full team names in ``Tm`` with their abbreviations.

        A cell is replaced when it *starts with* a key of ``dictionary``
        (presumably so names carrying trailing markers still match — confirm
        against the scraped data). Missing or non-string cells are left alone,
        and a frame without a ``Tm`` column is returned unchanged.

        Args:
            dictionary: Mapping of full team name -> abbreviation.

        Returns:
            ``self.data``.
        """
        if 'Tm' not in self.data.columns:
            return self.data

        for full_name, abbreviation in dictionary.items():
            # astype(str) yields a strictly boolean mask even when the column
            # holds NaN or numbers, which .loc requires.
            mask = self.data['Tm'].astype(str).str.startswith(full_name)
            if mask.any():
                self.data.loc[mask, 'Tm'] = abbreviation
        return self.data

    def feature_engineering(self):
        """Placeholder: create or modify features based on domain knowledge."""

    def feature_scaling(self):
        """Placeholder: scale numerical features (e.g. Min-Max or StandardScaler)."""

    def normalize_data(self):
        """Placeholder: reduce the impact of outliers/skew (e.g. log or Box-Cox)."""

    def preprocess_data(self):
        """Run every cleaning step in order and return the cleaned DataFrame."""
        self.convert_columns_to_float()
        self.flatten_multiindex_header()
        self.handle_missing_values()
        self.replace_team_names(dictionary=self.dictionary)
        self.calculate_yards_per_attempt()
        self.calculate_yards_per_reception()
        return self.data

In [None]:
class MergeData:
    """Attach team-level columns to player rows, matched on ``Year`` and ``Tm``.

    Both frames are expected to have flat (single-level) column names and to
    contain the ``Year`` and ``Tm`` columns. Neither input frame is modified.
    """

    # Columns that identify a team-season in both frames.
    _KEYS = ['Year', 'Tm']

    def __init__(self, player_data, team_data):
        """Keep references to the player and team DataFrames."""
        self.player_data = player_data
        self.team_data = team_data

    def add_team_data(self):
        """Return player data with every team column added as ``Team_<name>``.

        Every non-key team column is prefixed, so a team column can never
        overwrite a player column that happens to share its name. Player rows
        with no matching team-season get NaN in the team columns.

        Returns:
            A new DataFrame with a fresh integer index and ``Year``, ``Tm`` as
            the first two columns.

        Raises:
            KeyError: If ``Year`` or ``Tm`` is missing from either frame.
            pandas.errors.MergeError: If the team data holds more than one row
                for the same ``Year``/``Tm`` pair.
        """
        keys = list(self._KEYS)
        renamed = {col: f'Team_{col}' for col in self.team_data.columns if col not in keys}
        teams = self.team_data.rename(columns=renamed)

        # Drop team columns left over from an earlier merge so they are
        # refreshed instead of colliding (which would produce _x/_y suffixes).
        stale = [col for col in renamed.values() if col in self.player_data.columns]
        players = self.player_data.drop(columns=stale)

        merged = players.merge(teams, on=keys, how='left', validate='many_to_one')
        ordered = keys + [col for col in merged.columns if col not in keys]
        return merged[ordered]

    def add_and_flatten(self):
        """Return the merged player/team DataFrame (see ``add_team_data``)."""
        return self.add_team_data()

In [None]:
# Seasons to scrape: 2017 through 2021 inclusive (range() excludes its stop value).
years = [*range(2017, 2022)]

In [None]:
# Scrape the raw player and team tables for every season in `years`.
scraper = DataScraper(years)
player_data = scraper.scrape_player_data()
team_data = scraper.scrape_team_data()

# Clean each table with the same preprocessing steps.
player_preprocessor = DataPreprocessor(player_data)
player_data = player_preprocessor.preprocess_data()

team_preprocessor = DataPreprocessor(team_data)
team_data = team_preprocessor.preprocess_data()

# Create a MergeData object
merger = MergeData(player_data, team_data)

# Call the add_and_flatten method to merge the data and flatten the column index
merged_data = merger.add_and_flatten()

# Strip every character that is neither a word character nor whitespace from
# player names. regex=True is required: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, which would match nothing.
merged_data['Player'] = merged_data['Player'].str.replace(r'[^\w\s]+', '', regex=True)
merged_data = merged_data.dropna()

In [None]:
def add_next_year_ppr(data):
    """Add ``next_year_PPR``: the same player's PPR in the immediately following season.

    Rows are matched by ``Player`` name and ``Year``. The value is NaN when the
    player has no row for ``Year + 1`` (last recorded season, or a gap between
    seasons), so a later, non-consecutive season is never used as the target.

    Args:
        data: DataFrame with ``Player``, ``Year`` and ``PPR`` columns. It is
            modified in place; its index may be arbitrary.

    Returns:
        The same ``data`` object with the ``next_year_PPR`` column added.
    """
    # Work on a positional copy so the result can be written back regardless
    # of what the caller's index looks like.
    seasons = pd.DataFrame({
        'Player': data['Player'].to_numpy(),
        'Year': pd.to_numeric(data['Year'], errors='coerce').to_numpy(),
        'PPR': pd.to_numeric(data['PPR'], errors='coerce').to_numpy(),
    })
    seasons = seasons.sort_values(['Player', 'Year'])

    by_player = seasons.groupby('Player', sort=False)
    following_ppr = by_player['PPR'].shift(-1)
    following_year = by_player['Year'].shift(-1)

    # Keep the shifted value only when it really belongs to the next season.
    is_next_season = following_year == seasons['Year'] + 1
    next_ppr = following_ppr.where(is_next_season)

    # sort_index restores the original row order before the positional write-back.
    data['next_year_PPR'] = next_ppr.sort_index().to_numpy()
    return data

In [None]:
def add_ppr_per_game(data):
    """Add ``PPR_per_game`` = ``PPR`` / ``G``.

    The column is NaN wherever either value is missing or non-numeric, or the
    number of games is zero (instead of producing ``inf``). If ``PPR`` or ``G``
    is absent the column is added but left entirely NaN.

    Args:
        data: DataFrame to extend; it is modified in place.

    Returns:
        The same ``data`` object with the ``PPR_per_game`` column added.
    """
    data['PPR_per_game'] = np.nan

    if 'PPR' not in data.columns or 'G' not in data.columns:
        return data

    points = pd.to_numeric(data['PPR'], errors='coerce')
    games = pd.to_numeric(data['G'], errors='coerce')
    # Mask zero games so the division yields NaN rather than +/-inf.
    data['PPR_per_game'] = points / games.where(games != 0)
    return data

In [None]:
# Add the prediction target and the per-game rate, then keep only complete rows.
merged_data = (
    merged_data
    .pipe(add_next_year_ppr)
    .pipe(add_ppr_per_game)
    .dropna()
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
merged_data.head()

In [None]:
# TODO: For each unique position, create a new DataFrame containing all rows for that position.
# TODO: Create a loop that iterates over each year. In each iteration, train a new LSTM network on the data from that year and all previous years, then use the trained network to make predictions for the current year. Save the trained network for each year so it can be reused in future iterations.
# TODO: The input data should be all of the numerical columns, and the output (predicted value) is next_year_PPR.