In [35]:
import numpy as np
from typing import List
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from PFR_Scraper import DataScraper 

In [81]:
class DataPreprocessor:
    
    _dictionary = {'New York Giants': 'NYG',
                    'Las Vegas Raiders': 'LVR',
                    'Los Angeles Chargers': 'LAC',
                    'Denver Broncos': 'DEN',
                    'Green Bay Packers': 'GNB',
                    'Jacksonville Jaguars': 'JAX',
                    'Washington Redskins': 'WAS',
                    'Los Angeles Rams': 'LAR',
                    'Arizona Cardinals': 'ARI',
                    'Carolina Panthers': 'CAR',
                    'Baltimore Ravens': 'BAL',
                    'New York Jets': 'NYJ',
                    'Miami Dolphins': 'MIA',
                    'Minnesota Vikings': 'MIN',
                    'Oakland Raiders': 'OAK',
                    'Chicago Bears': 'CHI',
                    'New England Patriots': 'NWE',
                    'Tennessee Titans': 'TEN',
                    'New Orleans Saints': 'NOR',
                    'Cleveland Browns': 'CLE',
                    'Tampa Bay Buccaneers': 'TAM',
                    'Buffalo Bills': 'BUF',
                    'Cincinnati Bengals': 'CIN',
                    'Houston Texans': 'HOU',
                    'San Francisco 49ers': 'SFO',
                    'Atlanta Falcons': 'ATL',
                    'Washington Football Team': 'WAS',
                    'Indianapolis Colts': 'IND',
                    'Seattle Seahawks': 'SEA',
                    'Pittsburgh Steelers': 'PIT',
                    'Dallas Cowboys': 'DAL',
                    'Detroit Lions': 'DET',
                    'Philadelphia Eagles': 'PHI',
                    'Kansas City Chiefs': 'KAN'}
    
    def __init__(self, data):
        self.data = data
        
    def preprocess_data(self):
        self._print_data_header("Initial Data")
        self._flatten_multiindex_header()
        self._print_data_header("After Flattening MultiIndex Header")
        self._convert_elements_to_float_or_string()  # Make sure this line is before _calculate_yards_per_play()
        #self._handle_missing_values()
        self._print_data_header("After Handling Missing Values")
        #self._calculate_yards_per_play()
        self._print_data_header("After Calculating Yards per Play")
        self._replace_team_names(self._dictionary)
        self._print_data_header("After Renaming Columns")

        return self.data
        
    def _convert_elements_to_float_or_string(self):
        for col in self.data.columns:
            self.data[col] = self.data[col].apply(lambda x: pd.to_numeric(x, errors='coerce'))
            self.data[col] = self.data[col].apply(lambda x: str(x) if not pd.isna(x) and not isinstance(x, str) else x)
            self.data[col].replace('', np.nan, inplace=True)
            
    def _flatten_multiindex_header(self):
        if isinstance(self.data.columns, pd.MultiIndex):
            level1 = self.data.columns.get_level_values(0)
            level2 = self.data.columns.get_level_values(1)
            duplicates = level2.value_counts() > 1
            
            self.data.columns = [
                f'{col_level1}_{col_level2}' if duplicates[col_level2] else
                col_level1 if col_level2 == '' else
                col_level2
                for col_level1, col_level2 in zip(level1, level2)
            ]
        return self.data

    def _calculate_yards_per_play(self):
        # Calculate yards per play for rushing and receiving
        if 'Rushing_Att' in self.data.columns and 'Rushing_Yds' in self.data.columns:
            self.data['Y/A'] = self.data.apply(lambda x: 0 if (pd.isna(x['Rushing_Att']) or x['Rushing_Att'] == 0) else x['Rushing_Yds'] / x['Rushing_Att'], axis=1)
        if 'Rec' in self.data.columns and 'Receiving_Yds' in self.data.columns:
            self.data['Y/R'] = self.data.apply(lambda x: 0 if (pd.isna(x['Rec']) or x['Rec'] == 0) else x['Receiving_Yds'] / x['Rec'], axis=1)
            
    def _handle_missing_values(self, thresh=0.5):
        # Drop rows where most of the columns have string data
        str_cols = self.data.select_dtypes(include=['object']).columns
        str_counts = self.data[str_cols].apply(lambda x: sum(x.apply(lambda y: isinstance(y, str))), axis=1)
        str_prop = str_counts / len(str_cols)
        self.data = self.data[str_prop <= thresh]

        # Fill applicable missing data
        should_fill_mask = (self.data.isnull() | (self.data == 0)).sum() / len(self.data) >= thresh
        cols_to_fill = should_fill_mask[should_fill_mask == True].index.tolist()
        self.data[cols_to_fill] = self.data[cols_to_fill].fillna(0)

        # Drop rows where most of the columns are null
        self.data.dropna(axis=0, thresh=len(self.data.columns) * thresh, inplace=True)

        # Drop rows where PPR data is null (if present in dataframe)
        if 'PPR' in self.data.columns:
            self.data.dropna(subset=['PPR'], inplace=True)

        # Drop rows where Rk > 400 if Rk is a column
        if 'Rk' in self.data.columns:
            self.data = self.data[self.data['Rk'] <= 400]
        
    def _replace_team_names(self, _dictionary):
        if 'Tm' not in self.data.columns:
            return self.data

        for key, value in _dictionary.items():
            mask = self.data['Tm'].str.startswith(key)
            if mask.any():
                self.data.loc[mask, 'Tm'] = value

        return self.data
    
    def _print_data_header(self, title):
        print(f"Data Header: {title}")
        print(self.data.head())

In [53]:
class MergeAndProcess:
    
    def __init__(self, player_data, team_data):
        self.player_data = player_data
        self.team_data = team_data

    def merge(self):
        # Create a dictionary to map team data to new columns in player data
        team_data_map = {
            col: col if col in self.player_data.columns else f'Team_{col}' 
            for col in self.team_data.columns 
            if col not in ['Year', 'Tm']
        }

        # Create new columns in player data for team data
        self.player_data = self.player_data.assign(**{col: np.nan for col in team_data_map.values()})

        # Map team data to new columns in player data based on team and year
        self.player_data = self.player_data.set_index(['Year', 'Tm'])
        self.team_data = self.team_data.set_index(['Year', 'Tm'])
        self.player_data.update(self.team_data.rename(columns=team_data_map))

        # Flatten the column index
        if isinstance(self.player_data.columns, pd.MultiIndex):
            self.player_data.columns = self.player_data.columns.map(lambda x: x[1] if x[0].startswith('Team_') else x[0])

        return self.player_data.reset_index()
    
    def process(self):
        # Merge player and team data
        merged_data = self.merge()

        # Add next year PPR
        merged_data['next_year_PPR'] = merged_data.groupby('Player')['PPR'].shift(-1)

        # Add PPR per game
        merged_data['PPR_per_game'] = merged_data['PPR'] / merged_data['G']
        merged_data['PPR_per_game'].fillna(np.nan, inplace=True)

        # Convert column types
        for col in merged_data.select_dtypes(include=['object']).columns:
            try:
                merged_data[col] = pd.to_numeric(merged_data[col], errors='raise')
            except ValueError:
                merged_data[col] = merged_data[col].astype('string')

        # Drop missing values
        merged_data.loc[merged_data['Year'] != 2022] = merged_data.loc[merged_data['Year'] != 2022].dropna()

        # Replace non-alphanumeric characters in player names
        merged_data['Player'] = merged_data['Player'].str.replace(r'[^\w\s]+', '')

        # Reset index
        merged_data.reset_index(drop=True, inplace=True)

        return merged_data

In [83]:
years = list(range(2020, 2023))

In [84]:
scraper = DataScraper(years)
player_data = scraper.scrape_player_data()
team_data = scraper.scrape_team_data()


In [86]:

player_preprocessor = DataPreprocessor(player_data)
processed_player_data = player_preprocessor.preprocess_data()


Data Header: Initial Data
  Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0 Unnamed: 3_level_0  \
                  Rk             Player                 Tm            FantPos   
0                  1   Derrick Henry *+                TEN                 RB   
1                  2      Alvin Kamara*                NOR                 RB   
2                  3       Dalvin Cook*                MIN                 RB   
3                  4    Davante Adams*+                GNB                 WR   
4                  5     Travis Kelce*+                KAN                 TE   

  Unnamed: 4_level_0 Games     Passing          ... Scoring      Fantasy  \
                 Age     G  GS     Cmp Att Yds  ...     2PM  2PP  FantPt   
0                 26    16  16       0   0   0  ...       1  NaN     314   
1                 25    15  10       0   0   0  ...     NaN  NaN     295   
2                 25    14  14       0   0   0  ...       3  NaN     294   
3                 28    14

AttributeError: Can only use .str accessor with string values!

In [87]:
player_data

Unnamed: 0,Rk,Player,Tm,FantPos,Age,G,GS,Cmp,Passing_Att,Passing_Yds,...,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,Year
0,1.0,,,,26.0,16.0,16.0,0.0,0.0,0.0,...,1.0,,314.0,333.1,341.1,323.6,184.0,1.0,1.0,2020
1,2.0,,,,25.0,15.0,10.0,0.0,0.0,0.0,...,,,295.0,377.8,383.8,336.3,165.0,2.0,2.0,2020
2,3.0,,,,25.0,14.0,14.0,0.0,0.0,0.0,...,3.0,,294.0,337.8,346.8,315.8,164.0,3.0,3.0,2020
3,4.0,,,,28.0,14.0,14.0,0.0,0.0,0.0,...,,,243.0,358.4,362.4,300.9,117.0,1.0,4.0,2020
4,5.0,,,,31.0,15.0,15.0,1.0,2.0,4.0,...,1.0,,208.0,312.8,316.8,260.3,117.0,1.0,5.0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,646.0,,,,23.0,8.0,0.0,0.0,0.0,0.0,...,,,-1.0,0.4,1.4,-0.1,,252.0,,2022
2037,647.0,,,,26.0,17.0,0.0,0.0,0.0,0.0,...,,,-1.0,-0.4,1.6,-0.9,,254.0,,2022
2038,648.0,,,,34.0,16.0,0.0,0.0,0.0,0.0,...,,,-2.0,-2.0,-1.0,-2.0,,173.0,,2022
2039,649.0,,,,28.0,3.0,0.0,0.0,0.0,0.0,...,,,-2.0,-2.0,-1.0,-2.0,,255.0,,2022


In [80]:
processed_player_data

NameError: name 'processed_player_data' is not defined

In [None]:

team_preprocessor = DataPreprocessor(team_data)
team_data = team_preprocessor.preprocess_data()

In [73]:
player_data

Unnamed: 0,Rk,Player,Tm,FantPos,Age,G,GS,Cmp,Passing_Att,Passing_Yds,...,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,Year


In [18]:
# create instance of MergeAndProcess class
merger = MergeAndProcess(player_data, team_data)

# call process_data method to merge and process data
merged_data = merger.process()


KeyError: "None of ['Tm'] are in the columns"

In [None]:

merged_data