In [8]:
import numpy as np
from typing import List
from bs4 import BeautifulSoup
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from PFR_Scraper import DataScraper 

In [9]:
class DataPreprocessor:
    """Clean a scraped statistics table held in a pandas DataFrame.

    The instance wraps one DataFrame (``self.data``) and ``preprocess_data``
    runs the cleaning steps in a fixed order: per-cell numeric conversion,
    header flattening, removal of text-heavy / sparse rows and columns,
    team-name abbreviation, and yards-per-play columns.
    """

    # Full team name -> abbreviation used in the 'Tm' column.
    # NOTE(review): there is no entry for e.g. 'Washington Commanders',
    # 'San Diego Chargers' or 'St. Louis Rams'; rows carrying a name that is
    # not listed here keep their full name. Confirm whether the scraped
    # seasons need those entries.
    _dictionary = {
        'New York Giants': 'NYG',
        'Las Vegas Raiders': 'LVR',
        'Los Angeles Chargers': 'LAC',
        'Denver Broncos': 'DEN',
        'Green Bay Packers': 'GNB',
        'Jacksonville Jaguars': 'JAX',
        'Washington Redskins': 'WAS',
        'Los Angeles Rams': 'LAR',
        'Arizona Cardinals': 'ARI',
        'Carolina Panthers': 'CAR',
        'Baltimore Ravens': 'BAL',
        'New York Jets': 'NYJ',
        'Miami Dolphins': 'MIA',
        'Minnesota Vikings': 'MIN',
        'Oakland Raiders': 'OAK',
        'Chicago Bears': 'CHI',
        'New England Patriots': 'NWE',
        'Tennessee Titans': 'TEN',
        'New Orleans Saints': 'NOR',
        'Cleveland Browns': 'CLE',
        'Tampa Bay Buccaneers': 'TAM',
        'Buffalo Bills': 'BUF',
        'Cincinnati Bengals': 'CIN',
        'Houston Texans': 'HOU',
        'San Francisco 49ers': 'SFO',
        'Atlanta Falcons': 'ATL',
        'Washington Football Team': 'WAS',
        'Indianapolis Colts': 'IND',
        'Seattle Seahawks': 'SEA',
        'Pittsburgh Steelers': 'PIT',
        'Dallas Cowboys': 'DAL',
        'Detroit Lions': 'DET',
        'Philadelphia Eagles': 'PHI',
        'Kansas City Chiefs': 'KAN',
    }

    def __init__(self, data):
        """Store the DataFrame to be cleaned.

        Args:
            data: pandas DataFrame to preprocess.
        """
        self.data = data

    def preprocess_data(self):
        """Run every cleaning step in order and return the cleaned DataFrame.

        Returns:
            The processed DataFrame (also kept on ``self.data``).
        """
        self._convert_columns_to_float()
        self._flatten_multiindex_header()
        self._handle_missing_values()
        self._replace_team_names(self._dictionary)
        self._calculate_yards_per_play()
        return self.data

    @staticmethod
    def _to_number(value):
        """Return ``value`` parsed as a number, or ``value`` itself if it cannot be parsed."""
        try:
            return pd.to_numeric(value)
        except (ValueError, TypeError):
            return value

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator`` per row, with 0 where the denominator is missing or zero."""
        numerator = pd.to_numeric(numerator, errors='coerce')
        denominator = pd.to_numeric(denominator, errors='coerce')
        usable = denominator.notna() & (denominator != 0)
        # Mask unusable denominators before dividing so no inf is ever produced.
        return (numerator / denominator.where(usable)).where(usable, 0)

    def _convert_columns_to_float(self):
        """Convert every cell that parses as a number; leave other cells untouched.

        The conversion is deliberately per cell (not per column): a column may
        mix numeric strings with text, and the text cells are later used by
        ``_handle_missing_values`` to detect text-heavy rows.
        """
        # Equivalent to the former applymap(pd.to_numeric(errors='ignore')),
        # without relying on either deprecated API.
        self.data = self.data.apply(lambda column: column.map(self._to_number))

    def _flatten_multiindex_header(self):
        """Collapse a MultiIndex column header into single-level names joined by '_'."""
        if isinstance(self.data.columns, pd.MultiIndex):
            self.data.columns = [
                '_'.join(map(str, col)).strip('_') if col[1] else col[0]
                for col in self.data.columns
            ]

    def _calculate_yards_per_play(self):
        """Add 'Y/A' (rushing) and 'Y/R' (receiving) columns when their inputs exist."""
        columns = self.data.columns
        if 'Rushing_Att' in columns and 'Rushing_Yds' in columns:
            self.data['Y/A'] = self._safe_ratio(self.data['Rushing_Yds'], self.data['Rushing_Att'])
        if 'Rec' in columns and 'Receiving_Yds' in columns:
            self.data['Y/R'] = self._safe_ratio(self.data['Receiving_Yds'], self.data['Rec'])

    def _handle_missing_values(self, thresh=0.5, max_rank=400):
        """Drop text-heavy rows, fill sparse columns, and drop mostly-empty rows/columns.

        Args:
            thresh: Proportion used by every step below (default 0.5).
            max_rank: Rows whose 'Rk' exceeds this value are dropped when an
                'Rk' column exists (default 400).
        """
        data = self.data

        # Drop rows in which more than `thresh` of the object-typed cells are
        # strings (presumably header rows repeated inside scraped tables —
        # confirm). Skipped when there are no object columns: dividing by
        # zero columns would yield NaN and silently discard every row.
        text_frame = data.select_dtypes(include=['object'])
        if text_frame.shape[1] > 0:
            is_text = text_frame.apply(
                lambda column: column.map(lambda value: isinstance(value, str))
            )
            str_prop = is_text.sum(axis=1) / text_frame.shape[1]
            data = data[str_prop <= thresh]

        # Work on an independent copy so the assignments below never write
        # into a view of the caller's frame.
        data = data.copy()

        # In columns where at least `thresh` of the values are null or zero,
        # treat the remaining nulls as zero.
        if len(data) > 0:
            sparse_share = (data.isnull() | (data == 0)).sum() / len(data)
            cols_to_fill = sparse_share[sparse_share >= thresh].index.tolist()
            if cols_to_fill:
                data[cols_to_fill] = data[cols_to_fill].fillna(0)

        # Drop columns where most of the rows are null
        data = data.dropna(axis=1, thresh=len(data) * (1 - thresh))

        # Drop rows where most of the columns are null
        data = data.dropna(axis=0, thresh=len(data.columns) * thresh)

        # Drop rows where PPR data is null (if present in dataframe)
        if 'PPR' in data.columns:
            data = data.dropna(subset=['PPR'])

        # Drop rows ranked beyond max_rank; non-numeric ranks become NaN and
        # are dropped as well instead of raising on the comparison.
        if 'Rk' in data.columns:
            rank = pd.to_numeric(data['Rk'], errors='coerce')
            data = data[rank <= max_rank]

        self.data = data

    def _replace_team_names(self, _dictionary):
        """Replace full team names in 'Tm' with their abbreviations.

        A row is replaced when its 'Tm' value starts with a key of
        ``_dictionary``; rows that match no key (including missing values)
        are left unchanged.

        Args:
            _dictionary: Mapping of full team name -> abbreviation.

        Returns:
            The DataFrame held by the instance.
        """
        if 'Tm' not in self.data.columns:
            return self.data

        # Compare on a string view so missing / non-text values give a plain
        # False instead of a NaN in the mask (which .loc refuses).
        team_names = self.data['Tm'].astype(str)
        for full_name, abbreviation in _dictionary.items():
            mask = team_names.str.startswith(full_name)
            if mask.any():
                self.data.loc[mask, 'Tm'] = abbreviation

        return self.data

In [10]:
class MergeAndProcess:
    """Attach team-level columns to player rows and derive per-player features.

    Both input frames are expected to have single-level columns that include
    'Year' and 'Tm'; ``process`` additionally reads 'Player', 'PPR' and 'G'
    from the player frame.
    """

    def __init__(self, player_data, team_data):
        """Store the two input frames; neither is modified by this class.

        Args:
            player_data: DataFrame with one row per player and season.
            team_data: DataFrame with one row per ('Year', 'Tm') pair.
        """
        self.player_data = player_data
        self.team_data = team_data

    def merge(self):
        """Left-join team columns onto the player rows by ('Year', 'Tm').

        Every team column other than the join keys is added as ``Team_<name>``,
        so a team column that shares its name with a player column (for
        example 'G') no longer overwrites the player's own values. Player rows
        with no matching team row get NaN in the team columns.

        Returns:
            A new DataFrame with 'Year' and 'Tm' as the first two columns,
            followed by the player columns and then the ``Team_`` columns,
            on a fresh 0..n-1 index.

        Raises:
            pandas.errors.MergeError: If ``team_data`` holds more than one row
                for the same ('Year', 'Tm') pair.
        """
        keys = ['Year', 'Tm']

        renamed = {
            col: f'Team_{col}'
            for col in self.team_data.columns
            if col not in keys
        }
        team_stats = self.team_data.rename(columns=renamed)

        # A player column that already carries a Team_ name is replaced by the
        # team value rather than producing suffixed duplicates in the join.
        clashing = [col for col in renamed.values() if col in self.player_data.columns]
        players = self.player_data.drop(columns=clashing)

        merged = players.merge(team_stats, how='left', on=keys, validate='many_to_one')

        ordered = keys + [col for col in merged.columns if col not in keys]
        return merged[ordered].reset_index(drop=True)

    def process(self, current_year=2022):
        """Merge the inputs and add the derived columns.

        Adds 'next_year_PPR' (the same player's PPR in their next listed
        season) and 'PPR_per_game', converts object columns to numeric or
        string dtype, and drops rows with any missing value except those of
        ``current_year``.

        Args:
            current_year: Season whose rows are kept even when incomplete
                (default 2022).

        Returns:
            The processed DataFrame on a fresh 0..n-1 index.
        """
        merged_data = self.merge()

        # Strip punctuation/markers from names first, so the same player is
        # grouped together even when a season's name carries extra symbols.
        # regex=True is required: pandas >= 2 treats the pattern literally
        # by default.
        merged_data['Player'] = merged_data['Player'].str.replace(r'[^\w\s]+', '', regex=True)

        # Shift within each player in chronological order, independent of the
        # incoming row order; the result is aligned back on the index.
        # NOTE(review): a player who skips a season receives the PPR of their
        # next listed season here — confirm that is acceptable.
        chronological = merged_data.sort_values('Year', kind='stable')
        merged_data['next_year_PPR'] = chronological.groupby('Player')['PPR'].shift(-1)

        # PPR per game; a zero game count would give +/-inf, which is treated
        # as missing instead.
        ppr_per_game = merged_data['PPR'] / merged_data['G']
        merged_data['PPR_per_game'] = ppr_per_game.replace([np.inf, -np.inf], np.nan)

        # Object columns become numeric when every value parses, otherwise
        # pandas' string dtype.
        for col in merged_data.select_dtypes(include=['object']).columns:
            try:
                merged_data[col] = pd.to_numeric(merged_data[col], errors='raise')
            except (ValueError, TypeError):
                merged_data[col] = merged_data[col].astype('string')

        # Keep complete rows, plus every row of current_year (whose
        # next_year_PPR cannot be known yet). Filtering actually removes the
        # rows; assigning a shorter frame back through .loc would only blank
        # them out to NaN.
        is_current = merged_data['Year'] == current_year
        is_complete = merged_data.notna().all(axis=1)
        merged_data = merged_data[is_current | is_complete].copy()

        return merged_data.reset_index(drop=True)

In [11]:
# Seasons to process: 2013 through 2022 inclusive (range's stop is exclusive).
years = [*range(2013, 2023)]

In [12]:
# Build the project-local scraper for the configured seasons and fetch the
# player-level and team-level tables.
# NOTE(review): the saved output of this cell is
# "NameError: name 'SoupStrainer' is not defined". This notebook imports
# SoupStrainer itself, so the error presumably originates inside the
# PFR_Scraper module — confirm that module imports it.
scraper = DataScraper(years)
player_data = scraper.scrape_player_data()
team_data = scraper.scrape_team_data()


NameError: name 'SoupStrainer' is not defined

In [None]:

# Clean the scraped player table.
# Note: player_data is rebound to the cleaned frame, so re-running this cell
# feeds already-cleaned data back through the preprocessor.
player_preprocessor = DataPreprocessor(player_data)
player_data = player_preprocessor.preprocess_data()


In [None]:

# Clean the scraped team table with the same preprocessing steps.
# Note: team_data is rebound to the cleaned frame, so re-running this cell
# feeds already-cleaned data back through the preprocessor.
team_preprocessor = DataPreprocessor(team_data)
team_data = team_preprocessor.preprocess_data()

In [None]:
# Create the MergeAndProcess instance from the cleaned player and team frames.
merger = MergeAndProcess(player_data, team_data)

# Call process() to merge the two frames and add the derived columns.
merged_data = merger.process()


In [None]:

# Display the merged frame as this cell's output.
merged_data