In [None]:
from io import StringIO

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
class DataScraper:
    """Download per-season tables from pro-football-reference.com.

    For every year in ``years`` the scraper knows two pages: the fantasy
    page (``.../years/<year>/fantasy.htm``) and the season page
    (``.../years/<year>/``). The first two ``<table>`` elements of each page
    are parsed with pandas and tagged with a ``Year`` column.
    """

    def __init__(self, years):
        """Remember the seasons to fetch and build the page URLs.

        Args:
            years: Iterable of seasons (anything that formats into the URL,
                e.g. ints such as 2018).
        """
        self.years = years
        self.player_urls = [f'https://www.pro-football-reference.com/years/{year}/fantasy.htm' for year in years]
        self.team_url = 'https://www.pro-football-reference.com/years/{}/'

    def scrape_data(self, urls, timeout=30):
        """Fetch each URL and return all parsed tables as one DataFrame.

        Args:
            urls: Iterable of page URLs whose second-to-last ``/``-separated
                segment is the season year.
            timeout: Seconds to wait for each HTTP request before giving up.

        Returns:
            A DataFrame with the rows of every page concatenated
            (index reset), or an empty DataFrame when nothing was scraped.

        Raises:
            requests.HTTPError: If a page answers with an error status.
                Without this check an error page (no tables) would silently
                drop that season from the result.
        """
        data_frames = []
        for url in urls:
            # Both URL shapes ('.../<year>/fantasy.htm' and '.../<year>/')
            # keep the year in the second-to-last segment.
            year = url.split('/')[-2]

            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Only the first two tables of a page are used.
            tables = soup.find_all('table')[:2]
            # read_html expects a file-like object; passing literal HTML is
            # deprecated in recent pandas versions.
            df_list = [pd.read_html(StringIO(str(table)))[0] for table in tables]
            if not df_list:
                continue

            df = pd.concat(df_list)
            df['Year'] = int(year)

            # Drop rows whose 'Tm' value starts with 'AFC' or 'NFC'
            # (presumably division header rows in the standings tables --
            # confirm against the page). na=False keeps rows with a missing
            # 'Tm' instead of failing on the boolean negation.
            if 'Tm' in df.columns:
                is_conference_row = df['Tm'].str.startswith(('AFC', 'NFC'), na=False)
                df = df[~is_conference_row]

            data_frames.append(df)

        if not data_frames:
            # pd.concat raises on an empty list; an empty frame is friendlier.
            return pd.DataFrame()
        return pd.concat(data_frames, ignore_index=True)

    def scrape_player_data(self):
        """Scrape the fantasy page of every configured year."""
        return self.scrape_data(self.player_urls)

    def scrape_team_data(self):
        """Scrape the season page of every configured year."""
        team_urls = [self.team_url.format(year) for year in self.years]
        return self.scrape_data(team_urls)

In [None]:
class DataPreprocessor:
    """Preprocessing steps for a scraped DataFrame.

    Only header flattening is implemented. The remaining steps are
    placeholders that raise ``NotImplementedError`` until they are written.
    """

    def __init__(self, data):
        """Keep a reference to ``data`` (the frame is not copied)."""
        self.data = data

    def flatten_multiindex_header(self):
        """Collapse a two-level column header into flat names.

        Each column is renamed by the first rule that applies:

        * empty second-level label -> first-level label;
        * second-level label used by more than one column ->
          ``'<first>_<second>'``;
        * otherwise -> second-level label.

        Only the first two header levels are consulted. The columns of
        ``self.data`` are replaced in place. A frame whose columns are not a
        MultiIndex with at least two levels is returned untouched.

        Returns:
            ``self.data`` with the flattened header.
        """
        columns = self.data.columns
        if not isinstance(columns, pd.MultiIndex) or columns.nlevels < 2:
            # Already flat: nothing to do (get_level_values(1) would raise).
            return self.data

        level1 = columns.get_level_values(0)
        level2 = columns.get_level_values(1)

        # True for every second-level label that occurs more than once.
        duplicates = level2.value_counts() > 1

        new_columns = []
        for col_level1, col_level2 in zip(level1, level2):
            if col_level2 == '':
                # Checked first so several empty labels do not become
                # 'Name_' with a dangling underscore.
                new_columns.append(col_level1)
            elif duplicates[col_level2]:
                new_columns.append(f'{col_level1}_{col_level2}')
            else:
                new_columns.append(col_level2)

        # Replace the MultiIndex header with the new flattened header
        self.data.columns = new_columns
        return self.data

    def handle_missing_values(self):
        """Placeholder: fill missing values (e.g. mean, median, mode) or drop rows/columns."""
        raise NotImplementedError('handle_missing_values is not implemented yet')

    def feature_engineering(self):
        """Placeholder: create new features or modify existing ones based on domain knowledge."""
        raise NotImplementedError('feature_engineering is not implemented yet')

    def feature_scaling(self):
        """Placeholder: scale numerical features to a standard range (e.g. Min-Max, StandardScaler)."""
        raise NotImplementedError('feature_scaling is not implemented yet')

    def one_hot_encoding(self):
        """Placeholder: one-hot encode categorical variables (e.g. pandas get_dummies, OneHotEncoder)."""
        raise NotImplementedError('one_hot_encoding is not implemented yet')

    def normalize_data(self):
        """Placeholder: reduce the impact of outliers/skew (e.g. log or Box-Cox transformation)."""
        raise NotImplementedError('normalize_data is not implemented yet')

    def preprocess_data(self):
        """Placeholder: call all preprocessing methods in the correct order."""
        raise NotImplementedError('preprocess_data is not implemented yet')


In [None]:
# Seasons to download: 2018 through 2021 inclusive.
years = list(range(2018, 2022))

# One scraper instance serves both the player and the team pages.
scraper = DataScraper(years)
player_data = scraper.scrape_player_data()
team_data = scraper.scrape_team_data()

In [None]:
# Wrap the scraped player table, then swap its two-level header for flat
# column names; player_data is rebound to the returned frame.
player_preprocessor = DataPreprocessor(data=player_data)
player_data = player_preprocessor.flatten_multiindex_header()

In [None]:
# Quick sanity check: the first rows of the player table, then of the team table.
print(player_data.head(), team_data.head(), sep='\n')

In [None]:
# Bare expression: in a notebook cell this shows the team DataFrame as the cell output.
team_data

In [None]:
# Bare expression: in a notebook cell this shows the player DataFrame as the cell output.
player_data