# Footballing

We will attempt to scrape historical win/loss data from a website for use in future ML projects.

[The historical website](https://www.pro-football-reference.com)

[Work spreadsheet for designs](https://docs.google.com/spreadsheets/d/1MkQ_J3gGDs_vkkERELOZojhwS9Wt-1eX4vFL7VP9TQE/edit#gid=0)

In [None]:
# First and last season (both inclusive) to process; these bound the
# per-year loop that drives the scrape.
START_YEAR = 2010
END_YEAR = 2020

In [30]:
import numpy as np
import pandas as pd

def get_tables_for(year, week):
    """Return every HTML table parsed from the weekly page for a season.

    Args:
        year: Season year, interpolated into the page URL.
        week: Week number, interpolated into the page URL.

    Returns:
        The list of DataFrames produced by ``pd.read_html`` for that page.
    """
    url = f'https://www.pro-football-reference.com/years/{year}/week_{week}.htm'
    return pd.read_html(url)
def get_games_list(table_list, year, week):
    """Extract per-team game records from the tables of one weekly page.

    A table is treated as a game summary when it has exactly three rows and
    at least three columns, the third cell of its second row is the string
    "Final", and the third cell of its third row is missing (NaN). Within
    such a table the second row is taken as the away team and the third row
    as the home team, with the team name in the first column and the score
    in the second column. Every other table is skipped.

    Args:
        table_list: Iterable of DataFrames (e.g. the output of
            ``pd.read_html``).
        year: Season year, copied into every record.
        week: Week number, copied into every record.

    Returns:
        A list of dicts, two per game (the home team's view first, then the
        away team's), with keys ``year``, ``week``, ``team``,
        ``team_score``, ``opponent``, ``opponent_score``, ``win`` (1 or 0)
        and ``home`` (1 or 0). A tied game yields ``win == 0`` for both
        teams.

    Raises:
        ValueError: If a score cell of a matched table cannot be converted
            to an integer.
    """
    games_list = []
    for table_df in table_list:
        # Positional shape check: 3-row tables that are too narrow to be a
        # game summary are skipped instead of raising on the lookups below.
        n_rows, n_cols = table_df.shape
        if n_rows != 3 or n_cols < 3:
            continue

        status = table_df.iat[1, 2]
        if not (isinstance(status, str) and status == "Final"):
            continue

        # pd.isna instead of `is np.nan`: a missing cell is not guaranteed
        # to be the np.nan singleton object.
        # NOTE(review): a game whose last cell holds any text (for example
        # an overtime marker, if the site emits one) is dropped by this
        # check -- confirm that is intended.
        if not pd.isna(table_df.iat[2, 2]):
            continue

        # Rows 1 and 2 hold the away and home team; column 0 is the team
        # name and column 1 the score.
        scores = table_df.iloc[1:3, 1].astype('int64')
        score_away = scores.iloc[0]
        score_home = scores.iloc[1]
        team_away = table_df.iat[1, 0]
        team_home = table_df.iat[2, 0]

        # Strict comparisons on both sides so that a tie is not credited
        # to the away team as a win.
        home_won = score_home > score_away
        away_won = score_away > score_home

        games_list.append({
            'year': year,
            'week': week,
            'team': team_home,
            'team_score': score_home,
            'opponent': team_away,
            'opponent_score': score_away,
            'win': 1 if home_won else 0,
            'home': 1,
        })
        games_list.append({
            'year': year,
            'week': week,
            'team': team_away,
            'team_score': score_away,
            'opponent': team_home,
            'opponent_score': score_home,
            'win': 1 if away_won else 0,
            'home': 0,
        })

    return games_list


def get_games_df_for_year_week(year, week):
    """Build a DataFrame of game records for one week of one season.

    Fetches the tables for the given year and week, extracts the game
    records from them, and wraps those records in a DataFrame.
    """
    tables = get_tables_for(year, week)
    records = get_games_list(tables, year, week)
    return pd.DataFrame(records)


def get_games_df_for_year(year):
    """Concatenate the weekly game DataFrames for one season.

    The number of weeks requested depends on the year: 16 for 1989 and
    earlier, 18 for 1993 and for 2021 onward, and 17 otherwise.

    Args:
        year: Season year.

    Returns:
        A single DataFrame holding the rows of every requested week, in
        week order, with each week's original index preserved.
    """
    if year <= 1989:
        last_week = 16
    elif year == 1993 or year >= 2021:
        last_week = 18
    else:
        last_week = 17

    weekly_frames = [
        get_games_df_for_year_week(year, wk)
        for wk in range(1, last_week + 1)
    ]
    return pd.concat(weekly_frames)


In [47]:
# Process every season from START_YEAR through END_YEAR (inclusive), writing
# one CSV per season and then a combined CSV covering all of them.
all_games_df_list = []
for year in range(START_YEAR, END_YEAR + 1):
    year_df = get_games_df_for_year(year)
    year_df.to_csv(f'games_{year}.csv', index=False)
    all_games_df_list.append(year_df)

# ignore_index gives the combined frame a fresh 0..n-1 index rather than the
# indices carried over from the per-season frames.
all_games_df = pd.concat(all_games_df_list, ignore_index=True)
all_games_df.to_csv('all_games.csv', index=False)
