# Footballing

We will attempt to scrape historical win/loss data from a website for future ML projects.

[The historical website](https://www.pro-football-reference.com)

[Work spreadsheet for designs](https://docs.google.com/spreadsheets/d/1MkQ_J3gGDs_vkkERELOZojhwS9Wt-1eX4vFL7VP9TQE/edit#gid=0)

In [2]:
import numpy as np
import pandas as pd


In [82]:
def get_tables_for(year, week):
    """Read every HTML table found on the weekly page for a season.

    Args:
        year: Season year, interpolated into the page URL.
        week: Week number, interpolated into the page URL.

    Returns:
        Whatever ``pd.read_html`` yields for that URL (one entry per
        table it parses from the page).
    """
    page_url = f'https://www.pro-football-reference.com/years/{year}/week_{week}.htm'
    parsed_tables = pd.read_html(page_url)
    return parsed_tables


In [89]:
def _is_final_game_table(table_df):
    """Return True when ``table_df`` looks like a finished-game summary table."""
    n_rows, n_cols = table_df.shape
    if n_rows != 3 or n_cols < 3:
        return False
    # Positional access so tables with non-integer column labels are
    # skipped instead of raising KeyError.
    status_cell = table_df.iloc[1, 2]
    trailing_cell = table_df.iloc[2, 2]
    # pd.isna instead of ``is np.nan``: a NaN is not guaranteed to be the
    # np.nan singleton, so an identity check can miss it.
    # NOTE(review): a game whose last cell is not empty is skipped here,
    # exactly as before -- confirm the site never fills that cell for
    # games we want to keep.
    return bool(status_cell == "Final") and bool(pd.isna(trailing_cell))


def get_games_list(table_list, week=1):
    """Extract per-team game records from the tables of one weekly page.

    A game table is expected to have 3 rows: the first row holds the date
    of the game, the second row the away team and the third row the home
    team. The third column must hold "Final" in the away row and be empty
    (NaN) in the home row. Every other table is ignored.

    Args:
        table_list: Iterable of DataFrames, e.g. the result of
            ``get_tables_for``.
        week: Week number stored in each record. Defaults to 1, the
            value that was previously hard-coded.

    Returns:
        A list of dicts, two per game (home team first, then away team),
        with keys ``week``, ``team``, ``team_score``, ``opponent``,
        ``opponent_score``, ``win`` and ``home``. ``win`` and ``home``
        are 0/1 flags. In a tied game both teams get ``win == 0``.

    Raises:
        ValueError: If a score cell of a game table cannot be converted
            to an integer.
    """
    games_list = []
    for table_df in table_list:
        if not _is_final_game_table(table_df):
            continue

        # Row 1 is the away team, row 2 the home team; columns 0 and 1
        # hold the team name and the score. Reading the cells directly
        # avoids mutating a slice of the caller's DataFrame.
        away_team = table_df.iloc[1, 0]
        home_team = table_df.iloc[2, 0]
        away_score = int(table_df.iloc[1, 1])
        home_score = int(table_df.iloc[2, 1])

        # Compute each side's result independently so that a tie is not
        # recorded as an away win.
        home_win = int(home_score > away_score)
        away_win = int(away_score > home_score)

        games_list.append({
            'week': week,
            'team': home_team,
            'team_score': home_score,
            'opponent': away_team,
            'opponent_score': away_score,
            'win': home_win,
            'home': 1,
        })
        games_list.append({
            'week': week,
            'team': away_team,
            'team_score': away_score,
            'opponent': home_team,
            'opponent_score': home_score,
            'win': away_win,
            'home': 0,
        })

    return games_list


In [90]:
# Read in the tables from this period: year 2020, week 1.
page_table_list = get_tables_for(2020, 1)
# Show how many tables were parsed from the page.
len(page_table_list)


33

In [79]:
# Display the table at index 10 to inspect its layout.
page_table_list[10]

Unnamed: 0,0,1,2
0,"Sep 13, 2020","Sep 13, 2020","Sep 13, 2020"
1,Miami Dolphins,11,Final
2,New England Patriots,21,


In [80]:
# Print the column dtypes and non-null counts of the same table.
page_table_list[10].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3 non-null      object
 1   1       3 non-null      object
 2   2       2 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes


In [91]:

# Now that the tables are loaded, build a DataFrame from the extracted
# game records (one row per team per game).
games_df = pd.DataFrame(get_games_list(table_list=page_table_list))
# Preview the first rows.
games_df.head()

Unnamed: 0,week,team,team_score,opponent,opponent_score,win,home
0,1,Kansas City Chiefs,34,Houston Texans,20,1,1
1,1,Houston Texans,20,Kansas City Chiefs,34,0,0
2,1,Buffalo Bills,27,New York Jets,17,1,1
3,1,New York Jets,17,Buffalo Bills,27,0,0
4,1,Atlanta Falcons,25,Seattle Seahawks,38,0,1
