In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import uuid


def load_week(year, week):
    """
    Loads the html page for the overview of all games in a week
    :param year:
    :param week:
    :return:
    """
    url = 'https://www.pro-football-reference.com/years/{}/week_{}.htm'.format(year, week)
    print(url)
    page = requests.get(url).text
    return page


def get_games(years: list = None, weeks: list = None):
    """
    Wrapper function to get data on all games in a set of years and weeks
    :param years:
    :param weeks:
    :return:
    """
    output = dict(date=[], away_team=[], away_team_score=[], home_team=[], home_team_score=[], season=[], uuid=[])
    if years is None:
        years = range(2010, 2018)
    if weeks is None:
        weeks = range(1, 18)

    for year in years:
        for week in weeks:
            page = load_week(year, week)
            soup = BeautifulSoup(page)
            try:
                games = soup.find('div', 'game_summaries').find_all('div', 'game_summary')
                for game in games:
                    _uuid = uuid.uuid1()
                    date, away_team, away_team_score, home_team, home_team_score = extract_game_info(game)
                    output['date'].append(date)
                    output['away_team'].append(away_team)
                    output['away_team_score'].append(away_team_score)
                    output['home_team'].append(home_team)
                    output['home_team_score'].append(home_team_score)
                    output['season'].append(year)
                    output['uuid'].append(_uuid)
            except AttributeError:
                pass

    df = pd.DataFrame.from_dict(output, orient='columns')
    df = process_games(df)
    return df


def update_games(year: int, week: int, df: pd.DataFrame):
    """
    Used to add most recent slate of games to historical data
    :param year:
    :param week:
    :param df:
    :return:
    """
    new_games = get_games([year], [week])
    return df.append(new_games).reset_index(drop=True)


def get_next_week_games(year, week):
    """
    Get the match-ups for next week to generate predictions
    :param year:
    :param week:
    :return:
    """
    output = dict(date=[], away_team=[], home_team=[], season=[])

    page = load_week(year, week)
    soup = BeautifulSoup(page)
    # f = open('soup.html', 'w')
    # f.write(soup.prettify())  # python will convert \n to os.linesep
    # f.close()
    games = soup.find('div', 'game_summaries').find_all('div', 'game_summary')
    for game in games:
        date, away_team, away_team_score, home_team, home_team_score = extract_game_info(game, scores=False)
        output['away_team'].append(away_team)
        output['home_team'].append(home_team)
        output['date'].append(date)
        output['season'].append(year)


    df = pd.DataFrame.from_dict(output, orient='columns')
    df = process_games(df, scores=False)
    return df


def process_games(df, scores=True):
    """
    process game info after all data has been collected
    :param df:
    :param scores:
    :return:
    """
    if scores:
        df['date'] = pd.to_datetime(df['date'])
        df['away_team_score'] = pd.to_numeric(df['away_team_score'])
        df['home_team_score'] = pd.to_numeric(df['home_team_score'])
        df['one_possession'] = np.where(np.abs(df['home_team_score'] - df['away_team_score']) <= 7, 1, 0)
        df['home_team_win'] = np.where(df['home_team_score'] > df['away_team_score'], 1, 0)
        df.loc[:, 'away_outcome'] = 1 - df['home_team_win']
        df.loc[:, 'home_outcome'] = df['home_team_win']
        df.loc[df.one_possession == 1, ['away_outcome', 'home_outcome']] = .5
        df['point_diff'] = df['home_team_score'] - df['away_team_score']
        df = df.sort_values('date').reset_index()
    df['home_elo'] = 1500
    df['away_elo'] = 1500
    df['home_expected'] = 0
    df['away_expected'] = 0
    return df


def extract_game_info(game, scores=True):
    """
    Get the relevant information for a single game from the html elements
    :param game:
    :param scores:
    :return:
    """
    teams = game.find('table', 'teams')
    rows = teams.find('tbody').find_all('tr')
    date = str(rows[0].find('td').get_text())

    away_team = rows[1].find_all('td')
    away_team_name = str(away_team[0].find('a').get_text())
    if scores:
        away_team_score = str(away_team[1].get_text())
    else:
        away_team_score = None

    home_team = rows[2].find_all('td')
    home_team_name = str(home_team[0].find('a').get_text())
    if scores:
        home_team_score = str(home_team[1].get_text())
    else:
        home_team_score = None
    return date, away_team_name, away_team_score, home_team_name, home_team_score


In [14]:
games = get_games()

https://www.pro-football-reference.com/years/2010/week_1.htm




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


https://www.pro-football-reference.com/years/2010/week_2.htm
https://www.pro-football-reference.com/years/2010/week_3.htm
https://www.pro-football-reference.com/years/2010/week_4.htm
https://www.pro-football-reference.com/years/2010/week_5.htm
https://www.pro-football-reference.com/years/2010/week_6.htm
https://www.pro-football-reference.com/years/2010/week_7.htm
https://www.pro-football-reference.com/years/2010/week_8.htm
https://www.pro-football-reference.com/years/2010/week_9.htm
https://www.pro-football-reference.com/years/2010/week_10.htm
https://www.pro-football-reference.com/years/2010/week_11.htm
https://www.pro-football-reference.com/years/2010/week_12.htm
https://www.pro-football-reference.com/years/2010/week_13.htm
https://www.pro-football-reference.com/years/2010/week_14.htm
https://www.pro-football-reference.com/years/2010/week_15.htm
https://www.pro-football-reference.com/years/2010/week_16.htm
https://www.pro-football-reference.com/years/2010/week_17.htm
https://www.pro-

In [15]:
games.head()

Unnamed: 0,index,away_team,away_team_score,date,home_team,home_team_score,season,uuid,one_possession,home_team_win,away_outcome,home_outcome,point_diff,home_elo,away_elo,home_expected,away_expected
0,0,Minnesota Vikings,9,2010-09-09,New Orleans Saints,14,2010,fe791a9a-fd46-11e7-a0e0-c4b301d13749,1,1,0.5,0.5,5,1500,1500,0,0
1,13,Dallas Cowboys,7,2010-09-12,Washington Redskins,13,2010,fe7af25a-fd46-11e7-a182-c4b301d13749,1,1,0.5,0.5,6,1500,1500,0,0
2,12,Green Bay Packers,27,2010-09-12,Philadelphia Eagles,20,2010,fe7aea98-fd46-11e7-8a25-c4b301d13749,1,0,0.5,0.5,-7,1500,1500,0,0
3,10,Arizona Cardinals,17,2010-09-12,St. Louis Rams,13,2010,fe7adab0-fd46-11e7-ad4f-c4b301d13749,1,0,0.5,0.5,-4,1500,1500,0,0
4,9,Carolina Panthers,18,2010-09-12,New York Giants,31,2010,fe7ad29c-fd46-11e7-94f3-c4b301d13749,0,1,0.0,1.0,13,1500,1500,0,0


In [16]:
pd.to_pickle(games, 'games.pkl')

In [5]:
games['date'] = games['date'].apply(lambda x: x.)

AttributeError: 'Timestamp' object has no attribute 'split'

In [6]:
date = games.date.values[0]

In [7]:
date

numpy.datetime64('2010-09-09T00:00:00.000000000')

In [8]:
import datetime

In [12]:
datetime.datetime.strftime(pd.to_datetime(str(date)), '%m-%d-%y')

'09-09-10'