In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import uuid
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
header_class = 'component_head'
table_class  = 'rt_railbox_border2'
div_class = 'SLTables1'

In [3]:
columns = ['date', 'dog', 'dog_spread_line', 'fav', 'fav_spread_line', 'ml_dog_line', 'ml_fav_line',
           'over', 'spread', 'time', 'total', 'under', 'bookie', 'uuid']

In [4]:
def extract_games_from_df(df):
    games = pd.DataFrame(columns=columns)
    for i in range(len(df)):
        row = df.iloc[i, :]
        game = extract_tables_from_game(row.home_team, row.away_team, row.date, row.uuid)
        games = games.append(game, ignore_index=True)
    return games

In [5]:
def extract_tables_from_game(home_team, away_team, date, game_uuid):
    url = 'http://www.vegasinsider.com/nfl/odds/offshore/line-movement/{}-@-{}.cfm/date/{}'.format(away_team, home_team, date)
    print(url)
    page = requests.get(url).text
    soup = BeautifulSoup(page)
    game_info = soup.find('div', class_=div_class).find_all('table')[1].find_all('tr')
    game_date = game_info[0].find('td').get_text().replace(u'\xa0', u'').split(':')[1]
    game_time = game_info[1].find('td').get_text().replace(u'\xa0', u'').split('e:')[1]
    game_datetime = datetime.datetime.strptime(game_date + ' ' + game_time, '%A, %B %d, %Y %I:%M %p ')
    tables = soup.find_all('table', class_=table_class)
    headers = soup.find_all('tr', class_=header_class)
    game = pd.DataFrame(columns=columns)
    for t, h in zip(tables, headers):
        table = extract_data_from_table(t, h, game_datetime.year)
        game = game.append(table, ignore_index=True)
    game['game_uuid'] = game_uuid
    game['game_datetime'] = game_datetime
    return game

In [6]:
def extract_data_from_table(t, h, year):
    bookie = h.find('td').get_text().strip()
    table = pd.DataFrame(columns=columns)
    rows = t.find_all('tr')
    for r in rows[2:]:
        data = r.find_all('td')
        ret  = extract_data_from_cell(data, year)
        table = table.append(ret, ignore_index=True)
    table['bookie'] = bookie
    return table

In [7]:
def extract_data_from_cell(d, year):
    """
    Extract a single line data point from a table of lines from a single bookie for a single game
    """
    date = d[0].get_text().strip()
    time = d[1].get_text().strip()
    dt_str = date + ' ' + str(year) + ' ' + time
    dt = datetime.datetime.strptime(dt_str, '%m/%d %Y %I:%M%p')
    
    # Getting the moneyline info is easy
    try:
        fav, ml_fav_line = d[2].get_text().strip().split('-')
        ml_fav_line = int(ml_fav_line) * -1
        dog, ml_dog_line = d[3].get_text().strip().split('+')
    except ValueError:  # The bookie may not have ML posted yet
        fav, ml_fav_line = (np.nan, np.nan)
        dog, ml_dog_line = (np.nan, np.nan)    
    
    try:
        # The spread line for the favorite in the game. This allows us to always split on '-' to get the number
        spread, fav_spread_line = d[4].get_text().strip().split(' ')
        _, spread = spread.split('-')
        # Since we already know the spread and the fav/dog teams,
        # all we need is the odds line for betting the dog on the spread
        _, dog_spread_line = d[5].get_text().strip().split(' ')
    except ValueError:
        spread, fav_spread_line, dog_spread_line = (np.nan, np.nan, np.nan)
    
    # Get point total data here
    try:
        total, over = d[6].get_text().strip().split(' ')
        _,    under = d[7].get_text().strip().split(' ')
    except ValueError:
        total, over, under = (np.nan, np.nan, np.nan)
        
    if ml_dog_line == 'XX':
        ml_dog_line = np.nan
    if ml_fav_line == 'XX':
        ml_fav_line = np.nan
    if spread == 'XX':
        spread = np.nan
    if fav_spread_line == 'XX':
        fav_spread_line = np.nan
    if dog_spread_line == 'XX':
        dog_spread_line = np.nan
    if total == 'XX':
        total = np.nan
    if over == 'XX':
        over = np.nan
    if under == 'XX':
        under = np.nan
    
    return pd.Series(dict(date=date, time=time, fav=fav, dog=dog, ml_fav_line=ml_fav_line,
                          ml_dog_line=float(ml_dog_line), spread=float(spread), fav_spread_line=float(fav_spread_line),
                          dog_spread_line=float(dog_spread_line), total=float(total), over=float(over),
                          under=float(under), line_datetime=dt))

In [8]:
def line_to_prob(line):
    if line < 0:
        return -line / (-line + 100.)
        # Do some stuff
    elif line > 0:
        return 100. / (line + 100.)
        # Do some other stuff

In [9]:
def calc_concensus(lines):
    lines = lines.sort_values('line_datetime')
    lines['concensus_ml_fav'] = np.nan
    lines['concensus_ml_dog'] = np.nan
    lines['ml_dog_std'] = np.nan
    lines['ml_fav_std'] = np.nan
#     lines['concensus_spread_fav'] = np.nan
#     lines['concensus_spread_dog'] = np.nan
#     lines['concensus_spread'] = np.nan
#     lines['concensus_over'] = np.nan
#     lines['concensus_under'] = np.nan
#     lines['concensus_total'] = np.nan
    for key, grp in lines.groupby('game_uuid'):
        grp = grp[grp.line_datetime < grp.game_datetime]
        for idx, label in enumerate(grp.index):
            _grp = grp.iloc[:idx+1, :].drop_duplicates('bookie', keep='last')
            lines.loc[label, 'concensus_ml_fav'] = _grp['ml_fav_prob'].mean()
            lines.loc[label, 'concensus_ml_dog'] = _grp['ml_dog_prob'].mean()
            lines.loc[label, 'ml_dog_std'] = _grp['ml_dog_prob'].std()
            lines.loc[label, 'ml_fav_std'] = _grp['ml_fav_prob'].std()
    lines = lines.dropna()
    lines['ml_fav_z'] = (lines['ml_fav_prob'] - lines['concensus_ml_fav']) / lines['ml_fav_std']
    lines['ml_dog_z'] = (lines['ml_dog_prob'] - lines['concensus_ml_dog']) / lines['ml_dog_std']
    return lines

In [10]:
def abbrev_to_name(abbrev):
    mapping = {
        'PHI': 'eagles',
        'JAC': 'jaguars',
        'NWE': 'patriots',
        'WAS': 'redskins',
        'ATL': 'falcons',
        'DAL': 'cowboys',
        'TEN': 'titans',
        'NYJ': 'jets',
        'NYG': 'giants',
        'CAR': 'panthers',
        'HOU': 'texans',
        'DEN': 'broncos',
        'IND': 'colts',
        'DET': 'lions',
        'KAN': 'chiefs',
        'LAC': 'chargers',
        'SFO': '49ers',
        'DAL': 'cowboys',
        'NOR': 'saints',
        'SEA': 'seahawks',
        'STL': 'rams',
        'BAL': 'ravens',
        'MIN': 'vikings',
        'BUF': 'bills',
        'ARI': 'cardinals',
        'CHI': 'bears',
        'PIT': 'steelers',
        'GNB': 'packers',
        'TAM': 'buccaneers',
        'CIN': 'bengals',
        'OAK': 'raiders',
        'MIA': 'dolphins',
        'CLE': 'browns'
    }
    return mapping[abbrev]

In [11]:
def calc_payout(amount, line):
    if line > 0:
        return amount * (line / 100.)
    elif line < 0:
        return amount / (np.abs(line) / 100.)

In [289]:
games.home_team.unique()

array(['saints', 'redskins', 'eagles', 'rams', 'giants', 'jaguars',
       'patriots', 'seahawks', 'steelers', 'bills', 'buccaneers', 'bears',
       'texans', 'titans', 'jets', 'chiefs', 'vikings', 'colts',
       'chargers', 'raiders', 'cowboys', 'broncos', 'panthers', 'packers',
       'bengals', 'browns', 'lions', 'falcons', '49ers', 'dolphins',
       'cardinals', 'ravens'], dtype=object)

In [364]:
lines.fav.unique()

array([nan, 'HOU', 'NWE', 'DEN', 'IND', 'DET', 'KAN', 'LAC', 'SFO', 'DAL',
       'NOR', 'PHI', 'SEA', 'STL', 'WAS', 'CAR', 'BAL', 'JAC', 'MIN',
       'BUF', 'ATL', 'ARI', 'CHI', 'PIT', 'TEN'], dtype=object)

In [365]:
lines.dog.unique()

array([nan, 'SFO', 'MIA', 'IND', 'DEN', 'CHI', 'LAC', 'KAN', 'PIT', 'TEN',
       'OAK', 'NYJ', 'NYG', 'STL', 'SEA', 'ARI', 'GNB', 'CLE', 'HOU',
       'CIN', 'TAM', 'JAC', 'ATL', 'BUF', 'CAR', 'PHI'], dtype=object)

In [68]:
def simulate_games(concensus_lines, games, tol=.05, bet_size=100.):
    outcomes = []
    implied_probabilities = []
    for key, grp in concensus_lines.groupby('game_uuid'):
        line_taken = grp.loc[(grp.ml_fav_prob >= (1. / (grp.concensus_ml_fav - tol))) |
                             (grp.ml_dog_prob >= (1. / (grp.concensus_ml_dog - tol)))]
        if len(line_taken) == 0:
            continue
        else:
            line_taken = line_taken.iloc[0, :]
        
        if line_taken.ml_dog_z >= tol:
            team = abbrev_to_name(line_taken.dog)
            line = line_taken.ml_dog_line
            prob = line_taken.ml_dog_prob
        elif line_taken.ml_fav_z >= tol:
            team = abbrev_to_name(line_taken.fav)
            line = line_taken.ml_fav_line
            prob = line_taken.ml_fav_prob
        game = games.loc[games.uuid==key, :]
        won = ((game.home_team.values[0] == team and game.home_team_win.values[0] == 1) or
               (game.away_team.values[0] == team and game.home_team_win.values[0] == 0))
        
        implied_probabilities.append(prob)
        if won:
            outcomes.append(calc_payout(bet_size, line))
        else:
            outcomes.append(-bet_size)
    return outcomes, implied_probabilities

In [13]:
def process_games(games):
    games['away_team'] = games.away_team.apply(lambda x: x.split(' ')[-1].lower())
    games['home_team'] = games.home_team.apply(lambda x: x.split(' ')[-1].lower())
    games['date'] = games.date.apply(lambda x: datetime.datetime.strftime(pd.to_datetime(str(x)), '%m-%d-%y'))
    return games

In [17]:
def process_lines(lines):
    lines = lines.dropna().reset_index()
    lines['fav_spread_prob'] = lines['fav_spread_line'].apply(line_to_prob)
    lines['dog_spread_prob'] = lines['dog_spread_line'].apply(line_to_prob)
    lines['ml_dog_prob'] = lines['ml_dog_line'].apply(line_to_prob)
    lines['ml_fav_prob'] = lines['ml_fav_line'].apply(line_to_prob)
    lines['over_prob'] = lines['over'].apply(line_to_prob)
    lines['under_prob'] = lines['under'].apply(line_to_prob)
    return lines

In [14]:
games = pd.read_pickle('games.pkl')

In [15]:
games = process_games(games)

In [None]:
try:
    lines = pd.read_pickle('lines_last_250_games.pkl')
except:
    lines = extract_games_from_df(games.tail(250))
    pd.to_pickle(lines, 'lines_last_250_games.pkl')

http://www.vegasinsider.com/nfl/odds/offshore/line-movement/eagles-@-redskins.cfm/date/09-10-17




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


http://www.vegasinsider.com/nfl/odds/offshore/line-movement/ravens-@-bengals.cfm/date/09-10-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/steelers-@-browns.cfm/date/09-10-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/cardinals-@-lions.cfm/date/09-10-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/jaguars-@-texans.cfm/date/09-10-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/raiders-@-titans.cfm/date/09-10-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/falcons-@-bears.cfm/date/09-10-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/saints-@-vikings.cfm/date/09-11-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/chargers-@-broncos.cfm/date/09-11-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/texans-@-bengals.cfm/date/09-14-17
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/jets-@-raiders.cfm/date/09-17-17
http://www.vegasinsider.com/nfl/odds/offshore/lin

In [None]:
lines = process_lines(lines)

In [None]:
sns.distplot(lines['ml_fav_line'])

In [None]:
c_lines = calc_concensus(lines).dropna()

In [None]:
c_lines.loc[:, ['bookie', 'fav', 'dog', 'line_datetime', 'ml_dog_line',
                'ml_fav_line', 'concensus_ml_fav', 'concensus_ml_dog', 'ml_dog_z', 'ml_fav_z',
               'ml_dog_std', 'ml_fav_std']]

In [None]:
sns.distplot(c_lines['ml_dog_z'])

In [None]:
outcomes, probs = simulate_games(c_lines, games, tol=2)

In [None]:
len(outcomes)

In [None]:
plt.scatter(probs, outcomes)

In [None]:
fig, ax = plt.subplots(figsize=(17, 8))
ax.plot(range(len(outcomes)), np.cumsum(outcomes))
plt.savefig('results.png')

In [None]:
np.mean(outcomes)

In [None]:
np.mean(np.array(outcomes) > 0)

In [37]:
spread, line = data[5].get_text().strip().split(' ')

In [65]:
ret =extract_data_from_cell(data)

In [66]:
table = pd.DataFrame(columns=columns)

In [78]:
table = extract_data_from_table(table, headers[0])

In [93]:
game = extract_tables_from_game('patriots', 'titans', '01-13-18')

In [94]:
game

Unnamed: 0,bookie,date,dog,dog_spread_line,fav,fav_spread_line,ml_dog_line,ml_fav_line,over,spread,time,total,under,game_uuid
0,5DIMES LINE MOVEMENTS,01/07,TEN,-108.0,NWE,-102.0,700.0,-900,-108.0,13.5,6:54pm,47.0,-102.0,2b91589c-fd2b-11e7-9514-c4b301d13749
1,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,700.0,-900,-106.0,13.5,3:30am,47.0,-104.0,2b91589c-fd2b-11e7-9514-c4b301d13749
2,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,700.0,-900,-105.0,13.5,7:52am,47.0,-105.0,2b91589c-fd2b-11e7-9514-c4b301d13749
3,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,700.0,-900,-103.0,13.5,8:41am,47.0,-107.0,2b91589c-fd2b-11e7-9514-c4b301d13749
4,5DIMES LINE MOVEMENTS,01/08,TEN,-101.0,NWE,-109.0,700.0,-900,-103.0,13.0,12:18pm,47.0,-107.0,2b91589c-fd2b-11e7-9514-c4b301d13749
5,5DIMES LINE MOVEMENTS,01/08,TEN,-101.0,NWE,-109.0,660.0,-840,-103.0,13.0,12:56pm,47.0,-107.0,2b91589c-fd2b-11e7-9514-c4b301d13749
6,5DIMES LINE MOVEMENTS,01/09,TEN,-102.0,NWE,-108.0,660.0,-840,-102.0,13.0,3:30am,47.0,-108.0,2b91589c-fd2b-11e7-9514-c4b301d13749
7,5DIMES LINE MOVEMENTS,01/10,TEN,-105.0,NWE,-105.0,660.0,-840,-102.0,13.0,3:30am,47.0,-108.0,2b91589c-fd2b-11e7-9514-c4b301d13749
8,5DIMES LINE MOVEMENTS,01/10,TEN,-105.0,NWE,-105.0,660.0,-840,-102.0,13.0,8:42am,47.0,-108.0,2b91589c-fd2b-11e7-9514-c4b301d13749
9,5DIMES LINE MOVEMENTS,01/10,TEN,-105.0,NWE,-105.0,660.0,-840,-102.0,13.0,1:09pm,47.0,-108.0,2b91589c-fd2b-11e7-9514-c4b301d13749


In [190]:
test_games = pd.DataFrame(dict(home_team=['patriots', 'eagles'],
                               away_team=['titans', 'falcons'], date=['01-13-18', '01-13-18'],
                               uuid = [1, 2]))

In [191]:
test_games

Unnamed: 0,away_team,date,home_team,uuid
0,titans,01-13-18,patriots,1
1,falcons,01-13-18,eagles,2


In [196]:
games = extract_games_from_df(test_games)

http://www.vegasinsider.com/nfl/odds/offshore/line-movement/titans-@-patriots.cfm/date/01-13-18
http://www.vegasinsider.com/nfl/odds/offshore/line-movement/falcons-@-eagles.cfm/date/01-13-18


In [199]:
games.head()

Unnamed: 0,bookie,date,dog,dog_spread_line,fav,fav_spread_line,game_datetime,line_datetime,ml_dog_line,ml_fav_line,over,spread,time,total,under,uuid
0,5DIMES LINE MOVEMENTS,01/07,TEN,-108.0,NWE,-102.0,2018-01-13 20:15:00,2018-01-07 18:54:00,700.0,-900,-108.0,13.5,6:54pm,47.0,-102.0,1.0
1,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,2018-01-13 20:15:00,2018-01-08 03:30:00,700.0,-900,-106.0,13.5,3:30am,47.0,-104.0,1.0
2,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,2018-01-13 20:15:00,2018-01-08 07:52:00,700.0,-900,-105.0,13.5,7:52am,47.0,-105.0,1.0
3,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,2018-01-13 20:15:00,2018-01-08 08:41:00,700.0,-900,-103.0,13.5,8:41am,47.0,-107.0,1.0
4,5DIMES LINE MOVEMENTS,01/08,TEN,-101.0,NWE,-109.0,2018-01-13 20:15:00,2018-01-08 12:18:00,700.0,-900,-103.0,13.0,12:18pm,47.0,-107.0,1.0


In [112]:
games = pd.read_pickle('games.pkl')

In [116]:
test_games = games[games.season >= 2016]

In [118]:
lines = extract_games_from_df(test_games)

In [130]:
lines.head()

Unnamed: 0,bookie,date,dog,dog_spread_line,fav,fav_spread_line,game_uuid,ml_dog_line,ml_fav_line,over,spread,time,total,under
0,5DIMES LINE MOVEMENTS,01/07,TEN,-108.0,NWE,-102.0,7dbd792c-fd2e-11e7-b238-c4b301d13749,700.0,-900,-108.0,13.5,6:54pm,47.0,-102.0
1,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,7dbd792c-fd2e-11e7-b238-c4b301d13749,700.0,-900,-106.0,13.5,3:30am,47.0,-104.0
2,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,7dbd792c-fd2e-11e7-b238-c4b301d13749,700.0,-900,-105.0,13.5,7:52am,47.0,-105.0
3,5DIMES LINE MOVEMENTS,01/08,TEN,-108.0,NWE,-102.0,7dbd792c-fd2e-11e7-b238-c4b301d13749,700.0,-900,-103.0,13.5,8:41am,47.0,-107.0
4,5DIMES LINE MOVEMENTS,01/08,TEN,-101.0,NWE,-109.0,7dbd792c-fd2e-11e7-b238-c4b301d13749,700.0,-900,-103.0,13.0,12:18pm,47.0,-107.0


In [121]:
len(lines)

595456

In [122]:
date_1 = lines.date.values[0]
time_1 = lines.time.values[0]

In [132]:
datetime.datetime.strptime(date_1 + ' ' + time_1, '%m/%d %I:%M%p')

datetime.datetime(1900, 1, 7, 18, 54)