In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import numpy as np
import pickle
from collections import OrderedDict as od

In [2]:
# Base URLs of the classic SportsbookReview NCAA basketball odds pages.
# page_db() appends a date string to one of these to request a single day's page.
SPREAD_URL = 'https://classic.sportsbookreview.com/betting-odds/ncaa-basketball/?date='     #date in the form of 20141114
ML_URL = 'https://classic.sportsbookreview.com/betting-odds/ncaa-basketball/money-line/?date='

In [3]:
# Sportsbook identifiers (kept as strings). line_grabber() and ml_line_grabber()
# look for these in the scraped 'eventLine-book-value' elements to decide which
# book a line belongs to.
pinnacle_id = '238'
fiveDimes_id = '19'
bookMaker_id = '93'
betOnline_id = '1096'

In [4]:
def season_dates(start_date, end_date):
    """
    Build the list of date strings SBR expects for a span of days.

    Every calendar day from start_date through end_date (as generated by
    pd.date_range) is rendered as 'YYYYMMDD'.
    Input should be in the form 'MM/DD/YYYY'.
    """
    return [day.strftime('%Y%m%d')
            for day in pd.date_range(start=start_date, end=end_date)]

In [5]:
def season_date_format(date):
    """
    Render a date as 'MM/DD/YYYY', the input form used by season_dates().

    `date` is anything np.datetime64 accepts, e.g. 'today' or 'YYYY-MM-DD'.
    """
    return pd.to_datetime(np.datetime64(date)).strftime('%m/%d/%Y')
   
def sbr_date_format(date):
    """
    Render a date as 'YYYYMMDD', the form appended to the SBR page URLs.

    `date` is anything np.datetime64 accepts, e.g. 'today' or 'YYYY-MM-DD'.
    Returns the formatted string; nothing is printed (the former debug print
    of the intermediate Timestamp was removed).
    """
    desired_date = pd.to_datetime(np.datetime64(date))
    return desired_date.strftime('%Y%m%d')

In [6]:
# One list of 'YYYYMMDD' date strings per season, named after the year the
# season starts in (cbb14 covers 11/14/2014 through 04/06/2015, and so on).
cbb14 = season_dates('11/14/2014', '04/06/2015')
cbb15 = season_dates('11/13/2015', '04/04/2016')
cbb16 = season_dates('11/11/2016', '04/03/2017')
cbb17 = season_dates('11/10/2017', '04/02/2018')
cbb18 = season_dates('11/16/2018', '04/08/2019')
cbb19 = season_dates('11/05/2019', '04/06/2020')

# All seasons together, oldest first.
date_list = [cbb14, cbb15, cbb16, cbb17, cbb18, cbb19]

In [7]:
def page_db(date_list, URL=SPREAD_URL):
    """
    Download the SBR page for every date in date_list that is not in the future.

    Parameters
    ----------
    date_list : list of str
        Dates in 'YYYYMMDD' form, in chronological order.
    URL : str
        Base url the date is appended to. Defaults to SPREAD_URL (spreads);
        pass ML_URL to gather moneyline pages instead.

    Returns
    -------
    dict
        'last_update' : 'Complete' when today is past the last date in
                        date_list, otherwise today's date as a Timestamp
        'start_date'  : first entry of date_list
        'end_date'    : last entry of date_list
        'pages'       : {date string: requests Response} for each fetched day
    """
    todays_date = pd.to_datetime(np.datetime64('today'))
    today_key = todays_date.strftime('%Y%m%d')

    if todays_date > pd.to_datetime(date_list[-1]):
        # The whole range is in the past: every page can be fetched.
        last_update = 'Complete'
        date_range = list(date_list)
    else:
        # Only fetch days up to and including today. 'YYYYMMDD' strings sort
        # chronologically, so a plain string comparison is enough and, unlike
        # list.index, does not raise when today is missing from the list.
        last_update = todays_date
        date_range = [day for day in date_list if day <= today_key]

    pages_db = {}
    for page in date_range:
        pages_db[str(page)] = requests.get(URL + str(page))

    return {'last_update': last_update,
            'start_date': date_list[0],
            'end_date': date_list[-1],
            'pages': pages_db}

In [None]:
#money14 = page_db(cbb14, URL=ML_URL)
#money15 = page_db(cbb15, URL=ML_URL)
#money16 = page_db(cbb16, URL=ML_URL)
#money17 = page_db(cbb17, URL=ML_URL)
#money18 = page_db(cbb18, URL=ML_URL)
#money19 = page_db(cbb19, URL=ML_URL)

In [226]:
# NOTE: the downloads below are commented out, so `page_db19` is not assigned
# anywhere in this cell; displaying it relies on a value already present in
# the kernel from an earlier run.
#page_db14 = page_db(cbb14)
#page_db15 = page_db(cbb15)
#page_db16 = page_db(cbb16)
#page_db17 = page_db(cbb17)
#page_db18 = page_db(cbb18)
#page_db19 = page_db(cbb19)
page_db19

{'last_update': Timestamp('2020-02-05 00:00:00'),
 'start_date': '20191105',
 'end_date': '20200406',
 'pages': {'20191105': <Response [200]>,
  '20191106': <Response [200]>,
  '20191107': <Response [200]>,
  '20191108': <Response [200]>,
  '20191109': <Response [200]>,
  '20191110': <Response [200]>,
  '20191111': <Response [200]>,
  '20191112': <Response [200]>,
  '20191113': <Response [200]>,
  '20191114': <Response [200]>,
  '20191115': <Response [200]>,
  '20191116': <Response [200]>,
  '20191117': <Response [200]>,
  '20191118': <Response [200]>,
  '20191119': <Response [200]>,
  '20191120': <Response [200]>,
  '20191121': <Response [200]>,
  '20191122': <Response [200]>,
  '20191123': <Response [200]>,
  '20191124': <Response [200]>,
  '20191125': <Response [200]>,
  '20191126': <Response [200]>,
  '20191127': <Response [200]>,
  '20191128': <Response [200]>,
  '20191129': <Response [200]>,
  '20191130': <Response [200]>,
  '20191201': <Response [200]>,
  '20191202': <Response [

In [8]:
def pickle_db(db, bet_type='spread'):
    """
    Pickle a dictionary of SBR pages to the current working directory.

    The file is named 'NCAAB_' (or 'NCAAB_ML' when bet_type is 'moneyline')
    followed by the first four characters of db['start_date'] and '.pickle'
    (exp: 'NCAAB_2019.pickle').

    Parameters
    ----------
    db : dict
        Must contain 'start_date' as a 'YYYYMMDD' string.
    bet_type : str
        'moneyline' selects the moneyline file name; any other value selects
        the spread file name.

    Returns
    -------
    str
        The name of the file that was written.
    """
    year = db['start_date'][0:4]

    if bet_type == 'moneyline':
        pickle_filename = "NCAAB_ML" + year + '.pickle'
    else:
        pickle_filename = "NCAAB_" + year + '.pickle'

    # `with` closes the handle even if pickle.dump raises.
    with open(pickle_filename, 'wb') as pickle_out:
        pickle.dump(db, pickle_out)
    return pickle_filename

In [46]:
#cbb_db_list = [page_db14, page_db15, page_db16, page_db17, page_db18, page_db19]
#pickled_spreads = []
#for file in cbb_db_list:
#    pickled_spreads.append(pickle_db(file))

In [None]:
#ml_db_list = [money14, money15, money16, money17, money18, money19]
#pickled_mls = []
#for file in ml_db_list:
#    pickled_mls.append(pickle_db(file, bet_type='moneyline'))

In [9]:
# File names of the per-season pickles, following the naming scheme used by
# pickle_db(): 'NCAAB_<year>.pickle' for spreads, 'NCAAB_ML<year>.pickle' for moneylines.
pickled_spreads = ['NCAAB_2014.pickle', 'NCAAB_2015.pickle', 'NCAAB_2016.pickle', 'NCAAB_2017.pickle', 'NCAAB_2018.pickle', 'NCAAB_2019.pickle']
pickled_mls = ['NCAAB_ML2014.pickle', 'NCAAB_ML2015.pickle', 'NCAAB_ML2016.pickle', 'NCAAB_ML2017.pickle', 'NCAAB_ML2018.pickle', 'NCAAB_ML2019.pickle']

In [10]:
def update_pickle(pickled_filenames, bet_type=None):
    """
    Load the pickled page databases, refreshing any that are not complete.

    For each file whose 'last_update' is not 'Complete', the pages from the
    last update through today are downloaded again (from ML_URL when bet_type
    is 'moneyline', otherwise from SPREAD_URL), merged into the database, and
    the database is re-pickled and reloaded.

    Parameters
    ----------
    pickled_filenames : iterable of str
        Paths of pickle files written by pickle_db().
    bet_type : str or None
        Passed on to pickle_db() and used to pick the download URL.

    Returns
    -------
    list of dict
        One database per file that could be loaded, in input order. Files
        that cannot be opened or unpickled are reported and skipped.
    """
    memory_db = []
    for filename in pickled_filenames:
        try:
            # NOTE: pickle.load can execute arbitrary code; only load pickles
            # that this notebook wrote itself.
            with open(filename, 'rb') as pickle_in:
                season_db = pickle.load(pickle_in)
        except (OSError, pickle.UnpicklingError, EOFError) as err:
            # Skip this file instead of falling through with a stale (or
            # undefined) database from a previous iteration.
            print('Could not load {}: {}'.format(str(filename), err))
            continue

        if season_db['last_update'] == 'Complete':
            memory_db.append(season_db)
            continue

        # Date range from the last update until the current date.
        date_range = season_dates(season_db['last_update'],
                                  season_date_format('today'))
        # Moneyline databases must be refreshed from the moneyline pages.
        url = ML_URL if bet_type == 'moneyline' else SPREAD_URL
        updated_page_db = page_db(date_range, URL=url)

        # NOTE(review): kept from the original. 'last_update' looks like a
        # Timestamp while 'end_date' and the page keys are 'YYYYMMDD' strings,
        # so this branch presumably never fires - confirm the intent.
        if season_db['last_update'] == season_db['end_date']:
            del season_db['pages'][season_db['last_update']]

        season_db['pages'].update(updated_page_db['pages'])
        season_db['last_update'] = updated_page_db['last_update']

        pickled_name = pickle_db(season_db, bet_type=bet_type)
        print('Updated:', pickled_name)
        # Reload from disk so memory holds exactly what was persisted.
        with open(pickled_name, 'rb') as updated_open:
            memory_db.append(pickle.load(updated_open))

    return memory_db

In [11]:
# Load (and, where needed, refresh) every spread database into memory.
pickled_spread = update_pickle(pickled_spreads, bet_type='spread')

Updated: NCAAB_2019.pickle


In [12]:
# Load (and, where needed, refresh) every moneyline database into memory.
# NOTE(review): this rebinds `pickled_mls` from the list of file names to the
# list of loaded databases, so running this cell a second time would pass
# dictionaries to update_pickle() instead of file names.
pickled_mls = update_pickle(pickled_mls, bet_type='moneyline')

Updated: NCAAB_ML2019.pickle


In [208]:
def parse_line(line):
    """
    Takes a line of HTML from SBR and parses out the spread and odds.

    Parameters
    ----------
    line : bs4 Tag (anything with .attrs['id'] and .text)
        line.text is expected in the form '+12½ -106' or 'PK -110'.

    Returns
    -------
    list
        [game_id, team_id, spread, odds] when the element has text; odds is
        None when the text carries no odds. '½' is rewritten as '.5' and a
        pick'em is reported with spread 'PK'.
        [game_id, team_id, None] when the text is empty or only whitespace
        (including non-breaking spaces).

    Raises
    ------
    IndexError
        If the element id does not contain the expected game / team ids.
    """
    line_id = line.attrs['id']
    game_id = re.findall(r'(\d{5,7})', line_id)
    team_id = re.findall(r'(\d{3,4}).[1]$', line_id)

    # Normalise non-breaking spaces first so a cell holding only '\xa0'
    # counts as empty instead of raising IndexError further down.
    game_line = line.text.replace(u'\xa0', u' ')
    if not game_line.strip():
        return [game_id[0], team_id[0], None]

    game_line = game_line.replace('½', '.5')
    if 'PK' in game_line:
        bet_info = re.findall(r'([-+][\d]*)', game_line)
        spread = 'PK'
        # A bare 'PK' without odds used to raise IndexError.
        odds = bet_info[0] if bet_info else None
    else:
        bet_info = re.findall(r'([^\s]+)', game_line)
        spread = bet_info[0]
        odds = bet_info[1] if len(bet_info) > 1 else None
    return [game_id[0], team_id[0], spread, odds]

In [14]:
def line_grabber(soup):
    """
    Takes a SBR soup object as input and returns a tuple of dictionaries,
    one per sportsbook, in the order
    (opener, pinnacle, fiveDimes, bookMaker, betOnline).

    Each dictionary holds a 'book' entry with the sportsbook name plus one
    entry per game id whose value is the list of parse_line() results for
    that game with the game id removed.

    Opener lines are recognised by 'eventLineOpener' in the element markup.
    Every other line is attributed by the sportsbook field of its element id
    (the second '-<digits>' group, the same field ml_line_grabber reads).
    Substring checks over the whole markup were dropped because short ids
    such as '19' also occur inside game ids, team ids and odds like '-119'.
    Lines from sportsbooks that are not tracked are ignored.
    """
    opener = {'book' : 'Openers'}
    books = {pinnacle_id : {'book' : 'Pinnacle'},
             fiveDimes_id : {'book' : 'FiveDimes'},
             bookMaker_id : {'book' : 'BookMaker'},
             betOnline_id : {'book' : 'BetOnline'}}

    for line in soup.find_all(class_='eventLine-book-value'):
        if not line.has_attr('id'):
            continue
        game_data = parse_line(line)
        game_id = game_data.pop(0)

        if 'eventLineOpener' in str(line):
            target = opener
        else:
            # r'-(\d*)' keeps empty fields so positions stay aligned with
            # the way ml_line_grabber indexes the same id.
            id_fields = re.findall(r'-(\d*)', line.attrs['id'])
            sb_id = id_fields[1] if len(id_fields) > 1 else None
            target = books.get(sb_id)
            if target is None:
                continue
        target.setdefault(game_id, []).append(game_data)

    return (opener, books[pinnacle_id], books[fiveDimes_id],
            books[bookMaker_id], books[betOnline_id])

In [330]:
def ml_line_grabber(ml_soup):
    """
    Collects the moneyline odds from a SBR moneyline soup object.

    Returns a list with one entry per kept line:
    [game_id, team_id, sb_id, ml_odds, opener], where game_id, sb_id and
    team_id are the first three '-<digits>' fields of the element id, ml_odds
    is the first signed number in the element text (None when there is none)
    and opener is True when the markup contains 'eventLineOpener'.

    A line is kept when it is an opener line or when its sportsbook field is
    exactly one of the tracked sportsbook ids. The earlier substring
    include / exclude checks over the whole id were replaced because they
    dropped tracked lines whose game or team id happened to contain an
    excluded number and admitted untracked ones by coincidence.
    """
    tracked_books = {pinnacle_id, fiveDimes_id, bookMaker_id, betOnline_id}
    data = []
    for line in ml_soup.find_all(class_='eventLine-book-value'):
        info_string = line.attrs.get('id')
        if info_string is None:
            continue

        game_info = re.findall(r'-(\d*)', info_string)
        if len(game_info) < 3:
            # Not a game / sportsbook / team id; nothing to record.
            continue
        game_id, sb_id, team_id = game_info[0], game_info[1], game_info[2]

        is_opener = 'eventLineOpener' in str(line)
        if not is_opener and sb_id not in tracked_books:
            continue

        odds_line = re.findall(r'([-|+][\d]*)', line.text)
        ml_odds = odds_line[0] if len(odds_line) > 0 else None
        data.append([game_id, team_id, sb_id, ml_odds, is_opener])
    return data

In [249]:
def _team_boxscore(team_row):
    """Return one team's period scores followed by its final score."""
    scores = [cell.text for cell in team_row.find_all(class_='period')]
    first_total = team_row.find_all(class_='first total')
    total = team_row.find_all(class_='total')

    if len(first_total) >= 1:
        scores.append(first_total[0].text)
    elif len(total) >= 1:
        scores.append(total[0].text)
    else:
        scores.append(np.nan)

    # Empty cells become NaN, cells holding only a non-breaking space None.
    for idx, value in enumerate(scores):
        if value == '':
            scores[idx] = np.nan
        elif value == u'\xa0':
            scores[idx] = None
    return scores


def boxscore(soup, game_id):
    """
    Scrapes the boxscore for a game.

    Looks up the element with class 'score' and id 'score-<game_id>' and
    reads its first two children as the home and away rows.

    Returns a list with 2 lists, [home, away]; each holds the period scores
    followed by the final score (NaN when no total is present).

    The home total used to be indexed twice (once when it was looked up and
    again when it was read), which failed whenever only a plain 'total'
    element was present; both teams now go through the same helper.
    """
    contents = soup.find(class_='score', id='score-' + game_id).contents
    return [_team_boxscore(contents[0]), _team_boxscore(contents[1])]

In [251]:
def json_data(soup):
    """
    Parse the JSON-LD game descriptions embedded in a SBR soup object.

    Every "application/ld+json" element whose name is not the empty
    matchup " vs " becomes one entry of the result, keyed by the game id
    used by the website. Each entry is a dict with three parts:
      'home' : {'name': ..., 'home_score': [...]}
      'away' : {'name': ..., 'away_score': [...]}
      'info' : game id, matchup text, url, arena, city, state, date, time
               and the full start timestamp
    The scores are filled in from boxscore() once all games are collected.
    """
    games = {}
    for script_tag in soup.select("[type='application/ld+json']"):
        payload = json.loads(script_tag.text)
        matchup = payload['name']
        if matchup == " vs ":
            continue

        # Normalise the matchup text before splitting it into the two teams.
        matchup = matchup.replace('  ', ' & ')
        matchup = matchup.replace('AM', 'A&M')
        home_name = re.findall('.+?(?= vs)', matchup)[0]
        away_name = re.findall('(?=vs).*', matchup)[0][3:]

        game_key = payload['@id']
        game_url = payload['url']
        starts_at = payload['startDate']

        venue = payload['location']
        arena_name = venue['name']
        address = venue['address']
        city = address['addressLocality'] if 'addressLocality' in address else np.nan
        state = address['addressRegion'] if 'addressRegion' in address else np.nan

        games[game_key] = {
            'home': {'name' : home_name},
            'away': {'name' : away_name},
            'info': {'game_id' : game_key, 'teams' : matchup, 'url' : game_url,
                     'arena' : arena_name, 'city' : city, 'state' : state,
                     'date' : starts_at[:10], 'time' : starts_at[11:],
                     'date_time' : starts_at},
        }

    # Second pass: attach the scraped box scores to every collected game.
    for game_key, parts in games.items():
        home_scores, away_scores = boxscore(soup, game_key)[:2]
        parts['home']['home_score'] = home_scores
        parts['away']['away_score'] = away_scores

    return games

In [198]:
def match_id_name(soup):
    """
    Retrieves and matches Team Names with their ID, link and ranking.

    Every element with class 'team-name' contributes one entry.

    Returns a dictionary with the team name as key and
    [team_id, href, rank] as value, where team_id is the element's 'rel'
    attribute, href is the target of its first link and rank is the number
    found in a leading '(N)' prefix of the name (None when there is none).
    Elements without a 'rel' attribute are reported with print and skipped.
    """
    id_name_list = {}
    for data in soup.find_all(class_='team-name'):
        href = data.find_all('a', href=True)[0]['href']
        team_name = data.text.replace(u'\xa0', u'')

        rank = None
        if '(' in team_name or ')' in team_name:
            rank_match = re.findall(r'^[(]([\d]*)[)]', team_name)
            name_match = re.findall(r'([A-Z].*[a-z]*)', team_name)
            # Only strip the prefix when it really is a ranking; names such
            # as 'Miami (OH)' are kept untouched.
            if rank_match and name_match:
                rank = rank_match[0]
                team_name = name_match[0]

        if 'rel' not in data.attrs:
            print (team_name)
            continue

        # An empty ranking '()' is stored as None, like a missing one.
        id_name_list[team_name] = [data.attrs['rel'], href, rank if rank else None]

    return id_name_list

In [19]:
def pair_info_json(oJson, name_to_id):
    """
    Attach team name, id and rank to the home / away parts of every game.

    oJson      : dict as returned by json_data()
    name_to_id : dict mapping team name -> [team_id, href, rank]

    A team is considered for a game when its href equals the game's url; it
    is then assigned to the side(s) whose 'name' contains the team name.
    oJson is updated in place and also returned.
    """
    for details in oJson.values():
        game_url = details['info']['url']
        home_label = details['home']['name']
        away_label = details['away']['name']

        for team_name, team_record in name_to_id.items():
            if team_record[1] != game_url:
                continue
            if team_name in home_label:
                home_side = details['home']
                home_side['home_name'] = team_name
                home_side['home_id'] = team_record[0]
                home_side['rank'] = team_record[2]
            if team_name in away_label:
                away_side = details['away']
                away_side['away_name'] = team_name
                away_side['away_id'] = team_record[0]
                away_side['rank'] = team_record[2]
    return oJson

In [334]:
def _team_side(json_game, team_id):
    """Return the 'home' or 'away' part of json_game whose id equals team_id, else None."""
    if team_id == json_game['home']['home_id']:
        return json_game['home']
    if team_id == json_game['away']['away_id']:
        return json_game['away']
    return None


def ml_spread_combiner(money_line_grabber_result, line_grabber_dict, json_dict):
    """
    Uses the helper dictionary sb_key in order to combine the money lines for games with spreads.

    Parameters
    ----------
    money_line_grabber_result : dict
        date -> list of [game_id, team_id, sb_id, ml_odds, opener] entries
        (the output of ml_line_grabber).
    line_grabber_dict : dict
        date -> tuple returned by line_grabber
        (opener, pinnacle, fiveDimes, bookMaker, betOnline).
    json_dict : dict
        date -> game_id -> {'home', 'away', 'info'}; updated in place.

    Returns
    -------
    dict
        json_dict, where each matched team part has gained
        'spread'     : {book name: [spread, odds]}, or None for a book that
                       has no lines for the game
        'money_line' : {book name: moneyline odds}

    A book without lines for a game now adds its None entry to the team's
    existing 'spread' dict instead of replacing the whole dict, which used
    to discard the spreads already stored for other books.
    """
    for date, games in money_line_grabber_result.items():
        if not games:
            continue

        # The lookup table only depends on the date, so build it once.
        day_spreads = line_grabber_dict[date]
        sb_key = {'openers' : ['opener', day_spreads[0]],
                  pinnacle_id : ['Pinnacle', day_spreads[1]],
                  fiveDimes_id : ['Five Dimes', day_spreads[2]],
                  bookMaker_id : ['BookMaker', day_spreads[3]],
                  betOnline_id : ['BetOnline', day_spreads[4]]}

        for game in games:
            game_id, team_id, sbid, ml_odds, is_opener = game[0], game[1], game[2], game[3], game[4]
            if is_opener:
                sbid = 'openers'
            json_game = json_dict[date][game_id]

            for bookie, book_lines in sb_key.values():
                try:
                    spreads = book_lines[game_id]
                except KeyError:
                    # This book has no lines for the game: record None for
                    # it without touching the other books' spreads.
                    side = _team_side(json_game, team_id)
                    if side is not None:
                        side.setdefault('spread', {})[bookie] = None
                    continue

                for spread in spreads:
                    if team_id != spread[0]:
                        continue
                    side = _team_side(json_game, team_id)
                    if side is None:
                        continue
                    side.setdefault('money_line', {})[sb_key[sbid][0]] = ml_odds
                    side.setdefault('spread', {})[bookie] = spread[1:]
    return json_dict

In [333]:
 def clean_soup(spread_page_range, ml_page_range):
    """
    Turn downloaded spread and moneyline pages into combined game records.

    Both inputs are dictionaries keyed by date ('YYYYMMDD') whose values
    expose the page markup through `.content`. For every spread page the
    game descriptions, team ids and spreads are extracted; for every
    moneyline page the moneylines are extracted. The result of
    ml_spread_combiner() over those three collections is returned.
    """
    spreads_by_date = {}
    games_by_date = {}
    for day, response in spread_page_range.items():
        day_soup = BeautifulSoup(response.content, 'html.parser')
        games = json_data(day_soup)           # games[game_id][home/away/info][...]
        team_lookup = match_id_name(day_soup)  # team_name : [team_id, href, rank]
        paired_games = pair_info_json(games, team_lookup)
        spreads_by_date[day] = line_grabber(day_soup)
        games_by_date[day] = paired_games

    moneylines_by_date = {}
    for day, response in ml_page_range.items():
        ml_soup = BeautifulSoup(response.content, 'html.parser')
        moneylines_by_date[day] = ml_line_grabber(ml_soup)

    return ml_spread_combiner(moneylines_by_date, spreads_by_date, games_by_date)

In [85]:
# Fetch the spread and moneyline pages for the first day of the 2014 season.
# Both are stored as {date string: Response} dictionaries, the shape clean_soup() iterates over.
page = {'20141114': requests.get(SPREAD_URL+str(cbb14[0]))}
mlpage = {'20141114' : requests.get(ML_URL + str(cbb14[0]))}

In [47]:
# `page` and `mlpage` are {date string: Response} dictionaries, so the Response
# has to be looked up by its date key before its content can be parsed.
soup = BeautifulSoup(page['20141114'].content, 'html.parser')
mlsoup = BeautifulSoup(mlpage['20141114'].content, 'html.parser')

In [48]:
# Try the two grabbers on the single-day soups.
game_spreads = line_grabber(soup)
money_lines = ml_line_grabber(mlsoup)
# Order of the tuple returned by line_grabber:
#(opener, pinnacle, fiveDimes, bookMaker, betOnline)

In [331]:
# Run the full pipeline on the single-day {date: Response} dictionaries.
cs = clean_soup(page, mlpage)