In [37]:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from datetime import date
import pickle

# BS4 Match Odds

In [38]:
url = 'https://www.bettingodds.com/football/premier-league'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
match_slugs = [a['href']  for a in soup.find_all('a', class_='oddsstats', href=True)]

# Selenium More Detailed Odds

In [39]:
driver = webdriver.Chrome('/usr/local/bin/chromedriver')

  driver = webdriver.Chrome('/usr/local/bin/chromedriver')


In [40]:
def close_popup_if_exists():
    try: 
        driver.find_element('class name', 'close-modal').click()
    except: 
        pass

In [41]:
def get_scoreline_odds(match_slug):
    '''Takes match_slug of the format /football/aston-villa-everton
    and returns dataframe of probabilities of the scoreline
    '''
    
    match_url = 'https://www.bettingodds.com' + match_slug
    driver.get(match_url)
    driver.execute_script("document.body.style_zoom='30%'")
    
    # Let page fully load
    time.sleep(.25)
    # Close Pop-up if Exists and Navigate to Correct Table
    close_popup_if_exists()
    time.sleep(2.5)
    
    # Get Names of Teams Playing Match and Position of Each Table on Page.
    team_names = [t.text for t in driver.find_elements('class name', 'team-name')]
    match_date = driver.find_element('class name', 'match-date').text
    print(f'{team_names}: {match_date}')
    tables_names = {tn.text: idx for idx, tn in enumerate(driver.find_elements('class name', 'mtb-header'))}
    score_idx = tables_names['Correct Score']
    
    # Click on the dropdown to expand the correct score block, and load the score data elements
    driver.execute_script("arguments[0].scrollIntoView();", driver.find_elements('class name', 'mtb-header')[score_idx])
    driver.find_elements('class name', 'mtb-header')[score_idx].click()
    score_data = driver.find_elements('class name', 'mtb-content')[score_idx]
    print('** found score data')
    
    # Scroll to the "view more" button and click it, so all data is available
    driver.execute_script("arguments[0].scrollIntoView();", score_data.find_element('class name', 'ot-view-more'))
    score_data.find_element('class name', 'ot-view-more').click()
    
    # Loop through each row of scoreline data to build table.
    print('** collecting score data')
    home_score = []
    away_score = []
    odds = []
    for row in score_data.find_elements('class name', 'results-row'):
        odds_grid = row.find_element('class name', 'grid-odds-list').find_elements('tag name', 'li')
        for odds_val in odds_grid:
            if odds_val.get_attribute('data-runner'):
                scoreline = odds_val.get_attribute('data-runner').split('-')
                home_score.append(scoreline[0])
                away_score.append(scoreline[1])
                odds.append(odds_val.get_attribute('data-decimal'))
                
    # Make Dataframe with Resulting Data
    print('** compiling score data')
    df = pd.DataFrame(zip(home_score, away_score, odds), columns=['home_score', 'away_score', 'odds'], dtype='float')
    df['home_team'] = team_names[0]
    df['away_team'] = team_names[1]
    df['match_date'] = match_date
    
    #odds of 0 don't make sense, remove
    df = df[df.odds > 0]
    
    df = (df
      .groupby(['home_team', 'away_team', 'match_date', 'home_score', 'away_score'])['odds']
      .mean()
      .reset_index()
    )
    
    print('** done')
    return df

In [42]:
scraped_scorelines = pd.DataFrame()

for slug in match_slugs:
    print(f'Working on {slug}')
    game_df = get_scoreline_odds(slug)
    scraped_scorelines = pd.concat([scraped_scorelines, game_df])

Working on /football/man-utd-southampton
['Southampton', 'Man Utd']: Sat 27 Aug
** found score data
** collecting score data
** compiling score data
** done
Working on /football/brentford-everton
['Brentford', 'Everton']: Sat 27 Aug
** found score data
** collecting score data
** compiling score data
** done
Working on /football/brighton-leeds
['Brighton', 'Leeds']: Sat 27 Aug
** found score data
** collecting score data
** compiling score data
** done
Working on /football/leicester-chelsea
['Chelsea', 'Leicester']: Sat 27 Aug
** found score data
** collecting score data
** compiling score data
** done
Working on /football/liverpool-bournemouth
['Liverpool', 'Bournemouth']: Sat 27 Aug
** found score data
** collecting score data
** compiling score data
** done
Working on /football/man-city-crystal-palace
['Man City', 'Crystal Palace']: Sat 27 Aug
** found score data
** collecting score data
** compiling score data
** done
Working on /football/arsenal-fulham
['Arsenal', 'Fulham']: Sat 2

KeyboardInterrupt: 

In [43]:
scraped_scorelines.head(5)

Unnamed: 0,home_team,away_team,match_date,home_score,away_score,odds
0,Southampton,Man Utd,Sat 27 Aug,0.0,0.0,15.5
1,Southampton,Man Utd,Sat 27 Aug,0.0,1.0,9.611111
2,Southampton,Man Utd,Sat 27 Aug,0.0,2.0,10.277778
3,Southampton,Man Utd,Sat 27 Aug,0.0,3.0,15.5
4,Southampton,Man Utd,Sat 27 Aug,0.0,4.0,28.375


In [44]:
scorelines_h = (scraped_scorelines[['home_team', 'home_score', 'away_score', 'match_date', 'odds']]
                       .rename(columns={'home_team': 'team', 'away_team': 'opp_team',
                                        'home_score': 'gf', 'away_score': 'ga'})
                       )

scorelines_a = (scraped_scorelines[['away_team', 'away_score', 'home_score', 'match_date', 'odds']]
                       .rename(columns={'away_team': 'team', 'home_team': 'opp_team',
                                        'away_score': 'gf', 'home_score': 'ga'})
                       )


In [45]:
scorelines_h['at_home'] = 1
scorelines_a['at_home'] = 0

all_scorelines = pd.concat([scorelines_h, scorelines_a])

# convert goals for and against to int
all_scorelines['gf'] = all_scorelines.gf.astype('int')
all_scorelines['ga'] = all_scorelines.ga.astype('int')

# Drop scorelines with over 5 goals
all_scorelines = all_scorelines[(all_scorelines.gf <= 5) & (all_scorelines.ga <= 5)]

# convert odds to implied probability
all_scorelines['proba'] = 1 / all_scorelines.odds

# Convert match_date column to datetime.
all_scorelines['match_date'] = pd.to_datetime(all_scorelines.match_date + [' 2022'], format='%a %d %b %Y')

# The bookies pad their odds, so the proabilities are over 100%. Readjust.
all_scorelines.proba = all_scorelines.proba / all_scorelines.groupby('team')['proba'].transform(sum)

In [46]:
all_scorelines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1008 entries, 0 to 55
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   team        1008 non-null   object        
 1   gf          1008 non-null   int64         
 2   ga          1008 non-null   int64         
 3   match_date  1008 non-null   datetime64[ns]
 4   odds        1008 non-null   float64       
 5   at_home     1008 non-null   int64         
 6   proba       1008 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(1)
memory usage: 63.0+ KB


In [47]:
all_scorelines.team.unique()

array(['Southampton', 'Brentford', 'Brighton', 'Chelsea', 'Liverpool',
       'Man City', 'Arsenal', 'Aston Villa', 'Wolves', 'Nottm Forest',
       'Crystal Palace', 'Fulham', 'Leeds', 'Man Utd', 'Everton',
       'Leicester', 'Bournemouth', 'West Ham', 'Newcastle', 'Tottenham'],
      dtype=object)

In [48]:
all_scorelines.to_pickle('../data/scoreline_probabilities.pkl')

In [49]:
date_path = date.today().strftime('%Y_%m_%d')
path = f'../data/historical/{date_path}'

if not os.path.exists(path):
    os.makedirs(path)

# save to historical folder as well
all_scorelines.to_pickle(f'{path}/scoreline_probabilities.pkl')