In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import date
import os
import time
import pickle

# BS4 Match Odds

In [2]:
url = 'https://www.bettingodds.com/football/premier-league'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
match_slugs = [a['href']  for a in soup.find_all('a', class_='oddsstats', href=True)]

# Selenium More Detailed Odds

In [3]:
driver = webdriver.Chrome('/usr/local/bin/chromedriver')

  driver = webdriver.Chrome('/usr/local/bin/chromedriver')


In [4]:
def close_popup_if_exists():
    try: 
        driver.find_element('class name', 'close-modal').click()
    except: 
        pass

In [5]:
def close_cookies_wrapper_if_exists():
    try:
        driver.find_element('class name', 'accept-cross-button cookie-button-link').click()
    except:
        pass

In [6]:
def get_match_info(match_slug):
    match_url = 'https://www.bettingodds.com' + match_slug
    driver.get(match_url)
    #driver.execute_script("document.body.style.zoom='30%'")
    
    # Let page fully load
    time.sleep(.25)

    # Close Pop-up if Exists and Navigate to Correct Table
    close_popup_if_exists()
    close_cookies_wrapper_if_exists()
    
    #After Closing Pop-up, again give time for page load
    time.sleep(2)
    
    # Get Names of Teams Playing Match and Position of Each Table on Page.
    team_names = [t.text for t in driver.find_elements('class name', 'team-name')]
    match_date = driver.find_element('class name', 'match-date').text
    print(f'{team_names}: {match_date}')
    
    tables_names = {tn.text: idx for idx, tn in enumerate(driver.find_elements('class name', 'mtb-header'))}
    
    return driver, tables_names, match_date

In [7]:
def get_stats_odds(driver, tables_names, match_date, stat):
    '''Takes a selenium webdriver
    and returns dataframe of probabilities of the scoreline.
    Expects stat = 'Anytime Assist' or 'Anytime Goal Scorer'
    '''
    
    driver.execute_script("window.scrollTo(0, 150)")
    try:
      idx = tables_names[stat]
    except:
      print(f'Couldnt find a table for {stat}. Skipping this match.')
      return
    
    # Click on the dropdown to expand the correct score block, and load the score data elements
    driver.execute_script("arguments[0].scrollIntoView();", driver.find_elements('class name', 'mtb-header')[idx])
    driver.find_elements('class name', 'mtb-header')[idx].click()
    stat_data = driver.find_elements('class name', 'mtb-content')[idx]
    print('** found stat data')
    
    # Scroll to the "view more" button and click it, so all data is available
    driver.execute_script("arguments[0].scrollIntoView();", score_data.find_element('class name', 'ot-view-more'))
    stat_data.find_element('class name', 'ot-view-more').click()
    
    # Loop through each row of scoreline data to build table.
    print('** collecting stat data')
    player = []
    odds = []
    for row in stat_data.find_elements('class name', 'results-row'):
        odds_grid = row.find_element('class name', 'grid-odds-list').find_elements('tag name', 'li')
        for odds_val in odds_grid:
            if odds_val.get_attribute('data-runner'):
                player.append(odds_val.get_attribute('data-runner'))
                odds.append(odds_val.get_attribute('data-decimal'))
                
    # Make Dataframe with Resulting Data
    stat_col = 'odds_' + stat.replace('Anytime ', '').lower().replace(' ', '_')
    df = pd.DataFrame(zip(player, odds), columns=['player', stat_col])
    
    #odds of 0 don't make sense, remove
    df[stat_col] = df[stat_col].astype('float')
    df = df[df[stat_col] > 0]
    
    aggs = ['mean', 'min', 'max', 'count']
    
    df = (df
      .groupby(['player'])[stat_col]
      .agg(aggs)
      .reset_index()
      .rename(columns={a: f'{stat_col}_{a}' for a in aggs})
    )
    
    df['match_date'] = match_date
    
    print('** done')
    return df

In [10]:
all_assists = pd.DataFrame()
all_goals = pd.DataFrame()

for slug in match_slugs:
    print(f'Beginning scraping data for {slug}')
    driver, tables_names, match_date = get_match_info(slug)
    print(f'Beginning scraping assists for {slug}')
    game_df_assists = get_stats_odds(driver, tables_names, match_date, 'Anytime Assist')
    print(f'Beginning scraping goals for {slug}')
    game_df_goals = get_stats_odds(driver, tables_names, match_date,'Anytime Goalscorer')
    
    all_assists = pd.concat([all_assists, game_df_assists])
    all_goals = pd.concat([all_goals, game_df_goals])

driver.quit() # quit the webdriver

Beginning scraping data for /football/everton-liverpool
['Everton', 'Liverpool']: Sat 3 Sep
Beginning scraping assists for /football/everton-liverpool
** found stat data


ElementClickInterceptedException: Message: element click intercepted: Element <div class="ot-view-more">...</div> is not clickable at point (1121, 1726). Other element would receive the click: <img src="/media/images/admin/site-takeover-banners/sports-football-992x110-ukenglish.gif">
  (Session info: chrome=105.0.5195.102)
Stacktrace:
0   chromedriver                        0x000000010fc17788 chromedriver + 4515720
1   chromedriver                        0x000000010fb9b9d3 chromedriver + 4008403
2   chromedriver                        0x000000010f82e12a chromedriver + 413994
3   chromedriver                        0x000000010f86d1b2 chromedriver + 672178
4   chromedriver                        0x000000010f86ac27 chromedriver + 662567
5   chromedriver                        0x000000010f868234 chromedriver + 651828
6   chromedriver                        0x000000010f866f57 chromedriver + 646999
7   chromedriver                        0x000000010f85a861 chromedriver + 596065
8   chromedriver                        0x000000010f8827d2 chromedriver + 759762
9   chromedriver                        0x000000010f85a075 chromedriver + 594037
10  chromedriver                        0x000000010f88292e chromedriver + 760110
11  chromedriver                        0x000000010f895bd9 chromedriver + 838617
12  chromedriver                        0x000000010f882603 chromedriver + 759299
13  chromedriver                        0x000000010f858990 chromedriver + 588176
14  chromedriver                        0x000000010f859a75 chromedriver + 592501
15  chromedriver                        0x000000010fbe76cd chromedriver + 4318925
16  chromedriver                        0x000000010fbebf35 chromedriver + 4337461
17  chromedriver                        0x000000010fbf31ff chromedriver + 4366847
18  chromedriver                        0x000000010fbecc5a chromedriver + 4340826
19  chromedriver                        0x000000010fbc2c2c chromedriver + 4168748
20  chromedriver                        0x000000010fc094f8 chromedriver + 4457720
21  chromedriver                        0x000000010fc09693 chromedriver + 4458131
22  chromedriver                        0x000000010fc1ea3e chromedriver + 4545086
23  libsystem_pthread.dylib             0x00007ff803aef4e1 _pthread_start + 125
24  libsystem_pthread.dylib             0x00007ff803aeaf6b thread_start + 15


In [None]:
df = pd.merge(all_assists, all_goals, how='outer', on=['player', 'match_date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 465 entries, 0 to 464
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   player                 465 non-null    object 
 1   odds_assist_mean       398 non-null    float64
 2   odds_assist_min        398 non-null    float64
 3   odds_assist_max        398 non-null    float64
 4   odds_assist_count      398 non-null    float64
 5   match_date             465 non-null    object 
 6   odds_goalscorer_mean   465 non-null    float64
 7   odds_goalscorer_min    465 non-null    float64
 8   odds_goalscorer_max    465 non-null    float64
 9   odds_goalscorer_count  465 non-null    int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 40.0+ KB


In [None]:
# Convert match date to datetime
df['match_date'] = pd.to_datetime(df.match_date + [' 2022'], format='%a %d %b %Y')

In [None]:
df['proba_assist'] = 1 / df.odds_assist_mean
df['proba_goal'] = 1 / df.odds_goalscorer_mean

In [None]:
date_path = date.today().strftime('%Y_%m_%d')
path = f'../data/historical/{date_path}'

if not os.path.exists(path):
    os.makedirs(path)
    
# save to historical folder as well
df.to_pickle(f'{path}/goals_assists_odds.pkl')

In [None]:
assists_odds_min = 1
goalscorer_odds_min = 5

# odds that are only from a few bookies don't look as trustworthy -- remove those below the mins

df = df[(df.odds_assist_count > assists_odds_min) & (df.odds_goalscorer_count > goalscorer_odds_min)]

In [None]:
df.sort_values('odds_goalscorer_mean').head(25)

Unnamed: 0,player,odds_assist_mean,odds_assist_min,odds_assist_max,odds_assist_count,match_date,odds_goalscorer_mean,odds_goalscorer_min,odds_goalscorer_max,odds_goalscorer_count,proba_assist,proba_goal
201,Erling Braut Haaland,3.5,3.5,3.5,2.0,2022-08-27,1.44026,1.36364,1.57,8,0.285714,0.694319
185,Mohamed Salah,3.633333,2.4,6.0,3.0,2022-08-27,1.55254,1.5,1.72,9,0.275229,0.644106
245,Gabriel Jesus,3.75,3.75,3.75,2.0,2022-08-27,1.701454,1.53,2.0,9,0.266667,0.587732
213,Julian Alvarez,3.5,3.5,3.5,2.0,2022-08-27,1.727777,1.53,2.2,9,0.285714,0.578779
164,Diogo Jota,3.05,2.45,3.4,3.0,2022-08-27,1.754934,1.7,1.8,7,0.327869,0.569822
370,Harry Kane,3.666667,3.6,3.8,3.0,2022-08-28,1.865047,1.8,2.0,9,0.272727,0.53618
9,Cristiano Ronaldo,5.25,4.5,5.5,4.0,2022-08-27,1.879885,1.8,2.1,8,0.190476,0.531947
189,Roberto Firmino,2.883333,2.875,2.9,3.0,2022-08-27,1.930629,1.75,2.2,9,0.346821,0.517966
241,Eddie Nketiah,4.4,4.2,4.5,3.0,2022-08-27,1.987407,1.75,2.2,9,0.227273,0.503168
394,Son Heung-min,3.033333,3.0,3.1,3.0,2022-08-28,2.104814,1.8,2.6,9,0.32967,0.475101


In [None]:
df.to_pickle('../data/goals_assists_odds.pkl')

In [9]:
import sys
sys.path.append('/Users/andrew.peters/Documents/FPL/')
from odds_model.utils import betting_odds_scraper

ImportError: cannot import name 'betting_odds_scraper' from 'odds_model.utils' (unknown location)

In [None]:
odds_model/utils/betting_odds_scaper.py