# PACKAGES IMPORTATION

In [12]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import concurrent.futures
from selenium.webdriver.common.keys import Keys
import pandas as pd
import numpy as np
import os
import re
import time

# FOLDER FOR DATA GATHERING

In [13]:
#working directories
BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
DATA_DIR = os.path.join(BASE_DIR, 'get_data','data')

# URLS FOR SEASONS SCRAPED

In [14]:
# Creation of urls to scrape for all seasons
#range of seasons wished to scrape
seasons_wished = list(reversed(range(2008,2022)))

#complement of all urls
domain_name = 'https://www.oddsportal.com/'
file_path = 'basketball/usa/nba'
rest_of_url = '/results/'

#first url = current season
main_url = domain_name + file_path + rest_of_url

#all other season url
seasons_url = [domain_name + file_path + '-' + str(season) + '-' + str(season + 1) + rest_of_url for season in seasons_wished]

#complete url list to be scraped
all_url_seasons = [main_url] + seasons_url

#all_user_seasons

In [15]:
#function to check if variable is null
def is_empty(col):
    try:
        result = col.text
    except:
        result = None
    return result

# OPTIONS DOE WEBDRIVER

In [16]:
#beautiful soup and selenium objects
options = Options()

#headless mode
options.headless = True

#disable useful tools
options.add_argument("enable-automation")
options.add_argument("--disable-infobars")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")

#maximized UI to load totally the page
options.add_argument("start-maximized")

#loading options for the driver 
driver = webdriver.Chrome(options=options)

# FUNCTION TO SCRAPE DATA FROM UNIQUE EVENT

In [17]:
#scrape_data is a function which scrapes data event by event.
#here are features scraped : 
    #ft_hp = full-time home points
    #ft_ap = full-time away points
    #ftd_hp = full-time home points with extra time thanks to draw. 
    #ftd_ap = full-time away points with extra time thanks to draw. 
    #aver_home_odd = average home winning odd
    #aver_away_odd = average away winning odd
    #high_home_odd = highest home winning odd
    #high_away_odd = highest away winning odd
    
def scrape_data(url):
    global ftd_hp, ftd_ap
    #loading options for the driver 
    driver = webdriver.Chrome(options=options)

    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    #empty list for data scraped
    data = {} 
    
    #event-names
    event_names = soup.find('div', attrs = {'id' : 'col-content'})
    
    #full-time result 
    ft_result = soup.find(attrs = {'class' : 'result'}) 
    
    #extraction of average and highest data from soup
    aver = soup.find('tr', attrs = {'class' : 'aver'})
    high = soup.find('tr', attrs = {'class' : 'highest'})
    
    aver_home_odd, aver_away_odd = 0, 0
    if aver:
        #average home winning odd
        aver_home_odd = aver.find('td', attrs = {'class' : "right"}).text
        #average away winning odd
        aver_away_odd =  aver.find('td', attrs = {'class' : "right"}).findNext( attrs = {'class' : "right"}).text

    high_home_odd, high_away_odd = 0, 0
    if high:    
        #highest home winning odd
        high_home_odd = high.find('td', attrs = {'class' : "right"}).text
        #highest away winning odd
        high_away_odd = high.find('td', attrs = {'class' : "right"}).findNext( attrs = {'class' : "right"}).text

    ft_hp, ft_ap, ftd_hp, ftd_ap = 0, 0, 0, 0
    #if ft_result:
    if ft_result:
        ft_hp = re.findall('[0-9]+', ft_result.find('strong').text)[0]
        ft_ap = re.findall('[0-9]+', ft_result.find('strong').text)[1]
        if len(re.findall('[0-9]+', ft_result.find('strong').text)) > 2:
            ftd_hp = re.findall('[0-9]+', ft_result.find('strong').text)[2]
            ftd_ap = re.findall('[0-9]+', ft_result.find('strong').text)[3]
    
    data = {
    #event names    
    'event_name' : event_names.find('h1').text,
        
    #timestamp
    'timestamp' : event_names.find('p', attrs = {'class' : re.compile('date datet')}).text,
        
    #full-time home points
    'ft_hp' : ft_hp,
        
    #full-time away points
    'ft_ap' : ft_ap, 
        
    #full-time with draw home points
    ftd_hp' : ftd_hp,
        
    #full-time with draw away points
    ftd_ap' : ftd_ap,
        
    #average home winning odd
    aver_home_odd' : aver_home_odd,

    #average away winning odd
    aver_away_odd' : aver_away_odd,

    #highest home winning odd
    high_home_odd' : high_home_odd,
        
    #highest away winning odd
    high_away_odd' : high_away_odd
    }

    driver.quit()
    return data

SyntaxError: EOL while scanning string literal (622212451.py, line 71)

# Main script

In [None]:
# Here we scrape only one season
#url = all_url_seasons[3]
url = 'https://www.oddsportal.com/basketball/usa/nba-2008-2009/results/#/page/26/'
#loading options for the driver 
driver = webdriver.Chrome(options=options)
t1load = time.time()

#selenium and soup objects
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

t2load = time.time()
print(t2load - t1load)

event_data = []
i = 0
ii = 0

t1loop = time.time()

while True:
    i = i + 1
    print('page', i, 'started')
    #get urls for each events
    event_urls = []
    for col in soup.find_all('tr', attrs = {'deactivate'}):
        if col.find('td', attrs = {'center bold table-odds table-score'}).text != 'canc.':
            event_urls.append(domain_name+col.find('a').attrs['href'])
    #print(event_urls)
    
    #store previous or current page number
    previous_page = soup.find_all( 'span', attrs = {'class' : 'active-page'})[0].text
    
    for event_url in event_urls:
        ii = ii + 1
        #print(ii)
        event_data.append(scrape_data(event_url))
    
    #clicks on next page
    element = driver.find_element(By.PARTIAL_LINK_TEXT, '»')
    driver.execute_script("arguments[0].click();", element)

    #sleep so that the page can load properly
    time.sleep(2)

    #reload soup objects on new page
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    #get new page number
    new_page = soup.find_all('span', attrs = {'class' : 'active-page'})[0].text

    print('page', i, 'finished')
    #if there's no new pages left break
    if previous_page != new_page:
        continue
    else:
        break
      
    driver.quit()
    
t2loop = time.time()
#print(t2loop - t1loop)

16.42725133895874
page 1 started


In [None]:
df = pd.DataFrame(event_data)
df['country'] = 'usa'
df['sport'] = 'basketball'
df['league'] = 'nba'

#df

In [None]:
df.to_csv('usa_basketball_nba_'+url[46:55]+'.csv')