## Data Retrieval

In [1]:
# import statements

import io
import os
import time
from datetime import date, timedelta, datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select

import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials

In [2]:
# stathead login info

# Prefer credentials supplied through the environment so real secrets never
# have to be saved in the notebook; the masked literals remain the fallback
# when STATHEAD_USERNAME / STATHEAD_PASSWORD are not set.
username = os.environ.get('STATHEAD_USERNAME', '*******')
password = os.environ.get('STATHEAD_PASSWORD', '*******')

### Functions

In [3]:
# accepts betting pros cookies

def accept_cookies():
    """Dismiss the cookie-consent banner on the current page, if present.

    Looks up the element with id ``onetrust-accept-btn-handler`` on the
    module-level ``driver`` and clicks it through JavaScript. This is a
    best-effort step: any ordinary error (for example, the banner not being
    on the page) is ignored so scraping can continue.
    """
    try:
        element = driver.find_element("id", "onetrust-accept-btn-handler")
        driver.execute_script("arguments[0].click();", element)
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate
        # so a long scraping run can still be stopped.
        pass

In [9]:
# find number of pages of lines
def find_num_pages(retry_delay=1, max_attempts=None):
    """Return the page count shown in the pagination label of the current page.

    Reads the text of the ``pbcs-page-pagination__text`` span from the
    module-level ``driver`` and returns its fourth whitespace-separated token
    as an int (presumably the label looks like "Page 1 of N" -- confirm
    against the live page). If the label cannot be read yet, the function
    sleeps and tries again.

    Args:
        retry_delay: Seconds to sleep between attempts (default 1).
        max_attempts: Maximum number of attempts. ``None`` (the default)
            keeps retrying until the label can be read.

    Returns:
        int: The page count parsed from the label.

    Raises:
        TimeoutError: If ``max_attempts`` is set and every attempt failed.
    """
    xpath = "//span[@class = 'pbcs-page-pagination__text']"
    attempts = 0
    # Retry in a loop rather than by recursion so a slow page cannot exhaust
    # the interpreter's recursion limit.
    while True:
        attempts += 1
        try:
            page_num_info = driver.find_element(by=By.XPATH, value=xpath).text.split()
            return int(page_num_info[3])
        except Exception as error:
            # ``Exception`` (not a bare except) keeps Ctrl-C working while waiting.
            if max_attempts is not None and attempts >= max_attempts:
                raise TimeoutError(
                    "pagination label not readable after {} attempts".format(attempts)
                ) from error
            time.sleep(retry_delay)

# get all player props for given day 
def get_props():
    """Collect the prop line of every player listed for the currently loaded day.

    Walks through all result pages reported by ``find_num_pages()`` on the
    module-level ``driver``. On each page it parses every
    ``primary-info-container`` div, taking the player name from the
    ``pbcs__player-link`` anchor and the line from the third class-less span.

    Returns:
        dict: Player name -> line as a float. If a name appears more than
        once, the last value seen wins.
    """
    num_pages = find_num_pages()

    props = {}
    row_filter = {"id": "primary-info-container"}
    no_props_message = "There are no player props available at this time."
    next_button_xpath = "//button[@class = 'pbcs-page-pagination__button button button--tertiary']"

    for page_num in range(1, num_pages + 1):

        soup = BeautifulSoup(driver.page_source, "html.parser")
        rows = soup.find_all("div", row_filter)

        # Poll until either the rows have rendered or the page states that
        # there are no props for this day.
        while len(rows) == 0 and not soup.find(string=no_props_message):
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            rows = soup.find_all("div", row_filter)

        for row in rows:
            # Search the row tag directly; re-parsing str(row) is unnecessary.
            name = row.find_all("a", {"class": "pbcs__player-link"})[0].get_text().strip()
            line = float(row.find_all("span", {"class": ""})[2].get_text())
            props[name] = line

        # go to next page
        if page_num != num_pages:
            # find_elements(by=..., value=...) replaces the removed
            # find_elements_by_xpath helper; index 1 is the button the
            # original code clicked to advance.
            next_page_button = driver.find_elements(by=By.XPATH, value=next_button_xpath)[1]
            next_page_button.click()

    return props

In [5]:
# login to stathead

def login_stathead(username, password):
    """Fill in and submit the login form on the page ``driver`` is showing.

    Args:
        username: Text typed into the element with id ``username``.
        password: Text typed into the element with id ``password``.
    """
    # Type each credential into its form field, username first.
    for field_id, value in (("username", username), ("password", password)):
        driver.find_element("id", field_id).send_keys(value)

    # Submit with a JavaScript click on the login button.
    submit = driver.find_element("id", "sh-login-button")
    driver.execute_script("arguments[0].click();", submit)

In [6]:
# goes to next page of data, return True if next page exists, False otherwise

def go_to_next_page():
    """Click the "next" link of the current results page.

    Returns:
        bool: True if the ``button2 next`` anchor was found and clicked,
        False if locating or clicking it raised any ``Exception``.
    """
    next_link_xpath = "//a[@class = 'button2 next']"
    try:
        next_link = driver.find_element(by=By.XPATH, value=next_link_xpath)
        driver.execute_script("arguments[0].click();", next_link)
    except Exception:
        # Any failure is reported to the caller as "no next page".
        return False
    else:
        return True

### Scraping BettingPros

In [7]:
# build one BettingPros URL per day, from the first day of the 2022-2023 season up to (but not including) today

def daterange(start_date, end_date):
    """Yield consecutive days from ``start_date`` up to, but excluding, ``end_date``.

    Nothing is yielded when ``end_date`` is not after ``start_date``.
    """
    total_days = int((end_date - start_date).days)
    day_offset = 0
    while day_offset < total_days:
        yield start_date + timedelta(day_offset)
        day_offset += 1

# Scrape window: the first day of the 2022-2023 NBA season up to today
# (daterange excludes the end date, so today itself is not requested).
start_date = datetime.strptime("2022-10-18", "%Y-%m-%d").date()
end_date = date.today()

# One points-prop URL per day in the window, in chronological order.
urls = [
    'https://www.bettingpros.com/nba/picks/prop-bets/bet/points?date={}'.format(day.strftime("%Y-%m-%d"))
    for day in daterange(start_date, end_date)
]

In [10]:
path_to_chromedriver = os.getcwd() + "/chromedriver"
# Selenium 4 takes the chromedriver location through a Service object; the
# executable_path keyword is deprecated and removed in newer releases.
driver = webdriver.Chrome(service=Service(path_to_chromedriver))

# Collect one DataFrame per day and combine them once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
daily_frames = []
try:
    for url in urls:

        driver.get(url)
        accept_cookies()
        props = get_props()

        # Days without any props contribute no rows, so skip them.
        if not props:
            continue

        df = pd.DataFrame(props.items(), columns=['Name', 'Line'])
        # Each URL ends in "date=YYYY-MM-DD"; its last 10 characters are the date.
        df['Date'] = datetime.strptime(url[-10:], "%Y-%m-%d").date()
        daily_frames.append(df)
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally clause guarantees it also runs when scraping fails.
    driver.quit()

if daily_frames:
    betting_df = pd.concat(daily_frames, ignore_index=True)
else:
    betting_df = pd.DataFrame(columns=['Name', 'Line', 'Date'])

  driver = webdriver.Chrome(executable_path=path_to_chromedriver)
  next_page_button = driver.find_elements_by_xpath(xpath)[1]


### Scraping Stathead Player Game Data

In [11]:
url = 'https://stathead.com/users/login.cgi?redirect_uri=https%3A//stathead.com/basketball/'
path_to_chromedriver = os.getcwd() + "/chromedriver"
# Selenium 4 takes the chromedriver location through a Service object; the
# executable_path keyword is deprecated and removed in newer releases.
driver = webdriver.Chrome(service=Service(path_to_chromedriver))

columns = "Rk,Player,PTS,Date,Age,Team,Unnamed: 6,Opp,Result,GS,MP,FG,FGA,FG%,2P,2PA,2P%,3P,3PA,3P%,FT,FTA,FT%,TS%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS.1,GmSc,BPM,Pos.,Player-additional"
expected_columns = columns.split(",")

# One DataFrame per results page; combined once after the loop because
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
page_frames = []
try:
    driver.get(url)

    # login
    login_stathead(username, password)

    # get to Player Game Finder 2022-2023
    url = 'https://stathead.com/basketball/player-game-finder.cgi?request=1&match=player_game&order_by_asc=0&order_by=pts&year_min=2023&year_max=2023&comp_type=reg&comp_id=NBA&season_start=1&season_end=-1'
    driver.get(url)

    next_page_exists = True
    while next_page_exists:

        # get table as csv: hover the dropdown, then click the export button
        a = ActionChains(driver)
        xpath = "//li[@class = 'hasmore']"
        dropdown = driver.find_element(by=By.XPATH, value=xpath)
        a.move_to_element(dropdown).perform()
        xpath = "//button[@tip = 'Export table as <br>suitable for use with Excel']"
        button = driver.find_element(by=By.XPATH, value=xpath)
        a.move_to_element(button).click().perform()

        # get csv as DataFrame; anything before the header row is discarded
        data = driver.find_element("id","csv_stats").text
        data = data[data.index('Rk,Player'):]
        df = pd.read_csv(io.StringIO(data), sep=",")

        # add data to cumulative list of pages
        page_frames.append(df)

        # go to next page
        next_page_exists = go_to_next_page()
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally clause guarantees it also runs when scraping fails.
    driver.quit()

player_game_data_df = pd.concat(page_frames, ignore_index=True)
# Keep the column layout the original append-based code produced: the
# expected columns first (created empty if a page lacked them), then any
# additional columns in order of appearance.
extra_columns = [c for c in player_game_data_df.columns if c not in expected_columns]
player_game_data_df = player_game_data_df.reindex(columns=expected_columns + extra_columns)

# The export leaves the home/away marker column unnamed.
player_game_data_df = player_game_data_df.rename(columns={"Unnamed: 6": "Home_Away"})

  driver = webdriver.Chrome(executable_path=path_to_chromedriver)


### Scraping Stathead Team Game Data

In [12]:
url = 'https://stathead.com/users/login.cgi?redirect_uri=https%3A//stathead.com/basketball/'
path_to_chromedriver = os.getcwd() + "/chromedriver"
# Selenium 4 takes the chromedriver location through a Service object; the
# executable_path keyword is deprecated and removed in newer releases.
driver = webdriver.Chrome(service=Service(path_to_chromedriver))

columns = "Rk,Team,Date,PTS,Unnamed: 4,Opp,Result,MP,FG,FGA,FG%,2P,2PA,2P%,3P,3PA,3P%,FT,FTA,FT%,PTS.1,FG.1,FGA.1,FG%.1,2P.1,2PA.1,2P%.1,3P.1,3PA.1,3P%.1,FT.1,FTA.1,FT%.1,PTS.2"
expected_columns = columns.split(",")

# One DataFrame per results page; combined once after the loop because
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
page_frames = []
try:
    driver.get(url)

    # login
    login_stathead(username, password)

    # get to Team Game Finder 2022-2023
    url = 'https://stathead.com/basketball/team-game-finder.cgi?request=1&match=team_game&order_by_asc=0&order_by=pts&year_min=2023&year_max=2023&comp_type=reg&team_seed_comp=%3D&opp_seed_comp=%3D&best_of=A&comp_id=NBA'
    driver.get(url)

    next_page_exists = True
    while next_page_exists:

        # get table as csv: hover the dropdown, then click the export button
        a = ActionChains(driver)
        xpath = "//li[@class = 'hasmore']"
        dropdown = driver.find_element(by=By.XPATH, value=xpath)
        a.move_to_element(dropdown).perform()
        xpath = "//button[@tip = 'Export table as <br>suitable for use with Excel']"
        button = driver.find_element(by=By.XPATH, value=xpath)
        a.move_to_element(button).click().perform()

        # get csv as DataFrame; anything before the header row is discarded
        data = driver.find_element("id","csv_stats").text
        data = data[data.index('Rk,Team'):]
        df = pd.read_csv(io.StringIO(data), sep=",")

        # add data to cumulative list of pages
        page_frames.append(df)

        # go to next page
        next_page_exists = go_to_next_page()
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally clause guarantees it also runs when scraping fails.
    driver.quit()

team_game_data_df = pd.concat(page_frames, ignore_index=True)
# Keep the column layout the original append-based code produced: the
# expected columns first (created empty if a page lacked them), then any
# additional columns in order of appearance.
extra_columns = [c for c in team_game_data_df.columns if c not in expected_columns]
team_game_data_df = team_game_data_df.reindex(columns=expected_columns + extra_columns)

# The export leaves the home/away marker column unnamed.
team_game_data_df = team_game_data_df.rename(columns={"Unnamed: 4": "Home_Away"})

  driver = webdriver.Chrome(executable_path=path_to_chromedriver)


### DataFrame to Google Sheets

In [13]:
# API scopes requested for the service account.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
]

# Key of the target Google spreadsheet.
spreadsheet_key = '1qZfM3myJ9naCc_AT7U20Zjv5UPMEa20sSTVuG7HlNNc'

# The service-account key file is looked up in the current working directory.
json_file_name = os.getcwd() + "/sportsbetting-376321-f2ada03a7020.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(json_file_name, scope)

# Authorized gspread client built from the same credentials.
gc = gspread.authorize(credentials)

In [14]:
# Upload the scraped BettingPros lines (row index included) to the
# "BettingPros" worksheet, then preview the first five rows.
wks_name = 'BettingPros'
d2g.upload(
    betting_df,
    spreadsheet_key,
    wks_name,
    credentials=credentials,
    row_names=True,
)
betting_df.head(5)

Unnamed: 0,Name,Line,Date
0,Jayson Tatum,22.5,2022-10-18
1,Stephen Curry,23.5,2022-10-18
2,De'Anthony Melton,7.5,2022-10-18
3,Anthony Davis,24.5,2022-10-18
4,Andrew Wiggins,15.5,2022-10-18


In [15]:
# Upload the player game data (row index included) to the "PlayerData"
# worksheet, then preview the first five rows.
wks_name = 'PlayerData'
d2g.upload(
    player_game_data_df,
    spreadsheet_key,
    wks_name,
    credentials=credentials,
    row_names=True,
)
player_game_data_df.head(5)

Unnamed: 0,Rk,Player,PTS,Date,Age,Team,Home_Away,Opp,Result,GS,...,AST,STL,BLK,TOV,PF,PTS.1,GmSc,BPM,Pos.,Player-additional
0,1,Donovan Mitchell,71,2023-01-02,26-117,CLE,,CHI,W 145-134 (OT),1,...,11,0,1,4,3,71,60.8,22.8,G,mitchdo01
1,2,Luka Dončić,60,2022-12-27,23-302,DAL,,NYK,W 126-121 (OT),1,...,10,2,1,4,5,60,56.3,25.0,G-F,doncilu01
2,3,Damian Lillard,60,2023-01-25,32-194,POR,,UTA,W 134-124,1,...,8,3,0,4,1,60,54.0,25.6,G,lillada01
3,4,Joel Embiid,59,2022-11-13,28-242,PHI,,UTA,W 105-98,1,...,8,1,7,5,3,59,54.4,35.2,C,embiijo01
4,5,Devin Booker,58,2022-12-17,26-048,PHO,,NOP,W 118-114,1,...,5,0,0,2,3,58,42.4,16.3,G,bookede01


In [16]:
# Upload the team game data (row index included) to the "TeamData"
# worksheet, then preview the first five rows.
wks_name = 'TeamData'
d2g.upload(
    team_game_data_df,
    spreadsheet_key,
    wks_name,
    credentials=credentials,
    row_names=True,
)
team_game_data_df.head(5)

Unnamed: 0,Rk,Team,Date,PTS,Home_Away,Opp,Result,MP,FG,FGA,...,2P.1,2PA.1,2P%.1,3P.1,3PA.1,3P%.1,FT.1,FTA.1,FT%.1,PTS.2
0,1,SAC,2022-11-15,153,,BRK,W 153-121,240,56,94,...,29,53,0.547,11,28,0.393,30,33,0.909,121
1,2,MIL,2023-01-23,150,@,DET,W 150-130,240,55,98,...,30,55,0.545,13,32,0.406,31,40,0.775,130
2,3,OKC,2023-01-03,150,,BOS,W 150-117,240,58,98,...,28,48,0.583,12,40,0.3,25,31,0.806,117
3,4,MIN,2022-12-18,150,,CHI,W 150-126,240,57,87,...,30,50,0.6,16,38,0.421,18,24,0.75,126
4,5,POR,2023-01-23,147,,SAS,W 147-127,240,55,92,...,34,63,0.54,13,27,0.481,20,26,0.769,127
