Here are all the functions I use to gather and clean the data I will use in the model.

In [1]:
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Notebook display limits for DataFrame previews: at most 10 rows and
# 50 columns are shown before pandas truncates the output.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 10)

# Command-line flags for Chrome. This object only takes effect when it is
# handed to the driver, e.g. webdriver.Chrome(options=chrome_options).
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

In [3]:
def get_seconds(time):
    """Convert a clock value written as ``"mm:ss"`` into total seconds.

    ``time`` is passed through ``str()`` first, so any object whose string
    form is ``"<minutes>:<seconds>"`` is accepted.  A ``ValueError`` is
    raised when the text does not consist of exactly two integer fields
    separated by a colon.
    """
    minutes, seconds = str(time).split(":")
    return 60 * int(minutes) + int(seconds)

In [4]:
def sort_words(s1, s2):
    """Return the two values joined with ``+``, smaller one first.

    Because the pair is ordered before joining, the result does not depend
    on the order in which the two arguments are passed.
    """
    smaller, larger = sorted([s1, s2])
    return smaller + larger

In [1]:
def clean_possessions(poss_df, team):
    """Normalise a scraped possessions table and order it game by game.

    All columns are addressed by position.  The function (in this order):

    * overwrites column 6 with ``team``;
    * writes a game id into column 7: ``str(column 0)`` followed by the
      alphabetically sorted concatenation of column 6 and column 1;
    * replaces the abbreviations CHH/NJN/VAN/SEA in column 1 with
      CHA/BKN/MEM/OKC;
    * writes into column 8 the ``mm:ss`` clock from column 5 converted to
      seconds, plus an offset chosen by the period number in column 2;
    * converts column 0 to datetimes, sorts the rows of every column-0
      group by column 8 in descending order, drops columns 9 through 15
      and renumbers the index.

    ``poss_df`` is modified in place by the first four steps; the returned
    frame is a new object.  The input must already have at least 16
    columns, because positional assignment cannot create columns.

    Relies on ``get_seconds`` and ``sort_words`` from this file.
    """
    # Old abbreviation -> replacement written back into column 1.
    abbreviation_updates = (
        ('CHH', 'CHA'),
        ('NJN', 'BKN'),
        ('VAN', 'MEM'),
        ('SEA', 'OKC'),
    )
    # Period number -> seconds added to that period's clock (12-minute
    # periods: 2160 = 3 * 720, 1440 = 2 * 720).
    # NOTE(review): period 5 gets the same offset as period 4, so its values
    # overlap the fourth period's range -- confirm this is intended.
    period_offsets = ((1, 2160), (2, 1440), (3, 720), (4, 0), (5, 0))

    poss_df.iloc[:, 6] = team

    for row in range(len(poss_df)):
        # NOTE(review): the game id is built from the opponent abbreviation
        # *before* it is renamed below -- verify both teams in a game end up
        # with the same id.
        matchup = sort_words(str(poss_df.iloc[row, 6]), str(poss_df.iloc[row, 1]))
        poss_df.iloc[row, 7] = str(poss_df.iloc[row, 0]) + matchup

        for old_code, new_code in abbreviation_updates:
            if poss_df.iloc[row, 1] == old_code:
                poss_df.iloc[row, 1] = new_code
                break

        for period, offset in period_offsets:
            if poss_df.iloc[row, 2] == period:
                poss_df.iloc[row, 8] = get_seconds(poss_df.iloc[row, 5]) + offset
                break

    poss_df.iloc[:, 0] = pd.to_datetime(poss_df.iloc[:, 0])

    # One frame per distinct column-0 value, each ordered from the largest
    # to the smallest column-8 value.
    ordered_groups = [
        group.sort_values(group.columns[8], ascending=False)
        for _, group in poss_df.groupby(poss_df.columns[0])
    ]

    cleaned_poss = pd.concat(ordered_groups, axis=0)
    unwanted_labels = [cleaned_poss.columns[position] for position in range(9, 16)]
    cleaned_poss.drop(unwanted_labels, axis=1, inplace=True)
    cleaned_poss.reset_index(inplace=True, drop=True)

    return cleaned_poss

In [6]:
# Absolute XPaths into the pbpstats possession-finder page.  They mirror the
# page layout at the time the scraper was written and break if it changes.
_QUARTER_MENU_XPATH = "/html/body/div/div/main/div[1]/div[5]/div"
_OFF_DEF_MENU_XPATH = "/html/body/div/div/main/div[1]/div[4]/div"
_START_RANGE_INPUT_XPATH = "/html/body/div/div/main/div[3]/input[{}]"
_GET_POSSESSIONS_BUTTON_XPATH = "/html/body/div/div/main/div[7]/button"
_TABLE_SELECT_XPATH = "/html/body/div/div/main/div[9]/div/div/div[3]/div[1]/form/select"
# Seconds to wait for the results table controls before giving up.
_TABLE_WAIT_SECONDS = 25


def _click(driver, xpath):
    """Locate the element at ``xpath`` and click it."""
    driver.find_element(By.XPATH, xpath).click()


def _wait_for_table(driver):
    """Block until the select control under the results table is in the DOM."""
    WebDriverWait(driver, _TABLE_WAIT_SECONDS).until(
        EC.presence_of_element_located((By.XPATH, _TABLE_SELECT_XPATH)))


def _set_start_range(driver, start, end):
    """Type ``start`` and ``end`` into the two possession-start range inputs."""
    for position, value in ((1, start), (2, end)):
        xpath = _START_RANGE_INPUT_XPATH.format(position)
        # The element is looked up again before every action, as the original
        # code did, so a re-rendered input never leaves a stale handle.
        driver.find_element(By.XPATH, xpath).click()
        driver.find_element(By.XPATH, xpath).clear()
        driver.find_element(By.XPATH, xpath).send_keys(str(value))


def _read_possession_table(driver, off_def):
    """Show every row of the results table and return it as a DataFrame."""
    # Scroll to bottom so the select control can be clicked.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Select All (the sixth option of the select control).
    _click(driver, _TABLE_SELECT_XPATH)
    _click(driver, _TABLE_SELECT_XPATH + "/option[6]")
    # The possessions table is the third element with id 'vgt-table'.
    tables = driver.find_elements(By.ID, 'vgt-table')
    html = tables[2].get_attribute('outerHTML')
    # read_html is given a file-like object: passing literal HTML text
    # directly is deprecated in recent pandas releases.
    possessions = pd.read_html(StringIO(html))[0].dropna(axis=0, thresh=4)
    possessions = possessions.assign(OffenseDefense=off_def)
    # Scroll back to top so the filter controls are reachable again.
    driver.execute_script("window.scrollTo(0, 0);")
    return possessions


def _scrape_quarter(driver, url, quarter, possession_arrows, time_range):
    """Collect one quarter's possession tables for every offense/defense choice."""
    driver.get(url)
    driver.maximize_window()
    # Choose the quarter from its drop-down.
    _click(driver, _QUARTER_MENU_XPATH + "/div[1]")
    _click(driver, _QUARTER_MENU_XPATH + "/div[3]/ul/li[" + quarter + "]")

    collected = []
    for possession_arrow in possession_arrows:
        # Choose offense or defense.
        _click(driver, _OFF_DEF_MENU_XPATH + "/div[1]")
        driver.implicitly_wait(3)
        _click(driver, _OFF_DEF_MENU_XPATH + "/div[3]/ul/li[" + possession_arrow + "]/span")
        off_def = int(possession_arrow)

        if quarter == "5":
            # Quarter 5 is fetched with a single request and no time filter.
            # If the button is missing, this offense/defense choice is
            # skipped, exactly as the original loop did.
            try:
                _click(driver, _GET_POSSESSIONS_BUTTON_XPATH)
            except NoSuchElementException:
                continue
            _wait_for_table(driver)
            collected.append(_read_possession_table(driver, off_def))
            continue

        for position, (start, end) in enumerate(time_range):
            if position == 0:
                # When the page first loads it shows a table covering the
                # entire quarter.  Wait for that table (requesting it if it
                # never shows up) so it is not mistaken for the first
                # filtered result.
                try:
                    _wait_for_table(driver)
                except TimeoutException:
                    _click(driver, _GET_POSSESSIONS_BUTTON_XPATH)
                    _wait_for_table(driver)

            _set_start_range(driver, start, end)
            # Retrieve possessions table.
            _click(driver, _GET_POSSESSIONS_BUTTON_XPATH)
            # NOTE(review): this only waits for the select control to exist;
            # if the previous table is still displayed the wait returns
            # immediately -- confirm the new rows are loaded by then.
            _wait_for_table(driver)
            collected.append(_read_possession_table(driver, off_def))

    return collected


def get_scoring_possessions(season, team_id, team):
    """Scrape one team's regular-season possessions from pbpstats.com.

    The possession finder is queried separately for each of the five
    quarter choices, for both offense/defense choices, and (except for
    quarter 5) for six start-time windows, because the table will only
    display a maximum of 500 elements.  Every table gets an
    ``OffenseDefense`` column holding 1 or 2 (the menu entry used).  All
    tables are concatenated and passed to ``clean_possessions``.

    Args:
        season: Season identifier inserted into the ``Season`` URL parameter.
        team_id: Team identifier inserted into the ``TeamId`` URL parameter.
        team: Team label handed to ``clean_possessions``.

    Returns:
        The DataFrame returned by ``clean_possessions``.

    Raises:
        TimeoutException: a results table did not appear within 25 seconds.
        ValueError: no table was collected at all (raised by ``pd.concat``).
    """
    quarters = ["1", "2", "3", "4", "5"]
    time_range = [[720, 600], [599, 480], [479, 360], [359, 240], [239, 120], [119, 0]]
    possession_arrows = ["1", "2"]
    url = ("https://www.pbpstats.com/possession-finder/nba?TeamId=" + str(team_id)
           + "&Season=" + str(season)
           + "&SeasonType=Regular%2BSeason&OffDef=Offense&StartType=All")

    # Resolve the chromedriver binary once instead of once per quarter.
    driver_path = ChromeDriverManager().install()

    all_poss = []
    for quarter in quarters:
        # I have to start a new browser otherwise Google Chrome crashes midway
        # into getting 2nd quarter data.
        driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
        try:
            all_poss.extend(
                _scrape_quarter(driver, url, quarter, possession_arrows, time_range))
        finally:
            # Always close the browser, including when a wait times out, so
            # no Chrome process is left behind.
            driver.quit()

    # Concatenate once, after all quarters, rather than on every iteration.
    poss_df = pd.concat(all_poss, axis=0)

    return clean_possessions(poss_df, team)

In [7]:
def games_matrix(data):
    """Split ``data`` into one NumPy array per game.

    Rows are grouped by the values in column 7 (addressed by position).
    For every group a 2-D array is built from columns 3 and 8, in that
    order.

    Args:
        data: DataFrame with at least nine columns.  Presumably the output
            of ``clean_possessions``, where column 7 is the game id and
            column 8 the seconds remaining -- verify against the caller.

    Returns:
        A list of arrays of shape ``(rows_in_game, 2)``, ordered by the
        sorted group key (the ``groupby`` default).  The list is empty
        when ``data`` has no rows.
    """
    game_id_column = data.columns[7]
    value_columns = [data.columns[3], data.columns[8]]

    # The original built this list but never returned it, so callers always
    # received None.
    return [
        game[value_columns].to_numpy()
        for _, game in data.groupby(game_id_column)
    ]

In [1]:
def get_season(year, team_ids, teams):
    """Scrape possessions for every team of one season.

    ``get_scoring_possessions`` is called once per entry of ``team_ids``,
    paired by position with the entry of ``teams``.  A team whose scrape
    raises ``TimeoutException`` is skipped without any record, so the
    returned list can be shorter than ``team_ids`` and positions are not
    guaranteed to line up with it.

    Returns a list with one cleaned DataFrame per successfully scraped team.
    """
    collected = []

    for position, team_id in enumerate(team_ids):
        try:
            team_possessions = get_scoring_possessions(year, team_id, teams[position])
        except TimeoutException:
            # The page never produced a table for this team; move on.
            continue
        collected.append(team_possessions)

    return collected

In [3]:
def split_by_team(model_data, teams):
    """Return one sub-frame of ``model_data`` per entry of ``teams``.

    Each sub-frame holds the rows whose ``"team_id"`` column equals the
    corresponding entry; the output list has the same length and order as
    ``teams``.  An entry with no matching rows yields an empty frame.
    """
    return [model_data[model_data["team_id"] == team] for team in teams]