## Webdriver Setup

In [5]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import numpy as np

# Lift pandas' display cap on columns so wide DataFrames are shown without "..." truncation.
pd.set_option('display.max_columns', None)

In [6]:
# Start a Chrome session controlled by Selenium; `driver` is reused for every page load below.
# NOTE(review): `Service` and `ChromeDriverManager` are imported above but not used here —
# confirm whether the driver binary was meant to be resolved through them.
driver = webdriver.Chrome()

In [7]:
# Open the calendar page (path=wbball) in the browser session.
driver.get("https://uaasports.info/calendar.aspx?path=wbball")

In [8]:
# Parse the HTML the browser currently holds for the calendar page.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [9]:
# Every <table> on the page that carries the "sidearm-calendar-table" class.
tables = soup.find_all("table", {"class" : "sidearm-calendar-table"})

In [10]:
# Exploratory peek at the first calendar table (raises IndexError if `tables` is empty).
table = tables[0]

# Last <td> of the table's first body row — the same cell my_function inspects below.
td = table.find("tbody").find("tr").find_all("td")[-1]

In [11]:
def my_function(table):
    """Return the absolute URL linked from the last cell of a table's first body row.

    Args:
        table: Parsed table element (BeautifulSoup-style object exposing ``find`` and
            ``find_all``).

    Returns:
        ``"https://uaasports.info"`` followed by the ``href`` of the first ``<a>`` inside
        the first ``<span>`` of that cell. Returns ``""`` when there is nothing to link to:
        no ``<tbody>``, no row, no cells, no ``<span>``, no ``<a>``, or an anchor without
        a non-empty ``href``.
    """
    tbody = table.find("tbody")
    first_row = tbody.find("tr") if tbody is not None else None
    cells = first_row.find_all("td") if first_row is not None else []
    if not cells:
        return ""

    # if there's no span tag
    span = cells[-1].find("span")
    if span is None:
        return ""

    # A span without a usable link is treated like a missing span instead of raising,
    # so one malformed calendar entry cannot abort the whole list comprehension.
    anchor = span.find("a")
    href = anchor.get("href") if anchor is not None else None
    if not href:
        return ""

    return "https://uaasports.info" + href

In [12]:
# Resolve one URL per calendar table, discarding the "" marker my_function returns
# for tables that have no link.
links = [url for url in map(my_function, tables) if url != ""]

## Player Box Score (Demo)

In [13]:
# Demo on the first collected link only (raises IndexError if no links were found).
link = links[0]

In [14]:
# Navigate the same browser session to the selected game page.
driver.get(link)

In [18]:
# Re-parse: `soup` now holds the game page, replacing the calendar page parsed earlier.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [20]:
# First <table> inside the <div id="DataTables_Table_0_wrapper"> element.
# NOTE(review): `find` returns None when that div is absent from the parsed HTML, which would
# make `.find("table")` raise AttributeError — confirm the page has finished rendering first.
table = soup.find("div", {"id" : "DataTables_Table_0_wrapper"}).find("table")

In [21]:
# get column headers: the text of every <th> found in the table's <thead>
th_objects = table.find("thead").find_all("th")
headers = [th.text for th in th_objects]

In [22]:
# get table data: one list of <td> texts per <tr> in the table body
# (row <th> cells are not included here; they are read separately as player names)
table_rows = table.find("tbody").find_all('tr')
table_data = [[td.text for td in tr.find_all("td")] for tr in table_rows]

In [23]:
# get player names: the text of the first <th> in each body row, in the same order as table_data
player_names = [tr.find("th").text for tr in table_rows]

In [24]:
# turn data into dataframe
import pandas as pd

# Column labels are the scraped headers minus "Player"; player names come from the row <th> cells.
stat_columns = [i for i in headers if (i != "Player")]
df = pd.DataFrame(table_data, columns=stat_columns)

# Drop the last scraped row (presumably a totals row — confirm against the page).
# .copy() makes the result an independent frame, so the column assignments below do not
# write into a slice of the original frame (the cause of SettingWithCopyWarning).
df = df.iloc[:-1, :].copy()

df['Player'] = player_names[:-1]
# Keep the second space-separated token of the scraped text, then turn "Last,First" into "First Last".
df['Player'] = df.Player.apply(lambda i : i.split(" ")[1])
df['Player'] = df.Player.apply(lambda i : i.split(",")[1] + " " + i.split(",")[0])

df

Unnamed: 0,##,GS,MIN,FG,3PT,FT,ORB-DRB,REB,PF,A,TO,BLK,STL,PTS,Player
0,10,*,32,5-15,0-1,3-3,1-0,1,0,2,5,0,3,13,Emily Lauer
1,15,*,33,4-17,0-4,3-6,4-8,12,1,0,3,2,0,11,Sarah Santicola
2,14,*,33,3-13,2-5,0-0,0-3,3,0,3,1,0,3,8,Jess Bickart
3,5,*,31,2-6,1-3,2-2,1-2,3,2,0,2,0,0,7,Sofia Genareo
4,25,*,17,0-1,0-1,0-0,0-1,1,2,0,0,1,0,0,Jenna Lutz
5,1,,15,1-4,0-1,0-0,0-1,1,2,1,0,0,0,2,Jasmyn Fisher
6,4,,13,0-0,0-0,2-2,0-2,2,1,3,0,0,1,2,Branygan Bianchin
7,2,,18,0-3,0-2,0-2,0-1,1,4,1,2,0,0,0,Analyse Rios
8,20,,5,0-1,0-1,0-0,0-2,2,0,0,0,0,0,0,Sydney Watko
9,11,,3,0-0,0-0,0-0,0-0,0,0,0,0,0,0,0,Elly Gray


## Play-by-Play (Demo)

In [26]:
# The <section id="play-by-play"> element of the game page.
section_plays = soup.find("section", {"id" : "play-by-play"})

In [27]:
# Every <div> found inside the play-by-play section; each is later parsed as one period.
# NOTE(review): find_all searches all descendants, so nested or wrapper <div>s would be
# returned too — confirm each returned div holds exactly one period's table.
period_divs = section_plays.find_all("div")

In [28]:
def pbp_to_df(indvidual_period_div, quarter):
    """Convert one period's play-by-play table into a DataFrame.

    Args:
        indvidual_period_div: Element whose first <table> holds the period's plays;
            every body row must contain a <th> and <td> cells matching the fixed
            column labels below.
        quarter: Value written to the 'Period' column of every row.

    Returns:
        DataFrame with one row per <tr>: the <td> texts under the fixed labels,
        plus 'Period' (``quarter``) and 'Time' (text of the row's <th>).
    """
    td_labels = ['Score', 'AwayAction', 'AwayScore', 'HomeScore', 'HomeAction', 'Score2', 'Action']

    body_rows = indvidual_period_div.find("table").find("tbody").find_all('tr')

    # Text of every data cell, row by row.
    cell_texts = []
    for row in body_rows:
        cell_texts.append([cell.text for cell in row.find_all("td")])

    period_df = pd.DataFrame(cell_texts, columns=td_labels)
    period_df['Period'] = quarter

    # The row header cell supplies the 'Time' value.
    clock_texts = []
    for row in body_rows:
        clock_texts.append(row.find("th").text)
    period_df['Time'] = clock_texts

    return period_df

In [29]:
# One DataFrame per period <div>, with periods numbered from 1 in document order.
period_dfs = [pbp_to_df(div, i+1) for i, div in enumerate(period_divs)]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each period
# keeps its own 0-based labels, so a label lookup such as plays.loc[0, ...] would match the
# first play of every period instead of a single row.
plays = pd.concat(period_dfs, ignore_index=True)

In [35]:
# Treat "--" as a missing value so it can be forward-filled.
plays = plays.replace("--", np.nan)

plays['Time'] = plays.Time.ffill()

# Seed the very first play with an opening score so the forward-fill below has a start value.
# Assigned by position: if the index still carries repeated per-period labels,
# plays.loc[0, 'Score'] would overwrite the first play of every period.
plays.iloc[0, plays.columns.get_loc('Score')] = "placeholder 0-0"

# Empty strings are missing values too.
plays = plays.replace("", np.nan)

plays['Score'] = plays.Score.ffill()
# Keep only the second space-separated token of the score text (e.g. "0-0").
# NOTE(review): not idempotent — re-running this cell on already-trimmed scores raises IndexError.
plays['Score'] = plays['Score'].apply(lambda score_text : score_text.split(" ")[1])

In [43]:
# Keep only the columns used downstream. .copy() detaches the result from the wider frame,
# so the column assignments in later cells modify this frame directly and do not trigger
# SettingWithCopyWarning.
plays = plays[['Time', 'AwayAction', 'Score', 'HomeAction', 'Period']].copy()

plays

Unnamed: 0,Time,AwayAction,Score,HomeAction,Period
0,09:39,"MISS JUMPER by BICKART,JESS",placeholder 0-0,,1
1,09:39,"REBOUND OFF by SANTICOLA,SARAH",0-0,,1
2,09:34,"MISS LAYUP by SANTICOLA,SARAH(in the paint)",0-0,,1
3,09:34,,0-0,"BLOCK by ARIAS,AVA",1
4,09:34,,0-0,"REBOUND DEF by GIBBS,SAVANNAH",1
...,...,...,...,...,...
109,00:53,,43-77,"REBOUND DEF by PROUTY,ANNA",4
110,00:32,,43-80,"GOOD 3PTR by DELMORE,KAYLA",4
111,00:32,,43-80,"ASSIST by HEYER,ALLIE",4
112,00:08,"MISS 3PTR by WATKO,SYDNEY",43-80,,4


### Play-by-Play Data Manipulation (Part 1)

In [60]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings raised by later cells
# (e.g. SettingWithCopyWarning) — consider a narrower filter.
warnings.filterwarnings('ignore')

In [61]:
# All <h3> headings inside the <section id="box-score"> element.
h3_tags = soup.find("section", {"id" : "box-score"}).find_all("h3")

# Team names: text of the 2nd and 3rd headings with the final space-separated token dropped.
# NOTE(review): presumably that last token is the team's score — confirm against the page.
away_team = " ".join( h3_tags[1].text.split(" ")[:-1] )
home_team = " ".join( h3_tags[2].text.split(" ")[:-1] )

# Same value on every play row.
plays['AwayTeam'] = away_team
plays['HomeTeam'] = home_team

In [62]:
# Raw date text: the first <dd> inside the box-score section's <header>.
# NOTE(review): clean_date below expects "M/D/YYYY" — confirm this <dd> holds the date in that form.
date = soup.find("section", {"id" : "box-score"}).find("header").find("dd").text

def clean_date(string):
    """Convert an "M/D/YYYY" date string to "YYYY-MM-DD".

    Args:
        string: Date text with month, day and year separated by "/". Whitespace
            around the whole string or around any part is ignored.

    Returns:
        "<year>-<month>-<day>" with month and day left-padded to two digits.

    Raises:
        IndexError: If the text has fewer than three "/"-separated parts.
    """
    # Scraped element text can carry surrounding spaces/newlines; strip each part so the
    # zero-padding below looks at the digits only.
    split_string = [part.strip() for part in string.split("/")]

    year = split_string[2]
    month = split_string[0].zfill(2)
    day = split_string[1].zfill(2)

    return year + "-" + month + "-" + day

# Stamp every play row with the game date in "YYYY-MM-DD" form.
plays['Date'] = clean_date(date)

### Play-by-Play Data Manipulation (Part 2)
Want to add the following columns:
- Conference
- Substitution
- Seconds

In [69]:
# Team names treated as UAA members.
# NOTE(review): `isin` matches exactly, so these must be spelled the way the names appear in
# AwayTeam/HomeTeam — confirm against the scraped headings.
uaa_teams = ['Chicago', 'Rochester', 'Carnegie Mellon', 'Case Western', 'Emory', 'NYU', 'Brandeis', 'WashU']

# True only when both the away team and the home team are in uaa_teams.
plays['Conference'] = (plays.AwayTeam.isin(uaa_teams) & plays.HomeTeam.isin(uaa_teams))

In [70]:
# Flag rows where either side's action text contains "SUB".
# na=False: a row with no action on one side holds NaN there, and str.contains would return
# NaN for it; a NaN on the left of `|` can make pandas report False even when the other side
# matched. With na=False both operands are plain boolean masks.
plays['Substitution'] = (
    plays.AwayAction.str.contains("SUB", na=False)
    | plays.HomeAction.str.contains("SUB", na=False)
)

In [73]:
def time_to_seconds(league, period, time):
    """Return the seconds of game time elapsed at a given period and clock reading.

    Args:
        league: "WBB" selects four 10-minute regulation periods; any other value
            selects two 20-minute regulation periods.
        period: 1-based period number; periods after regulation are 5-minute overtimes.
        time: Clock time remaining in the current period, formatted "MM:SS".

    Returns:
        Integer number of seconds elapsed since the start of the game.
    """
    ot_length = 5  # minutes in every overtime period

    if league == "WBB":
        reg = 4
        ot_start = 5
        p_length = 10
    else:
        reg = 2
        p_length = 20
        ot_start = 3

    if period >= ot_start:
        # All of regulation plus any completed overtime periods.
        passed = reg*p_length*60 + ot_length*60*(period - ot_start)
        # The clock in overtime counts down from the overtime length, not the regulation
        # period length; using p_length here would shift every overtime play too late.
        current_length = ot_length*60
    else:
        passed = p_length*60*(period - 1)
        current_length = p_length*60

    clock = time.split(':')
    remaining = int(clock[0])*60 + int(clock[1])

    return passed + (current_length - remaining)

# Tag every row with the "WBB" league code, which time_to_seconds maps to four 10-minute periods.
plays['League'] = "WBB"
# Row-wise conversion of (League, Period, Time) into elapsed game seconds.
plays['Seconds'] = plays.apply(lambda row : time_to_seconds(row.League, row.Period, row.Time), axis=1)

### Play-by-Play Data Manipulation (Part 3)
- Pseudocode for lineup parser

In [None]:
# list_of_lineups = [starters]
    # list_of_lineups is a list of lists, initially of length 1, where the 'starters' object above is a list of 5 player names

# for each row in the play-by-play:
#     if the current play is a substitution play:
#         take the most recent lineup - the last entry in list_of_lineups
#         add the player that's subbing in - at this point, you have a list of 6 player names
#         remove the player that's subbing out - now, you should have a list of 5 player names
#         append the updated lineup - list of 5 player names - to list_of_lineups
#
#         if, after this step, the updated lineup does NOT have 5 player names due to an error on the statistician's part,
#         you will have to manually replace your updated lineup with the accurate lineup;
#         you can do this either by watching game film or by checking the subsequent plays to see which players are in (not always reliable)
#     else - i.e. current play is NOT a substitution play:
#         append the last entry of list_of_lineups to list_of_lineups again (the lineup is unchanged)
#
#         (the length of list_of_lineups should increase by 1 after every single play, even if a substitution did not take place)