In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver

import string
import datetime
import import_ipynb
import NBAInjuryAnalysis

importing Jupyter notebook from NBAInjuryAnalysis.ipynb


In [2]:
# global variables
# Restored from IPython's %store database; they must have been saved with
# `%store <name>` in an earlier session before this cell can run.
# As used further down in this notebook:
#   playersToIds - looked up by player name in getPlayerId
#   idsToPlayers - looked up by player id in findSharedCourtTimes
#   datesDict    - season label (e.g. "2018-19") -> sequence whose first two
#                  items are parsed as '%a, %b %d, %Y' start/end date strings
#   statsNBAMap  - looked up by team name to get the id used in stats.nba.com URLs
# NOTE: keep comments on their own lines; text after a line magic is passed
# to the magic as arguments.
%store -r playersToIds
%store -r idsToPlayers
%store -r datesDict
%store -r statsNBAMap

In [3]:
###### WHO HAS A PLAYER PLAYED WITH, AND FOR HOW LONG? #########
def findTotalSharedTimes(playerId):
    """Aggregate the court time a player has shared with each teammate.

    Fetches the player's main page, follows every link inside the
    ``inner_nav`` element whose href contains ``'lineups'``, and hands the
    resulting URL to ``findSharedCourtTimes``. The ``"minutes:seconds"``
    strings that come back are converted to ``datetime.timedelta`` values.

    Args:
        playerId: player id accepted by ``getMainPlayerURL``.

    Returns:
        A tuple ``(sharedTimes, individualYears)``. ``sharedTimes`` maps each
        teammate key to the summed ``timedelta`` over all seasons;
        ``individualYears`` maps a season label (e.g. ``"2018-19"``) to that
        season's ``{teammate: timedelta}`` dict.
    """
    url = getMainPlayerURL(playerId)

    # BeautifulSoup reads the whole response while parsing, so the connection
    # can be released right away instead of being left open.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features="lxml")

    sharedTimes = {}
    individualYears = {}

    seenYears = set()
    lineups = soup.find(id='inner_nav').findAll('a')
    for row in lineups:
        suffix = row['href']
        if 'lineups' not in suffix:
            continue
        if suffix in seenYears: # repeats in HTML
            break
        seenYears.add(suffix)

        yearUrl = "https://www.basketball-reference.com" + suffix
        yearTeammates = findSharedCourtTimes(playerId, yearUrl)

        # the last path component of the link is the season's ending year
        year = suffix[suffix.rindex('/')+1:]
        season = str(int(year)-1) + "-" + year[2:]

        for teammate in yearTeammates:
            timeStr = yearTeammates[teammate]
            colon = timeStr.index(':')
            timeShared = datetime.timedelta(seconds=60*int(timeStr[:colon]) + int(timeStr[colon+1:]))
            # replace the raw string with the parsed duration (same keys, so
            # mutating while iterating is safe)
            yearTeammates[teammate] = timeShared
            sharedTimes[teammate] = sharedTimes.get(teammate, datetime.timedelta()) + timeShared
        individualYears[season] = yearTeammates

    return sharedTimes, individualYears

def findSharedCourtTimes(playerId, url):
    """Read the two-man lineup table on a season lineups page.

    Opens ``url`` in headless Chrome (the table is generated by JavaScript)
    and, for each data row of the ``div_lineups-2-man`` element, records the
    other player of the pair together with the row's minutes value.

    Args:
        playerId: id of the player whose page is being read; used to decide
            which half of each ``"idA:idB"`` pair is the teammate.
        url: full URL of the season lineups page.

    Returns:
        Dict mapping ``idsToPlayers[teammateId]`` to the ``csk`` attribute of
        the row's third cell (a ``"minutes:seconds"`` string as consumed by
        ``findTotalSharedTimes``). Empty when the page has no two-man table.

    Raises:
        KeyError: if a teammate id is not present in ``idsToPlayers``.
    """
    options = setupHeadlessChrome()

    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)
        sharedTimes = {}

        html = driver.page_source
        if "div_lineups-2-man" not in html: # possible only played in playoffs - John Holland
            return sharedTimes
        twoMan = driver.find_element_by_id("div_lineups-2-man")
        table = twoMan.find_elements_by_tag_name('tr')

        # skip the two leading rows and the final row of the table
        for row in table[2:-1]:
            elems = row.find_elements_by_tag_name('td')
            players = elems[0].get_attribute('csk')
            minutes = elems[2].get_attribute('csk')

            colon = players.index(":")
            firstTeammate = players[:colon]
            secondTeammate = players[colon+1:]

            teammate = firstTeammate if playerId == secondTeammate else secondTeammate
            sharedTimes[idsToPlayers[teammate]] = minutes
        return sharedTimes
    finally:
        # always shut the browser down; otherwise every call leaves a
        # Chrome/chromedriver process running
        driver.quit()

################# Find all dates that a player has been traded, including in-season trades #######################
def getTrades(playerId):
    """Find the dates on which a player was traded.

    Loads the player's main page in headless Chrome, walks the ``span``
    elements of ``div_transactions`` whose class is ``' nba-transactions'``,
    and keeps those whose description mentions ``trade``/``Trade``.

    Args:
        playerId: player id accepted by ``getMainPlayerURL``.

    Returns:
        A tuple ``(tradeDates, inSeasonTradeDates)`` of lists of
        ``datetime.datetime``. The second list is the subset of the first
        that falls inside a season window found in ``datesDict``. Both are
        empty when the page has no transactions section.
    """
    url = getMainPlayerURL(playerId)
    seasonDateFormat = '%a, %b %d, %Y'

    tradeDates = []
    inSeasonTradeDates = []

    options = setupHeadlessChrome()
    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)

        html = driver.page_source
        if 'div_transactions' not in html: # basketball reference automated no transaction - Pero Antić
            return [], []
        transactions = driver.find_element_by_id("div_transactions")
        transList = transactions.find_elements_by_tag_name('span')

        for transaction in transList:
            if transaction.get_attribute('class') != ' nba-transactions':
                continue
            text = transaction.get_attribute('innerText')
            # text looks like "<date>: <description>"; skip anything that is not
            dateStr, sep, description = text.partition(':')
            if not sep:
                continue
            if 'trade' not in description and 'Trade' not in description:
                # not yet strong enough, need to also add when
                # traded before draft occurred
                continue

            date = datetime.datetime.strptime(dateStr, '%B %d, %Y')
            tradeDates.append(date)

            # a calendar year overlaps two seasons; the trade is in-season if
            # it falls inside either window that datesDict knows about
            for season in getPotentialTradeYears(date.year):
                if season not in datesDict:
                    continue
                seasonDates = datesDict[season]
                start = datetime.datetime.strptime(seasonDates[0], seasonDateFormat)
                end = datetime.datetime.strptime(seasonDates[1], seasonDateFormat)
                if start <= date <= end:
                    inSeasonTradeDates.append(date)
                    break
    finally:
        # always shut the browser down so no Chrome process is leaked
        driver.quit()

    return tradeDates, inSeasonTradeDates
    
def getPotentialTradeYears(year):
    """Return the two season labels a calendar year can belong to.

    Args:
        year: calendar year as an int, e.g. ``2019``.

    Returns:
        A two-item list ``[seasonEndingThisYear, seasonStartingThisYear]``,
        e.g. ``["2018-19", "2019-20"]`` for 2019.
    """
    endingSeason = "{}-{}".format(year - 1, str(year)[2:])
    startingSeason = "{}-{}".format(year, str(year + 1)[2:])
    if startingSeason == "2020-21":
        # "2020-21" is folded back onto "2019-20"
        # NOTE(review): presumably because that season is not in datesDict - confirm
        startingSeason = "2019-20"
    return [endingSeason, startingSeason]

def midseasonTradeTeams(trades, teams):
    """Pair each midseason trade date with the two teams of that season.

    Simplifying assumptions: only seasons in which the player appeared for
    exactly two teams are considered (i.e. one trade per season), and the
    player is assumed not to have been waived, picked up by another team and
    then traded.

    Args:
        trades: iterable of trade dates accepted by ``getYear``.
        teams: dict mapping season label -> list of team names.

    Returns:
        Dict mapping trade date -> ``[firstTeam, secondTeam]``.
    """
    tradesToTeams = {}
    for trade in trades:
        season = getYear(trade)
        # a missing season occurs when a team trades the player before they
        # actually draft the player - Malik Beasley
        if season in teams:
            seasonTeams = teams[season]
            if len(seasonTeams) == 2:
                tradesToTeams[trade] = [seasonTeams[0], seasonTeams[1]]
    return tradesToTeams
        
def _teamAdvancedStatsUrl(teamId, season, dateFrom, dateTo):
    """Build the stats.nba.com advanced-stats URL for a team and date range."""
    url = "https://stats.nba.com/team/{}" \
        + "/advanced/?Season={}&SeasonType=Regular%20" \
        + "Season&DateFrom={}%2F{}%2F{}&DateTo={}%2F{}%2F{}"
    return url.format(teamId, season, dateFrom.month, dateFrom.day, dateFrom.year,
                      dateTo.month, dateTo.day, dateTo.year)

def analyzeMidseasonTrades(trades):
    """Look up team ratings around each midseason trade.

    For every trade, two stats.nba.com URLs are built: the first team over
    the 14 days up to the trade date, and the second team over the 14 days
    starting at the trade date. Each URL is passed to
    ``NBAInjuryAnalysis.getAbsentNetRtg``.

    Args:
        trades: dict mapping trade date (``datetime.datetime``) to a
            two-item sequence ``[teamBefore, teamAfter]`` whose entries are
            keys of ``statsNBAMap``.

    Returns:
        Dict mapping trade date -> ``(beforeNetrtg, afterNetrtg)``. Trades in
        seasons starting before 1996 are left out.
    """
    window = datetime.timedelta(days=14)

    ratings = {}
    for trade, teams in trades.items():
        year = getYear(trade)

        dash = year.index('-')
        yearDate = year[:dash]
        if int(yearDate) < 1996: # stats.nba limitation
            # skip only this trade; returning here would silently drop every
            # later trade in the dict as well
            continue

        # note - midseason trades don't happen in playoffs
        firstTeamId = statsNBAMap[teams[0]]
        beforeUrl = _teamAdvancedStatsUrl(firstTeamId, year, trade - window, trade)
        beforeNetrtg = NBAInjuryAnalysis.getAbsentNetRtg(beforeUrl)

        secondTeamId = statsNBAMap[teams[1]]
        afterUrl = _teamAdvancedStatsUrl(secondTeamId, year, trade, trade + window)
        afterNetrtg = NBAInjuryAnalysis.getAbsentNetRtg(afterUrl)

        ratings[trade] = (beforeNetrtg, afterNetrtg)
    return ratings

######## FIND ALL TEAMS AN NBA PLAYER HAS PLAYED FOR ##########
def getTeams(playerId):
    """List the teams a player appeared for in each season.

    Parses the table rows of the player's main page, stopping at the first
    row whose header cell reads ``"Career"``.

    Args:
        playerId: player id accepted by ``getMainPlayerURL``.

    Returns:
        Dict mapping the season text of the row's first link to a list of
        team names, in page order. Rows whose team cell is ``"TOT"`` are
        left out.
    """
    url = getMainPlayerURL(playerId)

    # BeautifulSoup reads the whole response while parsing, so the connection
    # can be closed as soon as the soup exists.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features="lxml")

    seasons = soup.findAll('tr')
    teams = {}
    for season in seasons[1:]: # first row is skipped
        if not season.th: # did not play this season
            continue
        if season.th.getText() == "Career": # career table starts
            break
        year = season.find('a').getText()
        teamName = season.findAll('td')[1].getText()
        # "TOT" marks a season in which the player was traded midseason;
        # NOTE(review): presumably the combined-totals row - confirm
        if teamName != "TOT":
            teams.setdefault(year, []).append(teamName)
    return teams

######## FIND FRACTION OF POSITIONS PLAYER HAS PLAYED IN CAREER ##########
def findPositionRatios(playerId):
    """Collect per-season and career position ratios for a player.

    Loads the player's main page in headless Chrome and reads the rows of
    the ``div_pbp`` element, delegating each row to ``getYearlyRatio``.

    Args:
        playerId: player id accepted by ``getMainPlayerURL``.

    Returns:
        Dict mapping the season text of each row's first link (and
        ``'Career'`` for the career row) to the dict produced by
        ``getYearlyRatio``. Empty when the page has no ``div_pbp`` element.
    """
    url = getMainPlayerURL(playerId)

    options = setupHeadlessChrome()
    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)

        pbps = driver.find_elements_by_id("div_pbp")
        if len(pbps) == 0:
            return {}
        table = pbps[0].find_elements_by_tag_name('tr')

        ratios = {}
        for season in table[2:]: # first two rows set up table
            stats = season.find_elements_by_tag_name('td')
            if not stats: # row without data cells; nothing to read
                continue
            ratio = getYearlyRatio(season, stats)
            if stats[0].text == "": # stats[0] is age, so an empty string means "Career" row
                ratios['Career'] = ratio
                return ratios
            year = season.find_elements_by_tag_name('a')[0].text
            ratios[year] = ratio
        return ratios
    finally:
        # always shut the browser down so no Chrome process is leaked
        driver.quit()

def getYearlyRatio(season, stats):
    """Turn one table row's percentage cells into fractions.

    Args:
        season: the row element; not used here.
        stats: iterable of cell objects exposing ``get_attribute`` and
            ``text``.

    Returns:
        Dict keyed by the last character of each cell's ``data-stat`` name
        (only for names containing ``'pct'``), with the cell text converted
        from ``"NN%"`` to a fraction; blank cells become ``0.0``.
    """
    ratio = {}
    for cell in stats:
        statName = cell.get_attribute('data-stat')
        if 'pct' not in statName:
            continue
        cellText = cell.text
        if cellText == "":
            fraction = 0.0
        else:
            fraction = float(cellText.strip('%'))/100
        ratio[statName[-1]] = fraction
    return ratio
            
        
################# Find +/- stats for a player's individual playing years, and career ######################
def getPlusMinus(playerId):
    """Collect per-season and career +/- figures for a player.

    Loads the player's main page in headless Chrome and reads the rows of
    the ``div_pbp`` element.

    Args:
        playerId: player id accepted by ``getMainPlayerURL``.

    Returns:
        Dict mapping the season text of each row's link (and ``"Career"``
        for the row whose first cell is blank) to ``[onCourt, onOff]``, the
        ``innerText`` of cells 11 and 12. Empty when the page has no
        ``div_pbp`` element. A dict is returned even when no career row is
        encountered.
    """
    url = getMainPlayerURL(playerId)
    options = setupHeadlessChrome()

    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)

        pbps = driver.find_elements_by_id("div_pbp")
        if len(pbps) == 0:
            return {}
        table = pbps[0].find_elements_by_tag_name('tr')

        plusMinusDict = {}
        for season in table[2:]: # first two rows set up table
            stats = season.find_elements_by_tag_name('td')
            if not stats: # row without data cells; nothing to read
                continue

            # a blank first cell marks the career row, which ends the table
            isCareerRow = stats[0].text == ""
            year = "Career" if isCareerRow else season.find_element_by_tag_name('a').text
            onCourt = stats[11].get_attribute('innerText')
            onOff = stats[12].get_attribute('innerText')
            plusMinusDict[year] = [onCourt, onOff]

            if isCareerRow:
                return plusMinusDict
        # no career row seen: still hand back what was collected rather
        # than falling off the end and returning None
        return plusMinusDict
    finally:
        # always shut the browser down so no Chrome process is leaked
        driver.quit()

############### For a player, find all streaks that player has gone on, for both points and assists #################
def findStreaks(playerId):
    """Find a player's scoring and assist streaks, season by season.

    Walks the season rows of the player's main page (stopping at the
    ``"Career"`` row) and, for each one, asks ``getStreakData`` for streaks
    of at least 20 points and of at least 5 assists.

    Args:
        playerId: player id accepted by ``getMainPlayerURL``.

    Returns:
        A tuple ``(ptsStreakDict, astStreakDict)``; each maps a season label
        (e.g. ``"2018-19"``) to the list returned by ``getStreakData``. When
        a season has several rows, the last row processed wins.
    """
    url = getMainPlayerURL(playerId)

    # BeautifulSoup reads the whole response while parsing, so the connection
    # can be closed as soon as the soup exists.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features="lxml")
    seasons = soup.findAll('tr')

    ptsStreakDict = {}
    astStreakDict = {}
    for season in seasons[1:]: # first row is skipped
        if not season.th: # did not play this season
            continue
        if season.th.getText() == "Career": # career table starts
            break

        # the row id has the form "<prefix>.<year>"; keep the part after the dot
        yearStr = season['id']
        dot = yearStr.index('.')
        year = yearStr[dot+1:]
        teamId = season.findAll('a')[1].text
        teamId = streakFindingTeam(teamId)

        if teamId == 'NBA':
            continue

        yearFull = str(int(year)-1) + "-" + year[2:]

        ptsStreaks = getStreakData(playerId, year, teamId, True, 20) # streaks over 20 points
        ptsStreakDict[yearFull] = ptsStreaks

        astStreaks = getStreakData(playerId, year, teamId, False, 5) # streaks over 5 assists
        astStreakDict[yearFull] = astStreaks
    return ptsStreakDict, astStreakDict
        
def getStreakData(playerId, year, teamId, isPts, value):
    """Query the streak finder for one team-season and keep one player's rows.

    Args:
        playerId: id compared against each row's ``data-append-csv`` attribute.
        year: season year, used for both ``year_min`` and ``year_max``.
        teamId: team id placed in the ``team_id`` query parameter.
        isPts: ``True`` to search on ``pts``, ``False`` to search on ``ast``.
        value: threshold for the ``>=`` comparison (``c1val``).

    Returns:
        List of ``[startDate, endDate, streakLength]`` string triples, one
        per matching row, in page order.
    """
    url = "https://www.basketball-reference.com/play-index/pstreak.cgi?" \
        + "request=1&year_min={}&year_max={}&team_id={}&" \
        + "season_start=1&season_end=-1&c1stat={}&c1comp=ge&" \
        + "c1val={}"
    url = url.format(year, year, teamId, 'pts' if isPts else 'ast', value)

    # BeautifulSoup reads the whole response while parsing, so the connection
    # can be closed as soon as the soup exists.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, features="lxml")

    playerStreaks = []
    streaks = soup.findAll('tr')
    for streak in streaks[1:]: # first row is skipped
        cells = streak.findAll('td')
        if not cells: # row without data cells
            continue
        # rows lacking the attribute simply never match the player
        if cells[0].get('data-append-csv') != playerId:
            continue
        links = streak.findAll('a')
        startDate = links[1].text
        endDate = links[2].text
        streakLength = cells[-1].text

        playerStreaks.append([startDate, endDate, streakLength])
    return playerStreaks

def getYear(date):
    """Return the season label whose date window contains ``date``.

    The two candidate seasons for ``date.year`` come from
    ``getPotentialTradeYears``; each one present in ``datesDict`` is checked
    in order, and the first whose ``[start, end]`` window (inclusive)
    contains ``date`` wins.

    Args:
        date: a ``datetime.datetime``.

    Returns:
        Season label such as ``"2018-19"``.

    Raises:
        ValueError: if no candidate season known to ``datesDict`` contains
            ``date`` (previously this surfaced as an ``UnboundLocalError``
            or ``KeyError``).
    """
    dateFormat = '%a, %b %d, %Y'
    for season in getPotentialTradeYears(date.year):
        # a season missing from datesDict cannot match; skip it rather than
        # fail, mirroring the membership check done in getTrades
        if season not in datesDict:
            continue
        seasonDates = datesDict[season]
        start = datetime.datetime.strptime(seasonDates[0], dateFormat)
        end = datetime.datetime.strptime(seasonDates[1], dateFormat)
        if start <= date <= end:
            return season

    raise ValueError("no known season contains date {}".format(date))

def getPlayerId(playerName):
    """Look up a player's id by name in ``playersToIds``.

    If the full name is not a key, everything from the last space onwards is
    dropped and the shortened name is looked up instead.

    Args:
        playerName: player name string.

    Returns:
        The value stored in ``playersToIds`` for the (possibly shortened) name.
    """
    if playerName in playersToIds:
        return playersToIds[playerName]

    lastSpace = playerName.rindex(' ')
    return playersToIds[playerName[:lastSpace]]

######## Parse Basketball Reference URL for Player's Main Page ##########
def getMainPlayerURL(playerId):
    """Build the Basketball Reference main-page URL for a player id.

    The path is ``/players/<first character of id>/<id>.html``.
    """
    template = "https://www.basketball-reference.com/players/{initial}/{pid}.html"
    return template.format(initial=playerId[0], pid=playerId)

# Helper for team exceptions, some teams have inconsistent team IDs
def streakFindingTeam(team):
    """Map a team id to the one used for streak lookups.

    ``'BRK'`` becomes ``'NJN'``; every other value is returned unchanged.
    """
    return 'NJN' if team == 'BRK' else team

def setupHeadlessChrome():
    """Create the ChromeOptions used for headless scraping.

    Returns:
        A ``webdriver.ChromeOptions`` with ``--headless`` set and a ``prefs``
        experimental option attached.
    """
    headless = webdriver.ChromeOptions()
    headless.add_argument('--headless') # don't open URL window
    # NOTE(review): the image setting presumably turns image loading off - confirm
    headless.add_experimental_option("prefs", {
        "profile.managed_default_content_settings.images": 2,
        'disk-cache-size': 4096,
    })
    return headless

def chromeOptions():
    """Create ChromeOptions for a windowed (non-headless) browser.

    The window is given zero size and a far off-screen position so that no
    URL window is visibly opened.

    Returns:
        A ``webdriver.ChromeOptions`` carrying the command-line flags below
        and a ``prefs`` experimental option.
    """
    flags = (
        "--incognito",
        '--disable-gpu',
        "--window-size=0,0",
        "--window-position=100000,100000",
        "--disable-popup-blocking",
    )
    browserPrefs = {"profile.managed_default_content_settings.images": 2, 'disk-cache-size': 4096 }

    options = webdriver.ChromeOptions()
    for flag in flags:
        options.add_argument(flag)
    options.add_experimental_option("prefs", browserPrefs)
    return options


In [4]:
def collectPlayerHistory(playerName):
    """Gather every scraped data set for one player into a single dict.

    Args:
        playerName: player name accepted by ``getPlayerId``.

    Returns:
        Dict with the keys ``'TeammateTimes'`` (tuple from
        ``findTotalSharedTimes``), ``'Teams'``, ``'PositionRatios'``,
        ``'Trades'`` (tuple from ``getTrades``), ``'MidseasonTradeRatings'``,
        ``'PlusMinus'``, ``'PointStreaks'`` and ``'AssistStreaks'``.
    """
    playerId = getPlayerId(playerName)

    # run the scrapers first, then assemble the result in one place
    teams = getTeams(playerId)
    trades = getTrades(playerId)
    # trades[1] holds the in-season trade dates
    tradeTeams = midseasonTradeTeams(trades[1], teams)
    tradeRatings = analyzeMidseasonTrades(tradeTeams)
    pointStreaks, assistStreaks = findStreaks(playerId)
    teammateTimes = findTotalSharedTimes(playerId)
    positionRatios = findPositionRatios(playerId)
    plusMinus = getPlusMinus(playerId)

    return {
        'TeammateTimes': teammateTimes,
        'Teams': teams,
        'PositionRatios': positionRatios,
        'Trades': trades,
        'MidseasonTradeRatings': tradeRatings,
        'PlusMinus': plusMinus,
        'PointStreaks': pointStreaks,
        'AssistStreaks': assistStreaks,
    }

In [5]:
# collectPlayerHistory('Ömer Aşık')