In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver

import string
import datetime

In [2]:
# Global lookup tables restored from IPython's %store persistence; each one must
# already have been saved with `%store <name>` in an earlier session/notebook.
# playersToIds: player name -> player id (used to build basketball-reference URLs)
%store -r playersToIds
# idsToPlayers: player id -> player name (inverse lookup used for teammates)
%store -r idsToPlayers
# datesDict: season label such as '2019-20' -> [start, end] date strings
# formatted like '%a, %b %d, %Y' (read by getTrades)
%store -r datesDict
# statsNBAMap: not referenced by any code visible in this section -- TODO confirm it is still needed
%store -r statsNBAMap

In [3]:
###### WHO HAS A PLAYER PLAYED WITH, AND FOR HOW LONG? #########
def findTotalSharedTimes(playerName):
    """Sum the court time a player has shared with every teammate.

    Looks up the player's main page, follows each distinct "lineups" link in
    the page's inner navigation, and accumulates the per-season shared times
    reported by findSharedCourtTimes.

    Args:
        playerName: key into the global playersToIds mapping.

    Returns:
        dict mapping teammate name -> datetime.timedelta of total shared time.

    Raises:
        KeyError: if playerName is not in playersToIds.
        ValueError: if a shared-time string is not of the form "<min>:<sec>".
    """
    playerId = playersToIds[playerName]
    url = getMainPlayerURL(playerId)

    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    sharedTimes = {}
    seenYears = set()
    for link in soup.find(id='inner_nav').findAll('a'):
        # Anchors without an href cannot be lineup links; treat them as non-matching.
        href = link.get('href', '')
        if 'lineups' not in href:
            continue
        if href in seenYears: # repeats in HTML -- stop at the first repeated link
            break
        seenYears.add(href)

        yearUrl = "https://www.basketball-reference.com" + href
        print("loading JavaScript: ", yearUrl)
        yearTeammates = findSharedCourtTimes(playerId, yearUrl)
        for teammate, timeStr in yearTeammates.items():
            # timeStr is "<minutes>:<seconds>"; convert it to a timedelta.
            minutes, seconds = timeStr.split(':', 1)
            timeShared = datetime.timedelta(seconds=60*int(minutes) + int(seconds))
            sharedTimes[teammate] = sharedTimes.get(teammate, datetime.timedelta()) + timeShared
    return sharedTimes

def findSharedCourtTimes(playerId, url):
    """Read one season's 2-man lineup table and return time shared per teammate.

    The page is rendered in headless Chrome because the table is produced by
    JavaScript. The browser is always shut down before returning, so repeated
    calls (one per season) do not leak Chrome processes.

    Args:
        playerId: id of the player whose lineups page is being read.
        url: URL of that player's lineups page for a single season.

    Returns:
        dict mapping teammate name (via the global idsToPlayers) -> the raw
        'csk' value of the row's third cell, which findTotalSharedTimes parses
        as "<minutes>:<seconds>".

    Raises:
        KeyError: if a teammate id is missing from idsToPlayers.
    """
    options = setupHeadlessChrome()
    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)

        twoMan = driver.find_element_by_id("div_lineups-2-man")
        table = twoMan.find_elements_by_tag_name('tr')

        sharedTimes = {}
        # Skip the first two rows and the last row, exactly as the original
        # index range did (presumably header and summary rows -- TODO confirm).
        for row in table[2:-1]:
            elems = row.find_elements_by_tag_name('td')
            players = elems[0].get_attribute('csk')  # "<idA>:<idB>"
            minutes = elems[2].get_attribute('csk')

            firstTeammate, secondTeammate = players.split(":", 1)

            # The pair contains playerId and one other id; keep the other one.
            teammate = firstTeammate if playerId == secondTeammate else secondTeammate
            sharedTimes[idsToPlayers[teammate]] = minutes
        return sharedTimes
    finally:
        # Release the browser even when scraping fails part-way through.
        driver.quit()

################# Find all dates that a player has been traded, including in-season trades #######################
def getTrades(playerName):
    """Find the dates on which a player was traded.

    Renders the player's main page in headless Chrome, reads every <span>
    inside the "div_transactions" element, and keeps entries whose description
    mentions a trade. A trade counts as in-season when its date falls within
    the [start, end] window that the global datesDict stores for either of the
    two seasons that overlap the trade's calendar year.

    Args:
        playerName: key into the global playersToIds mapping.

    Returns:
        (tradeDates, inSeasonTradeDates): two lists of datetime.datetime;
        the second is the subset of the first that fell inside a season.

    Raises:
        KeyError: if playerName is not in playersToIds.
        ValueError: if a transaction's date text does not match '%B %d, %Y'.
    """
    playerId = playersToIds[playerName]
    url = getMainPlayerURL(playerId)

    options = setupHeadlessChrome()
    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)
        transactions = driver.find_element_by_id("div_transactions")
        # Copy the text out while the session is alive; elements are unusable after quit().
        transTexts = [span.get_attribute('innerText')
                      for span in transactions.find_elements_by_tag_name('span')]
    finally:
        # Always release the browser, even if the page layout is unexpected.
        driver.quit()

    seasonDateFormat = '%a, %b %d, %Y'
    tradeDates = []
    inSeasonTradeDates = []
    for text in transTexts:
        # Entries look like "<date>: <description>"; anything else is not a dated transaction.
        if not text or ':' not in text:
            continue
        dateStr, description = text.split(':', 1)
        if 'trade' in description or 'Trade' in description: # not yet strong enough
            date = datetime.datetime.strptime(dateStr.strip(), '%B %d, %Y')
            tradeDates.append(date)

            # A calendar year overlaps two seasons; check each one that datesDict knows about.
            for season in getPotentialTradeYears(date.year):
                if season not in datesDict:
                    continue
                seasonDates = datesDict[season]
                seasonStart = datetime.datetime.strptime(seasonDates[0], seasonDateFormat)
                seasonEnd = datetime.datetime.strptime(seasonDates[1], seasonDateFormat)
                if seasonStart <= date <= seasonEnd:
                    inSeasonTradeDates.append(date)
                    break

    return tradeDates, inSeasonTradeDates
    
def getPotentialTradeYears(year):
    """Return the two season labels that overlap calendar year `year`.

    Labels have the form '<start year>-<last two digits of end year>', e.g.
    2020 -> ['2019-20', '2020-21']: first the season ending in `year`, then
    the season starting in `year`.
    """
    seasonEndingThisYear = "{}-{}".format(year - 1, str(year)[2:])
    seasonStartingThisYear = "{}-{}".format(year, str(year + 1)[2:])
    return [seasonEndingThisYear, seasonStartingThisYear]

######## FIND ALL TEAMS AN NBA PLAYER HAS PLAYED FOR ##########
def getTeams(playerName):
    """List the team of every season row on a player's main page.

    Walks the page's <tr> rows in order (skipping the very first one) and
    stops at the first row whose header cell reads "Career". Teams are
    returned once per season row, so consecutive seasons with the same team
    repeat it.

    Args:
        playerName: key into the global playersToIds mapping.

    Returns:
        list of team abbreviations in page order, excluding "TOT" rows.

    Raises:
        KeyError: if playerName is not in playersToIds.
    """
    playerId = playersToIds[playerName]
    url = getMainPlayerURL(playerId)

    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    teams = []
    for season in soup.findAll('tr')[1:]:
        if not season.th: # did not play this season
            continue
        if season.th.getText() == "Career": # career table starts
            break
        teamName = season.findAll('td')[1].getText()
        if teamName != "TOT": # traded in the middle of a season
            teams.append(teamName)
    return teams

######## FIND FRACTION OF POSITIONS PLAYER HAS PLAYED IN CAREER ##########
def findPositionRatios(playerName):
    """Return the career fraction of time a player spent at each position.

    Renders the player's main page in headless Chrome, finds the "Career" row
    of the "div_pbp" table (the row whose first cell is empty), and converts
    every cell whose 'data-stat' contains 'pct' from a percentage string to a
    fraction. The browser is always shut down before returning.

    Args:
        playerName: key into the global playersToIds mapping.

    Returns:
        dict mapping the last character of each matching 'data-stat' name
        (e.g. '1'..'5') -> float fraction; empty cells count as 0.0.
        None if no Career row is found.

    Raises:
        KeyError: if playerName is not in playersToIds.
    """
    playerId = playersToIds[playerName]
    url = getMainPlayerURL(playerId)

    options = setupHeadlessChrome()
    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)

        pbp = driver.find_element_by_id("div_pbp")
        table = pbp.find_elements_by_tag_name('tr')

        for season in table[2:]: # first two rows set up table
            stats = season.find_elements_by_tag_name('td')
            if stats[0].text == "": # stats[0] is age, so an empty string means "Career" row
                ratio = {}
                for cell in stats:
                    stat = cell.get_attribute('data-stat')
                    if 'pct' in stat:
                        cellText = cell.text
                        ratio[stat[-1]] = float(cellText.strip('%'))/100 if cellText != "" else 0.0
                return ratio
        # No Career row on the page.
        return None
    finally:
        # Release the browser even when scraping fails part-way through.
        driver.quit()
        
################# Find +/- stats for a player's individual playing years, and career ######################
def getPlusMinus(playerName):
    """Collect a player's +/- figures for each season and for the career.

    Renders the player's main page in headless Chrome and reads the rows of
    the "div_pbp" table up to and including the Career row (the row whose
    first cell is empty). The browser is always shut down before returning.

    Args:
        playerName: key into the global playersToIds mapping.

    Returns:
        dict mapping season label (text of the row's first link) or "Career"
        -> [onCourt, onOff], both kept as the strings shown on the page.
        If the table has no Career row, the seasons collected so far are
        returned instead of being discarded.

    Raises:
        KeyError: if playerName is not in playersToIds.
    """
    playerId = playersToIds[playerName]
    url = getMainPlayerURL(playerId)
    options = setupHeadlessChrome()

    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)

        pbp = driver.find_element_by_id("div_pbp")
        table = pbp.find_elements_by_tag_name('tr')

        plusMinusDict = {}
        for season in table[2:]: # first two rows set up table
            stats = season.find_elements_by_tag_name('td')

            # An empty first cell marks the Career row, which ends the table.
            isCareer = stats[0].text == ""
            year = "Career" if isCareer else season.find_element_by_tag_name('a').text
            # Cells 11 and 12 are read as the on-court and on-off values
            # (column positions taken from the original code -- TODO confirm).
            onCourt = stats[11].get_attribute('innerText')
            onOff = stats[12].get_attribute('innerText')
            plusMinusDict[year] = [onCourt, onOff]

            if isCareer:
                break
        return plusMinusDict
    finally:
        # Release the browser even when scraping fails part-way through.
        driver.quit()

############### For a player, find all streaks that player has gone on, for both points and assists #################
def findStreaks(playerName):
    """Find a player's points and assists streaks for every season played.

    Walks the season rows of the player's main page (stopping at the first
    "Career" row) and, for each one, asks getStreakData for streaks of games
    with at least 20 points and streaks of games with at least 5 assists.

    Args:
        playerName: key into the global playersToIds mapping.

    Returns:
        (ptsStreakDict, astStreakDict): two dicts keyed by the year taken
        from the row's id (the part after the '.'), each mapping to the list
        returned by getStreakData. If a year appears in several rows, the
        last row processed wins.

    Raises:
        KeyError: if playerName is not in playersToIds, or a row has no id.
        ValueError: if a row id contains no '.'.
    """
    playerId = playersToIds[playerName]
    url = getMainPlayerURL(playerId)

    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    ptsStreakDict = {}
    astStreakDict = {}
    for season in soup.findAll('tr')[1:]:
        if not season.th: # did not play this season
            continue
        if season.th.getText() == "Career": # career table starts
            break

        # Row ids look like "<table>.<year>"; keep what follows the dot.
        rowId = season['id']
        year = rowId[rowId.index('.') + 1:]
        teamId = streakFindingTeam(season.findAll('a')[1].text)

        # Rows whose second link reads 'NBA' carry no team to query.
        if teamId == 'NBA':
            continue

        ptsStreakDict[year] = getStreakData(playerId, year, teamId, True, 20) # streaks over 20 points
        astStreakDict[year] = getStreakData(playerId, year, teamId, False, 5) # streaks over 5 assists
    return ptsStreakDict, astStreakDict
        
def getStreakData(playerId, year, teamId, isPts, value):
    """Query the streak finder for one team-season and keep one player's streaks.

    Args:
        playerId: player id to match against each row's 'data-append-csv'.
        year: season year, used for both year_min and year_max in the query.
        teamId: team abbreviation as the streak finder expects it.
        isPts: True to search on points ('pts'), False for assists ('ast').
        value: minimum per-game value (the query uses the 'ge' comparison).

    Returns:
        list of [startDate, endDate, streakLength] string triples, in the
        order the rows appear on the page.
    """
    url = ("https://www.basketball-reference.com/play-index/pstreak.cgi?"
           "request=1&year_min={}&year_max={}&team_id={}&"
           "season_start=1&season_end=-1&c1stat={}&c1comp=ge&"
           "c1val={}")
    url = url.format(year, year, teamId, 'pts' if isPts else 'ast', value)
    print(url)

    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response)

    playerStreaks = []
    for streak in soup.findAll('tr')[1:]:
        firstCell = streak.find('td')
        if firstCell is None: # header-style row without data cells
            continue
        # Rows whose first cell lacks the attribute cannot belong to the player.
        if firstCell.get('data-append-csv') != playerId:
            continue

        links = streak.findAll('a')
        startDate = links[1].text
        endDate = links[2].text
        streakLength = streak.findAll('td')[-1].text

        playerStreaks.append([startDate, endDate, streakLength])
    return playerStreaks

######## Parse Basketball Reference URL for Player's Main Page ##########
def getMainPlayerURL(playerId):
    """Build the basketball-reference main-page URL for a player id.

    The path is /players/<first character of the id>/<id>.html.
    """
    firstLetter = playerId[0]
    return f"https://www.basketball-reference.com/players/{firstLetter}/{playerId}.html"

# Helper for team exceptions, some teams have inconsistent team IDs
def streakFindingTeam(team):
    """Translate a team id into the one used when querying streaks.

    Only 'BRK' is remapped (to 'NJN'); every other value is returned unchanged.
    """
    return 'NJN' if team == 'BRK' else team

def setupHeadlessChrome():
    """Create the ChromeOptions shared by every Selenium scraper in this notebook.

    Adds the '--headless' argument (so no browser window opens) and sets two
    experimental "prefs" entries. NOTE(review): the image pref presumably
    blocks image loading, and 'disk-cache-size' may not be honored as a pref
    -- confirm against Chrome/ChromeDriver documentation.
    """
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument('--headless') # don't open URL window
    chromeOptions.add_experimental_option(
        "prefs",
        {"profile.managed_default_content_settings.images": 2, 'disk-cache-size': 4096 },
    )
    return chromeOptions

In [4]:
# Player analyzed by the cells below; must be a key of the playersToIds mapping.
playerName = "D'Angelo Russell"

In [5]:
# Total time shared on court with each teammate, summed over every season's
# lineups page (one headless Chrome session is started per season).
findTotalSharedTimes(playerName)

loading JavaScript:  https://www.basketball-reference.com/players/r/russeda01/lineups/2016
loading JavaScript:  https://www.basketball-reference.com/players/r/russeda01/lineups/2017
loading JavaScript:  https://www.basketball-reference.com/players/r/russeda01/lineups/2018
loading JavaScript:  https://www.basketball-reference.com/players/r/russeda01/lineups/2019
loading JavaScript:  https://www.basketball-reference.com/players/r/russeda01/lineups/2020


{'Julius Randle': datetime.timedelta(days=2, seconds=9888),
 'Jordan Clarkson': datetime.timedelta(days=1, seconds=35207),
 'Kobe Bryant': datetime.timedelta(seconds=67787),
 'Roy Hibbert': datetime.timedelta(seconds=67197),
 'Brandon Bass': datetime.timedelta(seconds=45172),
 'Lou Williams': datetime.timedelta(seconds=44161),
 'Nick Young': datetime.timedelta(days=1, seconds=10368),
 'Larry Nance': datetime.timedelta(seconds=51295),
 'Marcelo Huertas': datetime.timedelta(seconds=18898),
 'Anthony Brown': datetime.timedelta(seconds=18119),
 'Brandon Ingram': datetime.timedelta(seconds=51881),
 'Luol Deng': datetime.timedelta(seconds=43759),
 'Timofey Mozgov': datetime.timedelta(seconds=47789),
 'Tarik Black': datetime.timedelta(seconds=22190),
 'Ivica Zubac': datetime.timedelta(seconds=15579),
 'Allen Crabbe': datetime.timedelta(seconds=76482),
 'DeMarre Carroll': datetime.timedelta(days=1, seconds=8799),
 'Rondae Hollis-Jefferson': datetime.timedelta(seconds=77509),
 'Jarrett Allen': 

In [6]:
# Team for each season row on the player's page (a team repeats once per season).
getTeams(playerName)

['LAL', 'LAL', 'BRK', 'BRK', 'GSW', 'MIN']

In [7]:
# Career fraction of time at each position, keyed by the trailing character of the stat name.
findPositionRatios(playerName)

{'1': 0.9, '2': 0.1, '3': 0.0, '4': 0.0, '5': 0.0}

In [8]:
# Returns (all trade dates, the subset that fell inside a season window from datesDict).
getTrades(playerName)

([datetime.datetime(2017, 6, 22, 0, 0),
  datetime.datetime(2019, 7, 7, 0, 0),
  datetime.datetime(2020, 2, 6, 0, 0)],
 [datetime.datetime(2020, 2, 6, 0, 0)])

In [9]:
# Per-season and career [onCourt, onOff] plus/minus values, kept as strings.
getPlusMinus(playerName)

{'2015-16': ['-12.5', '-5.2'],
 '2016-17': ['-9.7', '-4.7'],
 '2017-18': ['-7.3', '-4.9'],
 '2018-19': ['+0.1', '+0.7'],
 '2019-20': ['-5.6', '-1.3'],
 'Career': ['-7.4', '-3.2']}

In [10]:
# findStreaks returns (points streaks, assists streaks), each keyed by year;
# it issues two streak-finder requests per season and prints each URL.
streaks = findStreaks(playerName)
print("Points streaks: ", streaks[0])
print("Assists streaks: ", streaks[1])

https://www.basketball-reference.com/play-index/pstreak.cgi?request=1&year_min=2016&year_max=2016&team_id=LAL&season_start=1&season_end=-1&c1stat=pts&c1comp=ge&c1val=20
https://www.basketball-reference.com/play-index/pstreak.cgi?request=1&year_min=2016&year_max=2016&team_id=LAL&season_start=1&season_end=-1&c1stat=ast&c1comp=ge&c1val=5
https://www.basketball-reference.com/play-index/pstreak.cgi?request=1&year_min=2017&year_max=2017&team_id=LAL&season_start=1&season_end=-1&c1stat=pts&c1comp=ge&c1val=20
https://www.basketball-reference.com/play-index/pstreak.cgi?request=1&year_min=2017&year_max=2017&team_id=LAL&season_start=1&season_end=-1&c1stat=ast&c1comp=ge&c1val=5
https://www.basketball-reference.com/play-index/pstreak.cgi?request=1&year_min=2018&year_max=2018&team_id=NJN&season_start=1&season_end=-1&c1stat=pts&c1comp=ge&c1val=20
https://www.basketball-reference.com/play-index/pstreak.cgi?request=1&year_min=2018&year_max=2018&team_id=NJN&season_start=1&season_end=-1&c1stat=ast&c1comp=

In [304]:
# Ad-hoc check of getStreakData for a single player/season: streaks of games
# with at least 20 points, queried with the remapped 'NJN' team id.
getStreakData('irvinky01', 2020, 'NJN', True, 20)

https://www.basketball-reference.com/play-index/pstreak.cgi?request=1&year_min=2020&year_max=2020&team_id=NJN&season_start=1&season_end=-1&c1stat=pts&c1comp=ge&c1val=20


[['2019-10-23', '2019-11-08', '8'],
 ['2020-01-23', '2020-01-31', '4'],
 ['2020-01-12', '2020-01-14', '2']]