In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver

import numpy as np
import string
import datetime
import time
import csv
import ast
import unicodedata

In [2]:
# Global variables: restore objects that an earlier session saved with
# IPython's %store magic. Of these, only datesDict is referenced by the code
# visible in this notebook (see getYear).
%store -r playersToIds
%store -r idsToPlayers
%store -r datesDict
%store -r statsNBAMap

In [3]:
def readData():
    """Load the player dictionary from ``allplayers.csv``.

    Each row holds a key in its first column and the textual form of a Python
    object in its second column; the second column is evaluated back into an
    object.

    Returns:
        dict: first column -> evaluated second column.
    """
    # The previous version also opened 'allreader.csv' for writing without
    # ever writing to it, which only created/truncated an empty file.
    mydict = {}
    # newline='' is what the csv module asks for so quoted fields containing
    # line breaks are parsed correctly.
    with open('allplayers.csv', mode='r', newline='') as infile:
        for rows in csv.reader(infile):
            if not rows: # blank line in the file
                continue
            mydict[rows[0]] = rows[1]

    # NOTE(security): eval() executes whatever the file contains, so only run
    # this on files you produced yourself. Other cells call .total_seconds()
    # and .year on the loaded values, so they are not plain literals and
    # ast.literal_eval is presumably not a drop-in replacement - confirm.
    for name in mydict:
        val = mydict[name]
        mydict[name] = eval(val)

    return mydict

def setupHeadlessChrome():
    """Build the Chrome options shared by the scraping helpers.

    Returns:
        webdriver.ChromeOptions: options carrying the ``--headless`` argument
        and a "prefs" experimental option.
    """
    # presumably value 2 turns image loading off - confirm against Chrome docs
    chromePrefs = {
        "profile.managed_default_content_settings.images": 2,
        'disk-cache-size': 4096,
    }
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument('--headless') # don't open URL window
    chromeOptions.add_experimental_option("prefs", chromePrefs)
    return chromeOptions

def allPlayersExceptions():
    """Register alternate spellings of player names in the global allPlayers.

    Every alias is pointed at the same object as the existing entry, so a
    lookup by either spelling returns the same data. Raises KeyError if an
    existing entry is missing.
    """
    # (alias to add, key that already exists in allPlayers)
    aliases = (
        ('Frank Mason', 'Frank Mason III'),
        ('Mohamed Bamba', 'Mo Bamba'),
        ('Troy Brown', 'Troy Brown Jr.'),
        ('Marvin Bagley', 'Marvin Bagley III'),
        ('Taurean Waller-Prince', 'Taurean Prince'),
        ('Nenê Hilário', 'Nenê'),
        ('Luigi Datome', 'Gigi Datome'),
        ('Vítor Faverani', 'Vítor Luiz Faverani'),
        ('Didier Ilunga-Mbenga', 'D.J. Mbenga'),
        ('Jakob Poeltl', 'Jakob Pöltl'),
    )
    for alias, existing in aliases:
        allPlayers[alias] = allPlayers[existing]

In [4]:
# Load the player dictionary from allplayers.csv, then add the alternate
# name spellings on top of it.
allPlayers = readData()
allPlayersExceptions()

In [22]:
def runTeams(url):
    """Scrape every team linked from one league-season page.

    Args:
        url: address of a page containing an element with id
            "team-stats-per_game" whose first tbody holds the team links.

    Returns:
        list: one ``[finalRecords, wins, losses]`` entry for each team link
        that createPlayerRow processed without a KeyError.
    """
    # NOTE(review): the find_elements_by_* helpers are deprecated in newer
    # Selenium releases; confirm the installed version before upgrading.
    options = setupHeadlessChrome()
    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(url)
        teamTable = driver.find_elements_by_id("team-stats-per_game")
        teamRows = teamTable[0].find_elements_by_tag_name('tbody')[0].find_elements_by_tag_name('a')
        # Read the hrefs while the browser is alive; the elements cannot be
        # queried once the driver has been shut down.
        teamLinks = [teamRow.get_attribute('href') for teamRow in teamRows]
    finally:
        # Previously the browser was never closed, leaking one Chrome process
        # per call.
        driver.quit()

    res = []
    for link in teamLinks:
        try:
            finalRecords, wins, losses = createPlayerRow(link)
        except KeyError as e:
            # A roster name that is not in allPlayers: report it and skip the team.
            print(e)
            continue
        res.append([finalRecords, wins, losses])
    return res

In [29]:
#['InjResp', 'ActResp', 'NetRatings', 'TeammateTimes', 'Teams', 
#'PositionRatios', 'Trades', 'MidseasonTradeRatings', 'PlusMinus', 'PointStreaks', 'AssistStreaks']
def createPlayerRow(teamUrl):
    """Build the feature rows for every player listed on one team page.

    Args:
        teamUrl: address of the team page to download.

    Returns:
        tuple: ``(finalRecords, wins, losses)``. ``finalRecords`` maps each
        resolved player name to that player's individual record followed by
        the combined record produced by getCombinedWithoutPlayer.

    Raises:
        KeyError: a listed name cannot be resolved to a key of allPlayers.
    """
    # BeautifulSoup consumes the response while it is constructed, so the
    # connection can be released right after (it used to be left open).
    with urlopen(teamUrl) as html:
        soup = BeautifulSoup(html, features="lxml")
    year = soup.find(id='meta').find('span').text
    players = soup.findAll('tr')

    wins, losses = getWinsLosses(teamUrl)

    # Resolve every listed name once; the first row is skipped, as before.
    rosterNames = []
    for playerRow in players[1:]:
        playerName = playerRow.find('a').text
        if collegeOnlys(playerName):
            continue
        if playerName not in allPlayers:
            playerName = trySuffixNames(playerName)
        rosterNames.append(playerName)

    # The full set of names has to exist before any individual record is
    # built, because getIndivRecord checks membership in ``teammates``.
    teammates = dict.fromkeys(rosterNames)
    for playerName in rosterNames:
        playerData = allPlayers[playerName]
        teammates[playerName] = getIndivRecord(playerData, year, playerName, teammates)

    combinedRecord = getCombinedRecord(teammates)
    finalRecords = dict()

    for teammate, entry in teammates.items():
        combinedWithoutPlayer = getCombinedWithoutPlayer(combinedRecord, entry, len(teammates))
        finalRecords[teammate] = entry + combinedWithoutPlayer

    return finalRecords, wins, losses

def trySuffixNames(playerName):
    """Find an alternative spelling of a name that is missing from allPlayers.

    The suffixes " Jr.", " II" and " III" are tried in that order and the
    first suffixed name present in allPlayers is returned. If none matches,
    the name is returned with its combining marks (Unicode category 'Mn')
    removed after NFD decomposition.
    """
    for suffix in (" Jr.", " II", " III"):
        candidate = playerName + suffix
        if candidate in allPlayers:
            return candidate

    decomposed = unicodedata.normalize('NFD', playerName)
    keptChars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(keptChars)

def getWinsLosses(teamUrl):
    """Read a team's wins and losses from the rendered team page.

    Args:
        teamUrl: address of the team page; once rendered it must contain an
            element with id 'team_misc'.

    Returns:
        tuple: ``(wins, losses)`` as ints, taken from the first two ``td``
        cells of that element's tbody.
    """
    options = setupHeadlessChrome()
    driver = webdriver.Chrome(options=options) # used to view generated JavaScript
    try:
        driver.get(teamUrl)
        winsTable = driver.find_element_by_id('team_misc').find_element_by_tag_name('tbody').find_elements_by_tag_name('td')
        # Extract the text before the browser goes away.
        wins = int(winsTable[0].text)
        losses = int(winsTable[1].text)
    finally:
        # Previously the browser was never closed, leaking one Chrome process
        # per team.
        driver.quit()

    return wins, losses
    
def getCombinedRecord(teammates):
    """Add up the records of all teammates position by position.

    Args:
        teammates: dict whose values are sequences of numbers (at most 32
            items each).

    Returns:
        list: 32 totals; positions no record reaches stay 0.
    """
    totals = [0] * 32
    for record in teammates.values():
        for position, value in enumerate(record):
            totals[position] += value
    return totals

def getCombinedWithoutPlayer(combinedRecord, entry, numTeammates):
    """Return a copy of the combined record with one player's share removed.

    For the selected feature positions (and only those below ``len(entry)``)
    the value becomes
    ``(combined * numTeammates - entry) / (numTeammates - 1)``;
    all other positions are copied unchanged. ``combinedRecord`` itself is
    not modified.

    NOTE(review): getCombinedRecord produces sums, while this formula reads
    like it expects a mean - confirm the intended definition. A team of one
    (numTeammates == 1) divides by zero.
    """
    # positions whose value is adjusted for this player (no overlap/replication)
    adjustedPositions = (0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17,
                         20, 21, 22, 23, 24, 26, 27, 28, 30, 31)
    withoutPlayer = combinedRecord.copy()
    entryLength = len(entry)
    for position in adjustedPositions:
        if position >= entryLength:
            continue
        scaled = withoutPlayer[position] * numTeammates
        withoutPlayer[position] = (scaled - entry[position]) / (numTeammates - 1)
    return withoutPlayer
        
def getIndivRecord(playerData, year, playerName, teammates):
    """Assemble one player's flat feature record.

    The feature helpers are called in a fixed order and their results are
    laid out back to back; the index ranges below assume each helper returns
    the documented number of values.

    Returns:
        list: the concatenated features.
    """
    injResp = getInjRespFeature(playerData, year, True) # [weightedRating, totalGames] 0-1
    actResp = getInjRespFeature(playerData, year, False) # [weightedRating, totalGames] 2-3
    netRatingsList = getNetRatingsFeature(playerData, year) # ['Overall', '0 Days Rest',...] 4-11
    teammateTimes = getCurrentTeammateTimes(playerData, playerName, year, teammates) # Total time 12
    posRatios = getPositionRataioFeature(playerData, year) # [PG, SG, SF, PF, C] 13-17
    numTrades = getNumTrades(playerData, year) # number 18
    numMidTrades = getNumMidseasonTrades(playerData, year) # number 19
    midTradeResp = getMidseasonTradeFeature(playerData, year) # [rating before, rating after] 20-21
    plusMinus = getPlusMinusFeature(playerData, year) # [on court, on/off] 22-23
    # [avgStreakLength, totalNumStreaks, avgStreakLengthPerYear, avgStreakCountPerYear]
    pointStreaks = getPointStreaks(playerData, year, True) # 24-27
    assistStreaks = getPointStreaks(playerData, year, False) # 28-31

    return [
        *injResp,
        *actResp,
        *netRatingsList,
        teammateTimes,
        *posRatios,
        numTrades,
        numMidTrades,
        *midTradeResp,
        *plusMinus,
        *pointStreaks,
        *assistStreaks,
    ]
        
def getInjRespFeature(playerData, year, isInjury):
    """Compute the response feature from 'InjResp' (isInjury) or 'ActResp'.

    Returns whatever injRespFeatureHelper returns for the selected data.
    """
    sourceKey = 'InjResp' if isInjury else 'ActResp'
    return injRespFeatureHelper(playerData[sourceKey], year)

def injRespFeatureHelper(injResp, year):
    """Average the ratings of the matching seasons, weighted by games.

    Args:
        injResp: 0 when there is nothing recorded; otherwise an indexable
            whose item 1 maps season -> list of entries, each entry being
            (rating, sequence of games).
        year: season label handed to checkSeason for filtering.

    Returns:
        list: ``[weightedRating, totalGames]``; ``[0, 0]`` when there is no
        data or the matching entries cover no games at all.
    """
    if injResp == 0: # no injuries
        return [0, 0]

    injuryRatings = []
    totalGames = 0
    for season, entries in injResp[1].items():
        if checkSeason(year, season):
            for injury in entries:
                numGames = len(injury[1])
                totalGames += numGames
                injuryRatings.append((injury[0], numGames))

    # Entries whose game lists are all empty used to trigger a
    # ZeroDivisionError in the weighting below.
    if totalGames == 0:
        return [0, 0]

    weightedRating = 0
    for rating, numGames in injuryRatings:
        weightedRating += rating*(numGames/totalGames)
    return [weightedRating, totalGames]

def getNetRatingsFeature(playerData, year):
    """Average the per-rest-type net ratings over the matching seasons.

    Within a season the values of all teams are averaged (each team weighted
    by 1/number of teams); those season figures are then averaged over the
    seasons accepted by checkSeason.

    Returns:
        list: 8 values ordered 'Overall', '0 Days Rest' ... '6+ Days Rest';
        eight zeros when no season matches.
    """
    restKeys = ('Overall', '0 Days Rest', '1 Days Rest', '2 Days Rest',
                '3 Days Rest', '4 Days Rest', '5 Days Rest', '6+ Days Rest')
    totals = dict.fromkeys(restKeys, 0)
    seasonsCounted = 0

    for season, teamRatings in playerData["NetRatings"].items():
        seasonValues = dict.fromkeys(restKeys, 0)
        if checkSeason(year, season):
            seasonsCounted += 1
            for ratings in teamRatings.values():
                for restType, value in ratings.items():
                    seasonValues[restType] += float(value) * 1/(len(teamRatings))
        for key in restKeys:
            totals[key] += seasonValues[key]

    if seasonsCounted == 0:
        return [0]*8
    return [totals[key]/seasonsCounted for key in restKeys] # size-8 list

def getCurrentTeammateTimes(playerData, playerName, year, teammates):
    """Total the seconds previously shared with players on the current team.

    Only seasons accepted by checkSeason are considered, and only former
    teammates that appear in ``teammates`` directly or with a " Jr." suffix.
    Durations must offer ``total_seconds()``.

    Returns:
        The summed seconds (0 when nothing matches).
    """
    sharedSeconds = 0
    timesBySeason = playerData['TeammateTimes'][1]
    for season, partners in timesBySeason.items():
        if not checkSeason(year, season):
            continue
        for partner, duration in partners.items():
            if partner in teammates or juniorCheck(partner, teammates):
                sharedSeconds += duration.total_seconds()
    return sharedSeconds

def getPositionRataioFeature(playerData, year):
    """Average the position ratios over the matching seasons.

    The "Career" entry is ignored. Position keys are converted with int()
    and stored at index ``position - 1``.

    Returns:
        list: 5 averaged ratios (per the caller's comment PG, SG, SF, PF, C);
        five zeros when no season matches.
    """
    totals = [0, 0, 0, 0, 0]
    seasonsCounted = 0
    for season, ratios in playerData['PositionRatios'].items():
        if season == "Career" or not checkSeason(year, season):
            continue
        seasonsCounted += 1
        for position, ratio in ratios.items():
            totals[int(position)-1] += ratio

    if seasonsCounted == 0:
        return totals
    return [total/seasonsCounted for total in totals]

def getNumTrades(playerData, year):
    """Return the length of the first item of playerData['Trades'].

    ``year`` is accepted for a uniform signature but not used.
    """
    allTrades = playerData['Trades'][0]
    return len(allTrades)

def getNumMidseasonTrades(playerData, year):
    """Return the length of the second item of playerData['Trades'].

    ``year`` is accepted for a uniform signature but not used.
    """
    midseasonTrades = playerData['Trades'][1]
    return len(midseasonTrades)

def getMidseasonTradeFeature(playerData, year):
    """Average the before/after ratings of the matching midseason trades.

    Each trade date is mapped to a season with getYear and kept when
    checkSeason accepts it. A rating of None counts as 0.

    Returns:
        list: ``[average before, average after]``; ``[0, 0]`` when no trade
        matches.
    """
    beforeTotal = 0
    afterTotal = 0
    tradesCounted = 0
    for tradeDate, ratings in playerData['MidseasonTradeRatings'].items():
        if not checkSeason(year, getYear(tradeDate)):
            continue
        beforeTotal += 0 if ratings[0] is None else ratings[0]
        afterTotal += 0 if ratings[1] is None else ratings[1]
        tradesCounted += 1

    if tradesCounted == 0:
        return [0, 0]
    return [beforeTotal/tradesCounted, afterTotal/tradesCounted]

def getPlusMinusFeature(playerData, year):
    """Average the two plus/minus figures over the matching seasons.

    The "Career" entry is ignored and a season stored as ['', ''] counts as
    zeros.

    Returns:
        list: ``[on court, on/off]`` averages; ``[0, 0]`` when no season
        matches.
    """
    seasonsCounted = 0
    onCourtTotal = 0
    onOffTotal = 0
    for season, figures in playerData['PlusMinus'].items():
        if season == "Career" or not checkSeason(year, season):
            continue
        seasonsCounted += 1
        if figures == ['', '']: # Basketball Reference bug, see Damion James
            figures = ['0.0', '0.0']
        onCourtTotal += float(figures[0])
        onOffTotal += float(figures[1])

    if seasonsCounted == 0:
        return [0, 0]
    return [onCourtTotal/seasonsCounted, onOffTotal/seasonsCounted]

def getPointStreaks(playerData, year, isPoints):
    """Summarise the streaks from 'PointStreaks' (isPoints) or 'AssistStreaks'.

    Only seasons accepted by checkSeason are used; item 2 of every streak is
    converted with float() and treated as its length.

    Returns:
        list: ``[avgStreakLength, totalNumStreaks, avgStreakLengthPerYear,
        avgStreakCountPerYear]``; each average is 0 when it has nothing to
        average.
    """
    streaksBySeason = playerData['PointStreaks' if isPoints else 'AssistStreaks']

    seasonAverages = []
    seasonCounts = []
    lengthTotal = 0.0

    for season, streaks in streaksBySeason.items():
        if not checkSeason(year, season):
            continue
        seasonCount = 0
        seasonLength = 0
        for seasonCount, streak in enumerate(streaks, start=1):
            seasonLength += float(streak[2])

        seasonAverages.append(seasonLength/seasonCount if seasonCount != 0 else 0)
        seasonCounts.append(seasonCount)
        lengthTotal += seasonLength

    totalNumStreaks = sum(seasonCounts)
    avgStreakLength = lengthTotal/totalNumStreaks if totalNumStreaks != 0 else 0

    if len(seasonAverages) != 0:
        avgStreakLengthPerYear = sum(seasonAverages)/len(seasonAverages)
    else:
        avgStreakLengthPerYear = 0
    if len(seasonCounts) != 0:
        avgStreakCountPerYear = sum(seasonCounts)/len(seasonCounts)
    else:
        avgStreakCountPerYear = 0

    return [avgStreakLength, totalNumStreaks, avgStreakLengthPerYear, avgStreakCountPerYear]
                
def juniorCheck(playerName, teammates):
    """Tell whether ``playerName`` followed by " Jr." is in ``teammates``."""
    # Tim Hardaway Jr. listed as Tim Hardaway on Basketball Reference
    juniorName = playerName + " Jr."
    return juniorName in teammates
    
def collegeOnlys(playerName):
    """Tell whether the name is on the hard-coded skip list.

    Judging by the function name these are presumably players with only
    college data - confirm.
    """
    skippedNames = (
        'Jontay Porter',
        'Kyle Alexander',
        'Devontae Cacok',
        'Bol Bol',
        "Sir'Dominic Pointer",
        'Kenny Wooten',
        'Dylan Windler',
    )
    return playerName in skippedNames
    
def checkSeason(year, season):
    """Tell whether ``season`` starts no later than ``year``.

    Both arguments are labels containing '-'; the text before the first '-'
    is compared as an integer. A label without '-' raises ValueError.
    """
    yearDash = year.index('-')
    seasonDash = season.index('-')
    seasonStart = int(season[:seasonDash])
    yearStart = int(year[:yearDash])
    return seasonStart <= yearStart

def getYear(date):
    """Return the season label whose date window contains ``date``.

    The candidate labels come from getPotentialTradeYears(date.year); each
    one's window is read from datesDict[label] as (start, end) strings in
    the format '%a, %b %d, %Y'. Candidates are checked in order and the
    first match wins.

    Args:
        date: a datetime.datetime.

    Returns:
        str: the matching season label.

    Raises:
        ValueError: ``date`` lies in none of the candidate windows. This
            used to surface as an UnboundLocalError.
        KeyError: a candidate that has to be examined is missing from
            datesDict.
    """
    dateFormat = '%a, %b %d, %Y'
    # Windows are parsed lazily, so a later candidate that is never needed
    # no longer has to be present in datesDict.
    for season in getPotentialTradeYears(date.year):
        seasonStart = datetime.datetime.strptime(datesDict[season][0], dateFormat)
        seasonEnd = datetime.datetime.strptime(datesDict[season][1], dateFormat)
        if seasonStart <= date <= seasonEnd:
            return season
    raise ValueError('date %s is outside every candidate season window' % (date,))

def getPotentialTradeYears(year):
    """Return the two season labels a date in calendar ``year`` can belong to.

    Labels look like '2018-19'. The label "2020-21" is replaced by
    "2019-20".

    Returns:
        list: ``[season ending in year, season starting in year]``.
    """
    def seasonLabel(startYear):
        return str(startYear) + '-' + str(startYear + 1)[2:]

    endingSeason = seasonLabel(year - 1)
    startingSeason = seasonLabel(year)
    if startingSeason == "2020-21":
        startingSeason = "2019-20"
    return [endingSeason, startingSeason]

In [17]:
# Season (as a string, e.g. '2020') -> list returned by runTeams for that
# season. NOTE: re-running this cell discards everything collected so far.
yearRecords = dict()

In [33]:
# Scrape the seasons 2020 down to 2001, newest first. Seasons already present
# in yearRecords are skipped, so the cell can be re-run after an interruption
# without repeating finished seasons.
for i in range(2020, 2000, -1):
    if str(i) in yearRecords:
        continue
    print(i)  # progress marker: the season about to be scraped
    url = "https://www.basketball-reference.com/leagues/NBA_" + str(i) + ".html"
    yearResult = runTeams(url)
    yearRecords[str(i)] = yearResult

2020


In [43]:
# Persist yearRecords with IPython's %store magic so that a later kernel can
# bring it back with `%store -r yearRecords`.
%store yearRecords

Stored 'yearRecords' (dict)


In [45]:
# Show which seasons have been collected so far.
yearRecords.keys()

dict_keys(['2019', '2018', '2017', '2016', '2015', '2014', '2013', '2009', '2008', '2007', '2005', '2004', '2003', '2002', '2011', '2010', '2006', '2001', '2012', '2020'])

In [44]:
# Restore the yearRecords value previously saved with `%store yearRecords`.
%store -r yearRecords

In [13]:
# Ad-hoc lookup of a single player; raises KeyError when the name is not a
# key of allPlayers.
allPlayers['Marty Conlon']

KeyError: 'Marty Conlon'

In [39]:
def writeData(yearRecords):
    """Write yearRecords to ``yearRecords.csv``, one row per key.

    Each row holds the key and the value; the csv module stores a non-string
    value as its ``str()`` form.

    Args:
        yearRecords: dict to serialise.
    """
    # The file used to be opened without ever being closed. The context
    # manager flushes and closes it, and newline='' is what the csv module
    # requires to avoid doubled line endings on some platforms.
    with open("yearRecords.csv", "w", newline='') as outfile:
        w = csv.writer(outfile)
        for key, val in yearRecords.items():
            w.writerow([key, val])

In [40]:
writeData(yearRecords)

In [41]:
import sys
import csv

# Raise the csv module's per-field size limit: writeData stores a whole
# season's records in a single field, which can be very large.
csv.field_size_limit(sys.maxsize)

def readYearRecords():
    """Load the season records back from ``yearRecords.csv``.

    Each row holds a key in its first column and the textual form of a Python
    object in its second column; the second column is evaluated back into an
    object.

    Returns:
        dict: first column -> evaluated second column.
    """
    # The previous version also opened 'yearReader.csv' for writing without
    # ever writing to it, which only created/truncated an empty file.
    mydict = {}
    # newline='' is what the csv module asks for so quoted fields containing
    # line breaks are parsed correctly.
    with open('yearRecords.csv', mode='r', newline='') as infile:
        for rows in csv.reader(infile):
            if not rows: # blank line in the file
                continue
            mydict[rows[0]] = rows[1]

    # NOTE(security): eval() executes whatever the file contains, so only run
    # this on files you produced yourself.
    for name in mydict:
        val = mydict[name]
        mydict[name] = eval(val)

    return mydict

In [42]:
# Reload the records from yearRecords.csv, replacing the in-memory dict.
yearRecords = readYearRecords()