# Part 1 - Libraries

In [2]:
#pip install yfinance
#pip install matplotlib pendulum
# pip.exe install selenium in anaconda terminal

import time
start_time = time.time() # Start time of script

# Pulling Data
import yfinance as yf
import pandas as pd
import pendulum
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import datetime
from urllib.request import urlopen
import re
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.select import Select
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from time import sleep
import string
import unidecode

headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})

chrome_options = Options()
chrome_options.add_argument("--headless")  # servers don't provide the visulazation
chrome_options.add_argument("--no-sandbox")  # operate at the highest authority
chrome_options.add_argument(
    "--disable-dev-shm-usage"
)  # increase the RAM of chrome to load the page


path = "chromedriver.exe"

# Math
import math
import numpy as np
import warnings

# Data Management
import gc

# Wrapping
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split

# Modeling
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

# For business days
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
US_BUSINESS_DAY = CustomBusinessDay(calendar=USFederalHolidayCalendar())


warnings.filterwarnings("ignore")

# Part 2 - Get Data (Basketball Reference)

## Part 2.1 - Season Stats

In [3]:
# Initial lists to be put into dataframe

seasons = []
names = []
urls = []
positions = []
ages = []
teams = []
games_played = []
games_started = []
minutes = []
fgm = []
fga = []
fgp = []
three_m = []
three_a = []
three_p = []
two_m = []
two_a = []
two_p = []
efgp = []
ftm = []
fta = []
ftp = []
orb = []
drb = []
trb = []
ast = []
stl = []
blk = []
to = []
pf = []
points = []

years = [i for i in range(2002, 2023)]

for year in years:
    
    url = 'https://www.basketball-reference.com/leagues/NBA_' + str(year) + '_per_game.html'
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content,"lxml")
    table = soup.find_all('tbody')[0]
    player_rows = table.find_all('tr')
    
    for current_row in player_rows:
    
        player_bool = current_row.find('th').get_text() != 'Rk'

        if player_bool:

            # Gathering Player Information
            player_name = current_row.find_all('td')[0].get_text()
            url_before = str(current_row.find_all('td')[0].find('a'))
            beg_ind = url_before.find('/')
            end_ind = url_before.find('>')-1    
            player_url = 'https://www.basketball-reference.com' + url_before[beg_ind:end_ind]
            player_position = current_row.find_all('td')[1].get_text()
            player_age = int(current_row.find_all('td')[2].get_text())
            player_team = current_row.find_all('td')[3].get_text()
            player_games_played = int(current_row.find_all('td')[4].get_text())
            player_games_started = int(current_row.find_all('td')[5].get_text())
            player_minutes = float(current_row.find_all('td')[6].get_text())
            player_fgm = float(current_row.find_all('td')[7].get_text())
            player_fga = float(current_row.find_all('td')[8].get_text())
            player_fgp = current_row.find_all('td')[9].get_text() # fix percentage
            player_3fgm = float(current_row.find_all('td')[10].get_text())
            player_3fga = float(current_row.find_all('td')[11].get_text())
            player_3fgp = current_row.find_all('td')[12].get_text() # fix percentage
            player_2fgm = float(current_row.find_all('td')[13].get_text())
            player_2fga = float(current_row.find_all('td')[14].get_text())
            player_2fgp = current_row.find_all('td')[15].get_text() # fix percentage
            player_efgp = current_row.find_all('td')[16].get_text() # fix percentage
            player_ftm = float(current_row.find_all('td')[17].get_text())
            player_fta = float(current_row.find_all('td')[18].get_text())
            player_ftp = current_row.find_all('td')[19].get_text() # fix percentage
            player_orb = float(current_row.find_all('td')[20].get_text())
            player_drb = float(current_row.find_all('td')[21].get_text())
            player_trb = float(current_row.find_all('td')[22].get_text())
            player_ast = float(current_row.find_all('td')[23].get_text())
            player_stl = float(current_row.find_all('td')[24].get_text())
            player_blk = float(current_row.find_all('td')[25].get_text())
            player_tov = float(current_row.find_all('td')[26].get_text())
            player_pf = float(current_row.find_all('td')[27].get_text())
            player_pts = float(current_row.find_all('td')[28].get_text())
            
            # Fixing Team
            if player_team == 'NJN':
                player_team = 'BKN'
            elif player_team == 'CHH':
                player_team = 'CHA'
            elif player_team == 'NOK':
                player_team = 'NOP'
            elif player_team == 'BRK':
                player_team = 'BKN'
            elif player_team == 'CHO':
                player_team = 'CHA'
            elif player_team == 'NOH':
                player_team = 'NOP'
            elif player_team == 'PHO':
                player_team = 'PHX'
            elif player_team == 'SEA':
                player_team = 'OKC'
                
            # Fixing Positions
            if '-' in player_position:
                player_position = player_position.split('-')[0]
                
            if (player_position == 'PG') | (player_position == 'SG'):
                player_position = 'G'
            elif (player_position == 'SF') | (player_position == 'PF'):
                player_position = 'F'
                
            # Fixing Empty Stats / Percentages
            if player_fgp == '':
                player_fgp = 0
            else:
                player_fgp = float(player_fgp)

            if player_3fgp == '':
                player_3fgp = 0
            else:
                player_3fgp = float(player_3fgp)

            if player_2fgp == '':
                player_2fgp = 0
            else:
                player_2fgp = float(player_2fgp)

            if player_efgp == '':
                player_efgp = 0
            else:
                player_efgp = float(player_efgp)

            if player_ftp == '':
                player_ftp = 0
            else:
                player_ftp = float(player_ftp)

            # Adding to List
            seasons.append(year)
            names.append(player_name)
            urls.append(player_url)
            positions.append(player_position)
            ages.append(player_age)
            teams.append(player_team)
            games_played.append(player_games_played)
            games_started.append(player_games_started)
            minutes.append(player_minutes)
            fgm.append(player_fgm)
            fga.append(player_fga)
            fgp.append(player_fgp)
            three_m.append(player_3fgm)
            three_a.append(player_3fga)
            three_p.append(player_3fgp)
            two_m.append(player_2fgm)
            two_a.append(player_2fga)
            two_p.append(player_2fgp)
            efgp.append(player_efgp)
            ftm.append(player_ftm)
            fta.append(player_fta)
            ftp.append(player_ftp)
            orb.append(player_orb)
            drb.append(player_drb)
            trb.append(player_trb)
            ast.append(player_ast)
            stl.append(player_stl)
            blk.append(player_blk)
            to.append(player_tov)
            pf.append(player_pf)
            points.append(player_pts)

player_info_df = pd.DataFrame({'Season': seasons, 
                              'Player_Name': names,
                              'Player_URL': urls,
                              'Player_Position': positions,
                              'Player_Age': ages,                               
                              'Player_Team': teams,  
                              'Player_Games_Played': games_played,
                              'Player_Games_Started': games_started,
                              'Player_Minutes': minutes,
                              'Player_Points': points,
                              'Player_FGM': fgm,
                              'Player_FGA': fga,
                              'Player_FG%': fgp,
                              'Player_3FGM': three_m,
                              'Player_3FGA': three_a,
                              'Player_3FG%': three_p,
                              'Player_2FGM': two_m,
                              'Player_2FGA': two_a,
                              'Player_2FG%': two_p,
                              'Player_EFG%': efgp,
                              'Player_FTM': ftm,
                              'Player_FTA': fta,
                              'Player_FT%': ftp,
                              'Player_ORB': orb,
                              'Player_DRB': drb,
                              'Player_TRB': trb,
                              'Player_Ast': ast,
                              'Player_STL': stl,
                              'Player_BLK': blk,
                              'Player_TO': to,
                              'Player_PF': pf,
                              'Player_Pts': points})          

## Part 2.2 - Current Year Players

In [4]:
# Initial Lists

names = []
teams = []
positions = []

# Selenium

Path = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(Path)
url = 'https://www.nba.com/players'
driver.get(url)
WebDriverWait(driver, 1000)
link = Select(driver.find_element(By.XPATH, value = '/html/body/div[1]/div[2]/div[2]/main/div[2]/section/div/div[2]/div[1]/div[7]/div/div[3]/div/label/div/select'))
link.select_by_index(0)

# Beautiful Soup

page = driver.page_source
soup = BeautifulSoup(page,"lxml")
table = soup.find_all('tbody')[0]
player_rows = table.find_all('tr')

year = years[-1] + 1

for current_row in player_rows:
    
    # Gathering Player Information
    player_name = current_row.find_all('td')[0].find_all('div')[1]
    first_name = player_name.find_all('p')[0].get_text()
    second_name = player_name.find_all('p')[1].get_text()
    player_name = first_name + ' ' + second_name
    player_team = current_row.find_all('td')[1].get_text()
    player_position = current_row.find_all('td')[3].get_text()
    if '-' in player_position:
        player_position = player_position.split('-')[0]
    
    # Adding to Lists
    names.append(player_name)
    teams.append(player_team)
    positions.append(player_position)

current_player_df = pd.DataFrame({
    'Season': year,
    'Player_Name': names,
    'Player_Team': teams,
    'Player_Position': positions
})

player_info_df = player_info_df.append(current_player_df).reset_index().drop('index', axis = 1)

## Part 2.3 - Draft Information

In [5]:
names = []
picks = []
teams = []
seasons = []

for year in range(2001, 2023):
    url = 'https://www.basketball-reference.com/draft/NBA_' + str(year) + '.html'
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content,"lxml")
    draftees = soup.find('tbody').find_all('tr')

    for d in draftees:
        draftee_info = d.find_all('td')
        if len(draftee_info) > 1:
            player_pick = int(draftee_info[0].get_text())
            player_team = draftee_info[1].get_text()
            # Fixing Team
            if player_team == 'NJN':
                player_team = 'BKN'
            elif player_team == 'CHH':
                player_team = 'CHA'
            elif player_team == 'NOK':
                player_team = 'NOP'
            elif player_team == 'BRK':
                player_team = 'BKN'
            elif player_team == 'CHO':
                player_team = 'CHA'
            elif player_team == 'NOH':
                player_team = 'NOP'
            elif player_team == 'PHO':
                player_team = 'PHX'
            elif player_team == 'SEA':
                player_team = 'OKC'
            player_name = draftee_info[2].get_text()
            names.append(player_name)
            picks.append(player_pick)
            teams.append(player_team)
            seasons.append(year+1)
            
draft_df = pd.DataFrame({
    'Season': seasons,
    'Player_Pick': picks,
    'Player_Team': teams,
    'Player_Name': names
})

## Part 2.4 - Separating Draftees by College and International

In [6]:
high_school_players = ['Kwame Brown', 'Tyson Chandler', 'Eddy Curry', "Amar'e Stoudemire", \
                      'LeBron James', 'Travis Outlaw', 'Kendrick Perkins', 'Dwight Howard', \
                      'Shaun Livingston', 'Robert Swift', 'Sebastian Telfair', 'Al Jefferson', \
                      'Josh Smith', 'J.R. Smith', 'Dorell Wright', 'Martell Webster', 'Andrew Bynum', \
                      'Gerald Green', 'C.J. Miles', 'Monta Ellis', 'Lou Williams', 'Amir Johnson', \
                      'James Lang', 'DeSagana Diop']

international_players = ['Pau Gasol', 'Vladimir Radmanović', 'Raül López', \
                         'Tony Parker', 'Mehmet Okur', 'Ousmane Cisse', 'Antonis Fotsis', 'Yao Ming', \
                        'Nikoloz Tskitishvili', 'Nenê', 'Boštjan Nachbar', 'Jiří Welsch', \
                        'Nenad Krstić', 'Milos Vujanic', 'David Andersen', 'Juan Carlos Navarro', \
                        'Mario Kasun', 'Peter Fehse', 'Federico Kammerichs', 'Mladen Sekularac', \
                        'Luis Scola', 'Darko Miličić', 'Mickaël Piétrus', 'Žarko Čabarkapa', \
                        'Sasha Pavlović', 'Boris Diaw', 'Zoran Planinić', 'Carlos Delfino', \
                        'Ndudi Ebi', 'Leandro Barbosa', 'Maciej Lampe', 'Sofoklis Schortsanitis', \
                        'Szymon Szewczyk', 'Slavko Vraneš', 'Zaza Pachulia', 'Malick Badiane', \
                        'Sani Bečirovič', 'Xue Yuyang', 'Andreas Glyniadakis', 'Rafael Araújo', \
                        'Andris Biedriņš', 'Pavel Podkolzin', 'Viktor Khryapa', 'Sergei Monia', \
                        'Sasha Vujačić', 'Beno Udrih', 'Anderson Varejão', 'Peter John Ramos', \
                         'Albert Miralles', 'Viktor Sanikidze', 'Ha Seung-Jin', 'Sergei Lishouk', \
                         'Vassilis Spanoulis', 'Sergei Karaulov', 'Fran Vázquez', 'Yaroslav Korolev', \
                         'Francisco García', 'Johan Petro', 'Ian Mahinmi', 'Ricky Sánchez', \
                         'Ersan İlyasova', 'Roko Ukić', 'Mile Ilić', 'Martynas Andriuškevičius', \
                         'Erazem Lorbek', 'Mickaël Gelabale', 'Andray Blatche', 'Axel Hervelle', \
                         'Marcin Gortat', 'Uroš Slokar', 'Cenk Akyol', 'Andrea Bargnani', \
                         'Thabo Sefolosha', 'Oleksiy Pecherov', 'Sergio Rodríguez', 'Joel Freeland', \
                         'Kosta Perović', 'Marcus Vinicius', 'Lior Eliyahu', 'Vladimir Veremeenko', \
                         'Cheikh Samb', 'Guillermo Díaz', 'Yotam Halperin', 'Ejike Ugboaja', \
                         'Edin Bavčić', 'Loukas Mavrokefalidis', 'Damir Markota', 'Yi Jianlian', \
                         'Marco Belinelli', 'Rudy Fernández', 'Tiago Splitter', 'Petteri Koponen', \
                         'Kyrylo Fesenko', 'Stanko Barać', 'Sun Yue', 'Stéphane Lasme', \
                         'Marc Gasol', 'Brad Newley', 'Georgios Printezis', 'Milovan Raković', \
                         'Danilo Gallinari', 'Alexis Ajinça', 'Serge Ibaka', 'Nicolas Batum', \
                         'Nikola Peković', 'Ömer Aşık', 'Nathan Jawai', \
                         'Ante Tomić', 'Goran Dragić', 'Tadija Dragićević', 'Semih Erden', \
                         'Ricky Rubio', 'Brandon Jennings', 'Victor Claver', 'Omri Casspi', \
                         'Byron Mullens', 'Rodrigue Beaubois', 'Sergio Llull', 'Jonas Jerebko', \
                         'Henk Norel', 'Sergiy Gladyr', 'Nando De Colo', 'Patty Mills', \
                         'Ahman Nivins', 'Emir Preldžić', 'Kevin Séraphin', 'Greivis Vásquez', \
                         'Tibor Pleiß', 'Nemanja Bjelica', 'Paulão Prestes', 'Ryan Richards', \
                         'Pape Sy', 'Enes Freedom', 'Jonas Valančiūnas', 'Jan Veselý', \
                         'Bismack Biyombo', 'Nikola Vučević', 'Donatas Motiejūnas', 'Nikola Mirotić', \
                         'Bojan Bogdanović', 'Jeremy Tyler', 'Dāvis Bertāns', 'Milan Mačvan', \
                         'Chukwudiebere Maduabum', 'Tanguy Ngombo', 'Ádám Hanga', 'Evan Fournier', \
                         "Tomáš Satoranský", 'Kostas Papanikolaou', 'Ognjen Kuzmić', \
                         'Furkan Aldemir', 'Tornike Shengelia', 'Tomislav Zubčić', \
                         'İlkan Karaman', 'Giannis Antetokounmpo', 'Lucas Nogueira', \
                         'Dennis Schröder', 'Sergey Karasev', 'Rudy Gobert', 'Livio Jean-Charles', \
                         'Nemanja Nedović', 'Álex Abrines', 'Marko Todorović', 'Raul Neto', \
                         'Joffrey Lauvergne', 'Bojan Dubljević', 'Jānis Timma', 'Dante Exum', \
                         'Dario Šarić', 'Jusuf Nurkić', 'Bruno Caboclo', 'Clint Capela', \
                         'Bogdan Bogdanović', 'Damien Inglis', 'Nikola Jokić', 'Edy Tavares', \
                         'Thanasis Antetokounmpo', 'Vasilije Micić', 'Alessandro Gentile', \
                         'Nemanja Dangubić', 'Louis Labeyrie', 'Kristaps Porziņģis', \
                        'Emmanuel Mudiay', 'Nikola Milutinov', 'Cedi Osman', 'Willy Hernangómez', \
                        'Juan Pablo Vaulet', 'Artūras Gudaitis', 'Marcus Eriksson', \
                        'Satnam Singh', 'Dani Díez', 'Nikola Radičević', 'Dimitrios Agravanis', \
                        'Luka Mitrović', 'Dragan Bender', 'Georgios Papagiannis', \
                        'Juancho Hernangómez', 'Guerschon Yabusele', 'Ante Žižić', \
                        'Timothé Luwawu-Cabarrot', 'Furkan Korkmaz', 'Skal Labissière', \
                        'Ivica Zubac', 'Rade Zagorac', 'David Michineau', 'Stephen Zimmerman', \
                        'Zhou Qi', 'Isaïa Cordinier', 'Paul Zipser', 'Petr Cornelie', \
                        'Wang Zhelin', 'Frank Ntilikina', 'Terrance Ferguson', 'Anžejs Pasečņiks', \
                        'Isaiah Hartenstein', 'Vlatko Čančar', 'Mathias Lessort', 'Mouhamed Sene', \
                        'Renaldas Seibutis', 'Christian Eyenga', 'İzzet Türkyılmaz', 'Thon Maker', \
                        'Mathias Lessort', 'Sasha Vezenkov', 'Alpha Kaba', 'Luka Dončić', \
                        'Džanan Musa', 'Elie Okobo', 'Isaac Bonga',  \
                        'Rodions Kurucs', 'Issuf Sanon', 'Arnoldas Kulboka', 'Sekou Doumbouya', \
                        'Goga Bitadze', 'Luka Šamanić', 'Didi Louzada', 'Deividas Sirvydis', \
                        'Alen Smailagić', 'Vanja Marinković', 'LaMelo Ball', 'Killian Hayes', \
                        'Deni Avdija', 'Aleksej Pokusevski', 'Leandro Bolmaro', 'R.J. Hampton', \
                        'Théo Maledon', 'Vit Krejci', 'Marko Simonovic', 'Yam Madar', \
                        'Josh Giddey', 'Alperen Şengün', 'Usman Garuba', 'Rokas Jokubaitis', \
                        'Juhann Begarin', 'Filip Petrušev', 'Balša Koprivica', 'Georgios Kalaitzakis', \
                        'Nikola Jović', 'Gabriele Procida', 'Khalifa Diop', 'Ousmane Dieng', \
                        'Ismael Kamagate', 'Matteo Spagnolo', 'Karlo Matković', \
                        'Yannick Nzosa', 'Gui Santos', 'Luke Travers', 'Hugo Besson', 'Paccelis Morlende', \
                        'Remon van de Hare', 'Nedžad Sinanović', 'Mario Hezonja', 'Ognjen Jaramaz']

g_league_players = ['Jalen Green', 'Jonathan Kuminga', 'Isaiah Todd', 'Jaden Hardy', 'Dyson Daniels']
                         
no_record_players = ['Kedrick Brown', 'Qyntel Woods', 'Ronald Murray', 'Jerome Beasley', \
                    'Donta Smith', 'David Young', 'Latavious Williams', 'Ricky Ledo', \
                    'Anfernee Simons', 'Mitchell Robinson', 'Darius Bazley', \
                    'Kenyon Martin Jr.', 'Jay Scrubb', 'Reggie Perry', 'Shaedon Sharpe', \
                    'MarJon Beauchamp']
                         
two_players = ['Ken Johnson', 'Mike Dunleavy', 'Bobby Jones', 'Dee Brown', 'Gerald Henderson', \
              'James Johnson', 'Derrick Williams', 'Jordan Hamilton', 'Thomas Robinson', \
              'Jared Cunningham', 'Justin Hamilton', 'Reggie Bullock', 'Ray McCallum', \
              'Devin Booker', 'Josh Richardson', 'Demetrius Jackson', \
              'Josh Jackson', 'Frank Jackson', 'Jerome Robinson', \
              'Bruce Brown', 'Vince Edwards', 'Justin James', 'Anthony Edwards', \
              'Patrick Williams', 'Isaiah Stewart', 'Josh Green', 'Nick Richards', \
              'Jalen Harris', 'Keon Johnson', ]
                         
punctuation_players = ['T.J. Ford', "Patrick O'Bryant", 'J.J. Redick', 'P.J. Tucker', 'J.R. Pinnock', \
             'D.J. Strawberry', 'O.J. Mayo', 'D.J. Augustin', 'J.J. Hickson', \
             'D.J. White', 'J.R. Giddens', 'A.J. Price', "Da'Sean Butler", "Hamady N'Diaye", \
             "E'Twaun Moore", "Kyle O'Quinn", 'Tim Hardaway Jr.', 'Glen Rice Jr.', 'T.J. Warren', \
                      'P.J. Hairston', 'C.J. Wilcox', 'K.J. McDaniels', "Johnny O'Bryant", \
                      "D'Angelo Russell", 'R.J. Hunter', "Sir'Dominic Pointer", 'J.P. Tokoto', \
                      "DeAndre' Bembry", 'A.J. Hammons', "De'Aaron Fox", 'D.J. Wilson', \
                      'T.J. Leaf', "Devonte' Graham", "De'Anthony Melton", "De'Andre Hunter", \
                      'P.J. Washington', "Jahmi'us Ramsey", "Day'Ron Sharpe", "E.J. Liddell", \
                      ]
                         
edge_case_player_dtc = {
    'Donté Greene': 'donte-greene-1',
    'Patrick Ewing': 'pat-ewing-1',
    'Henry Walker': 'bill-walker-1',
    'Jeff Ayres': 'jeff-pendergraph-1',
    'Wesley Johnson': 'wes-johnson-1',
    'Anthony Davis': 'anthony-davis-5',
    'Jeff Taylor': 'jeffery-taylor-1',
    'Bernard James': 'bernard-james--1',
    'Mike Scott': 'mike-scott-4',
    'Otto Porter Jr.': 'otto-porter-1',
    'Tony Mitchell': 'tony-mitchell-4',
    'Ryan Kelly': 'ryan-kelly-3',
    'James Ennis III': 'james-ennis-1',
    'Joe Harris': 'joe-harris--1',
    'Glenn Robinson III': 'glenn-robinson-2',
    'Devyn Marble': 'roy-devyn-marble-1',
    'Kelly Oubre Jr.': 'kelly-oubre-1',
    'Larry Nance Jr.': 'larry-nance-2',
    'Anthony Brown': 'anthony-brown-5',
    'Joe Young': 'joseph-young-1',
    'Aaron White': 'aaron-white-3',
    'Kay Felder': 'kahlil-felder-1',
    'Dennis Smith Jr.': 'dennis-smithjr-1',
    'Bam Adebayo': 'edrice-adebayo-1',
    'Wes Iwundu': 'wesley-iwundu-1',
    'Frank Mason III': 'frank-mason-1',
    'Sterling Brown': 'sterling-brown-2',
    'Kyle Anderson': 'kyle-anderson-3',
    'Marvin Bagley III': 'marvin-bagleyiii-1',
    'Jaren Jackson Jr.': 'jaren-jacksonjr-1',
    'Mo Bamba': 'mohamed-bamba-1',
    'Wendell Carter Jr.': 'wendell-carterjr-1',
    'Michael Porter Jr.': 'michael-porterjr-1',
    'Troy Brown Jr.': 'troy-brown-5',
    'Lonnie Walker IV': 'lonnie-walker-2',
    'Robert Williams': 'robert-williams-3',
    'Gary Trent Jr.': 'gary-trentjr-1',
    'Svi Mykhailiuk': 'sviatoslav-mykhailiuk-1',
    'Shake Milton': 'malik-milton-1',
    'Ray Spalding': 'raymond-spalding-1',
    'Ja Morant': 'temetrius-morant-1',
    'Cameron Johnson': 'cameron-johnson-4',
    'Kevin Porter Jr.': 'kevin-porterjr-1',
    'Nic Claxton': 'nicolas-claxton-1',
    'KZ Okpala': 'kezie-okpala-1',
    'Dewan Hernandez': 'dewan-huell-1',
    'Obi Toppin': 'obadiah-toppin-1',
    'Kira Lewis Jr.': 'kira-lewisjr-1',
    'Vernon Carey Jr.': 'vernon-careyjr-1',
    'Xavier Tillman Sr.': 'xavier-tillman-1',
    'Robert Woodard II': 'robert-woodard-2',
    'Paul Reed': 'paul-reed-5',
    'Trey Murphy III': 'trey-murphyiii-1',
    'Jalen Johnson': 'jalen-johnson-24',
    'Isaiah Jackson': 'isaiah-jackson-3',
    'Bones Hyland': 'nahshon-hyland-1',
    'Cam Thomas': 'cameron-thomas-1',
    'Herbert Jones': 'herb-jones-3',
    'Greg Brown III': 'greg-brown-9',
    'David Johnson': 'david-johnson-13',
    'Brandon Boston Jr.': 'brandon-bostonjr-1',
    'Jabari Smith Jr.': 'jabari-smith-2',
    'Johnny Davis': 'jonathan-davis-3',
    'Jalen Williams': 'jalen-williams-13',
    'Mark Williams': 'mark-williams-7',
    'Wendell Moore Jr.': 'wendell-moorejr-1',
    'Patrick Baldwin Jr.': 'patrick-baldwinjr-1',
    'TyTy Washington Jr.': 'tyty-washingtonjr-1',
    'Vince Williams Jr.': 'vince-williams-2',
    'Luc Mbah a Moute': 'luc-richard-mbah-a-moute-1',
    'Moussa Diabaté': 'moussa-diabate-1'
    
}

duplicate_players = ['Marcus Williams', 'Marcus Thornton', 'Justin Jackson']

## Part 2.5 - Gathering College Player Links

In [7]:
college_links = []

for row in range(len(draft_df)):
    
    # Season and Player Name
    s = draft_df.iloc[row,0]
    n = draft_df.iloc[row,3]
    
    # Skip players with no college links
    if (n not in high_school_players) & (n not in international_players) & (n not in g_league_players) \
    & (n not in no_record_players):
        
        # Find Link for players with -2 ending for college stats
        if n in two_players:
            link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-2.html'
        # Find link for players with periods and quotes in name
        elif n in punctuation_players:
            temp_n = re.sub(r'[^\w\s]', '', n)
            temp_n2 = re.sub(' ', '-', temp_n.lower())
            link = 'https://sports-reference.com/cbb/players/' + temp_n2 + '-1.html'
        # Find link for players with unusual links
        elif n in list(edge_case_player_dtc.keys()):
            link = 'https://sports-reference.com/cbb/players/' + edge_case_player_dtc[n] + '.html'
        elif n in duplicate_players:
            if (n == 'Marcus Williams') & (s == 2007):
                link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-1.html'
            elif (n == 'Marcus Williams') & (s == 2008):
                link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-2.html'
            elif (n == 'Marcus Thornton') & (s == 2010):
                link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-1.html'
            elif (n == 'Marcus Thornton') & (s == 2016):
                link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-3.html'            
            elif (n == 'Justin Jackson') & (s == 2018):
                link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-4.html'
            elif (n == 'Justin Jackson') & (s == 2019):
                link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-5.html'   
        else:
            link = 'https://sports-reference.com/cbb/players/' + re.sub(' ', '-', n.lower()) + '-1.html'
            
        # Adding links to list
        college_links.append(link)

## Part 2.6 - Using College Links to Get Stats

In [8]:
# List for College and International Players

names = []
colleges = []
games_played = []
games_started = []
minutes = []
fgm = []
fga = []
fgp = []
two_fgm = []
two_fga = []
two_fgp = []
three_fgm = []
three_fga = []
three_fgp = []
ftm = []
fta = []
ftp = []
orb = []
drb = []
trb = []
ast = []
stl = []
blk = []
tov = []
pf = []
pts = []
sos = []

In [9]:
for url in college_links:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content,"lxml")
    name = soup.find('h1').find('span').get_text()
    if name == 'Jabari Smith':
        name = 'Jabari Smith Jr.'
    table = soup.find_all('tfoot')[0]
    player_college_career_stats = table.find_all('td')
    player_college = player_college_career_stats[0].get_text()
    player_games_played = int(player_college_career_stats[2].get_text())
    player_games_started = int(player_college_career_stats[3].get_text())
    player_minutes = float(player_college_career_stats[4].get_text())
    player_fgm = float(player_college_career_stats[5].get_text())
    player_fga = float(player_college_career_stats[6].get_text())
    player_fgp = player_college_career_stats[7].get_text()
    if player_fgp == '':
        player_fgp = 0.0
    else:
        player_fgp = float(player_fgp)
    player_2fgm = float(player_college_career_stats[8].get_text())
    player_2fga = float(player_college_career_stats[9].get_text())
    player_2fgp = player_college_career_stats[10].get_text()
    if player_2fgp == '':
        player_2fgp = 0.0
    else:
        player_2fgp = float(player_2fgp)
    player_3fgm = float(player_college_career_stats[11].get_text())
    player_3fga = float(player_college_career_stats[12].get_text())
    player_3fgp = player_college_career_stats[13].get_text()
    if player_3fgp == '':
        player_3fgp = 0.0
    else:
        player_3fgp = float(player_3fgp)
    player_ftm = float(player_college_career_stats[14].get_text())
    player_fta = float(player_college_career_stats[15].get_text())
    player_ftp = player_college_career_stats[16].get_text()
    if player_ftp == '':
        player_ftp = 0.0
    else:
        player_ftp = float(player_ftp)
    player_orb = player_college_career_stats[17].get_text()
    if player_orb == '':
        player_orb = np.nan
    else:
        player_orb = float(player_orb)
    player_drb = player_college_career_stats[18].get_text()
    if player_drb == '':
        player_drb = np.nan
    else:
        player_drb = float(player_drb)
    player_trb = player_college_career_stats[19].get_text()
    if player_trb == '':
        player_trb = np.nan
    else:
        player_trb = float(player_trb)
    player_ast = player_college_career_stats[20].get_text()
    if player_ast == '':
        player_ast = 0
    else:
        player_ast = float(player_ast)
    player_stl = player_college_career_stats[21].get_text()
    if player_stl == '':
        player_stl = 0
    else:
        player_stl = float(player_stl)
    player_blk = player_college_career_stats[22].get_text()
    if player_blk == '':
        player_blk = 0
    else:
        player_blk = float(player_blk)
    player_tov = player_college_career_stats[23].get_text()
    if player_tov == '':
        player_tov = 0
    else:
        player_tov = float(player_tov)
    player_pf = player_college_career_stats[24].get_text()
    if player_pf == '':
        player_pf = 0
    else:
        player_pf = float(player_pf)
    player_pts = player_college_career_stats[25].get_text()
    if player_pts == '':
        player_pts = 0
    else:
        player_pts = float(player_pts)
    player_sos = player_college_career_stats[27].get_text()
    if player_sos == '':
        player_sos = np.nan
    else:
        player_sos = float(player_sos)
    
    # Add to Lists

    names.append(name)
    colleges.append(player_college) 
    games_played.append(player_games_played) 
    games_started.append(player_games_started) 
    minutes.append(player_minutes)
    fgm.append(player_fgm) 
    fga.append(player_fga) 
    fgp.append(player_fgp) 
    two_fgm.append(player_2fgm)
    two_fga.append(player_2fga)
    two_fgp.append(player_2fgp)
    three_fgm.append(player_3fgm) 
    three_fga.append(player_3fga) 
    three_fgp.append(player_3fgp) 
    ftm.append(player_ftm) 
    fta.append(player_fta)
    ftp.append(player_ftp) 
    orb.append(player_orb) 
    drb.append(player_drb) 
    trb.append(player_trb)
    ast.append(player_ast) 
    stl.append(player_stl) 
    blk.append(player_blk) 
    tov.append(player_tov) 
    pf.append(player_pf) 
    pts.append(player_pts)
    sos.append(player_sos) 

## Part 2.7 - Gathering International Player Links

In [10]:
# Identifying International links with no data or unusual hyperlinks

international_no_stats = ['Vladimir Radmanović', 'Tony Parker', 'Ousmane Cisse', 'Antonis Fotsis', \
                         'Nenê', 'Mario Kasun', 'Peter Fehse', 'Federico Kammerichs', 'Mladen Sekularac', \
                         'Darko Miličić', 'Ndudi Ebi', 'Leandro Barbosa', 'Szymon Szewczyk', \
                         'Malick Badiane', 'Xue Yuyang', 'Rafael Araújo', 'Andris Biedriņš', \
                         'Pavel Podkolzin', 'Peter John Ramos', 'Ha Seung-Jin', 'Sergei Lishouk', \
                         'Sergei Karaulov', 'Yaroslav Korolev', 'Francisco García', 'Ricky Sánchez', \
                         'Andray Blatche', 'Oleksiy Pecherov', 'Joel Freeland', 'Marcus Vinicius', \
                         'Lior Eliyahu', 'Vladimir Veremeenko', 'Cheikh Samb', 'Guillermo Díaz', \
                         'Ejike Ugboaja', 'Edin Bavčić', 'Yi Jianlian', 'Petteri Koponen', 'Kyrylo Fesenko', \
                         'Stanko Barać', 'Sun Yue', 'Stéphane Lasme', 'Brad Newley', 'Georgios Printezis', \
                         'Milovan Raković', 'Serge Ibaka', 'Nathan Jawai', 'Ante Tomić', \
                         'Byron Mullens', 'Sergiy Gladyr', 'Ahman Nivins', 'Greivis Vásquez', \
                         'Greivis Vásquez', 'Paulão Prestes', 'Ryan Richards', 'Nikola Vučević', \
                         'Jeremy Tyler', 'Dāvis Bertāns', 'Chukwudiebere Maduabum', \
                         'Tanguy Ngombo', 'Ádám Hanga', 'İlkan Karaman', 'Giannis Antetokounmpo', \
                         'Dennis Schröder', 'Jānis Timma', 'Dante Exum', 'Bruno Caboclo', \
                         'Nikola Jokić', 'Edy Tavares', 'Thanasis Antetokounmpo', \
                         'Vasilije Micić', 'Nemanja Dangubić', 'Juan Pablo Vaulet', 'Satnam Singh', \
                         'Dani Díez', 'Dimitrios Agravanis', 'Georgios Papagiannis', 'Ante Žižić', \
                         'Skal Labissière', 'Ivica Zubac', 'Rade Zagorac', 'Stephen Zimmerman', \
                         'Wang Zhelin', 'Anžejs Pasečņiks', 'Mouhamed Sene', 'Thon Maker', \
                         'Sasha Vezenkov', 'Isaac Bonga', 'Issuf Sanon', 'Luka Šamanić', \
                         'Didi Louzada', 'Alen Smailagić', 'Yam Madar', 'Juhann Begarin', \
                         'Balša Koprivica', 'Gui Santos', 'Nedžad Sinanović', 'Ognjen Jaramaz', 'Mehmet Okur']


international_dict = {
    'Enes Freedom': 'enes-kanter-1',
    'Willy Hernangómez': 'guillermo-hernangomez-1',
    'Juancho Hernangómez': 'juan-hernangomez-1',
    'Sekou Doumbouya': 'sekou-omar-doumbouya-1',
    'R.J. Hampton': 'rj-hampton-1'
}

In [11]:
international_links = []
international_seasons = []

for row in range(len(draft_df)):
    
    # Season and Player Name
    s = draft_df.iloc[row,0]
    n = draft_df.iloc[row,3]
    
    # Only look at international plauers
    if (n in international_players) & (n not in international_no_stats):
        international_seasons.append(s)
        # Find Link for players with -2 ending for college stats
        if n in list(international_dict.keys()):
            link = 'https://www.basketball-reference.com/international/players/' + international_dict[n] + '.html' 
        else:
            unicode = unidecode.unidecode(n)
            link = 'https://www.basketball-reference.com/international/players/' + re.sub(' ', '-', unicode.lower()) + '-1.html'
            
        # Adding links to list
        international_links.append(link)

## Part 2.8 - Using International Links to Get Stats

In [12]:
for i in range(len(international_links)):
    url = international_links[i]
    season = international_seasons[i]    
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content,"lxml")
    name = soup.find('h1').find('span').get_text().split('International')[0].strip()
    table = soup.find_all('tbody')[0]
    player_league_career_by_year = table.find_all('tr')

    player_college = 'International'
    player_games_played = 0
    player_games_started = 0
    player_minutes = 0
    player_fgm = 0
    player_fga = 0
    player_fgp = 0
    player_2fgm = 0
    player_2fga = 0
    player_2fgp = 0
    player_3fgm = 0
    player_3fga = 0
    player_3fgp = 0
    player_ftm = 0
    player_fta = 0
    player_ftp = 0
    player_orb = 0
    player_drb = 0
    player_trb = 0
    player_ast = 0
    player_stl = 0
    player_blk = 0
    player_tov = 0
    player_pf = 0
    player_pts = 0
    player_sos = 0

    for year in player_league_career_by_year:
        current_year = year.find('th').get_text()
        if len(current_year) > 4:
            current_year = int(current_year[:2] + current_year[-2:])
            if current_year == 1900:
                current_year = 2000
        else:
            current_year = int(current_year)
        if current_year < season:

            # Current Seasons Stats
            games_played_this_season = int(year.find_all('td')[3].get_text())
            minutes_this_season = float(year.find_all('td')[4].get_text())
            fgm_this_season = float(year.find_all('td')[5].get_text())
            fga_this_season = float(year.find_all('td')[6].get_text())
            three_fgm_this_season = float(year.find_all('td')[8].get_text())
            three_fga_this_season = float(year.find_all('td')[9].get_text())
            two_fgm_this_season = float(year.find_all('td')[11].get_text())
            two_fga_this_season = float(year.find_all('td')[12].get_text())
            ftm_this_season = float(year.find_all('td')[14].get_text())
            fta_this_season = float(year.find_all('td')[15].get_text())
            orb_this_season = float(year.find_all('td')[17].get_text())
            drb_this_season = float(year.find_all('td')[18].get_text())
            trb_this_season = float(year.find_all('td')[19].get_text())
            ast_this_season = float(year.find_all('td')[20].get_text())
            stl_this_season = float(year.find_all('td')[21].get_text())
            blk_this_season = float(year.find_all('td')[22].get_text())
            tov_this_season = float(year.find_all('td')[23].get_text())
            pf_this_season = float(year.find_all('td')[24].get_text())
            pts_this_season = float(year.find_all('td')[25].get_text())

            # Make a Running Total
            player_games_played = player_games_played + games_played_this_season
            player_minutes = player_minutes + minutes_this_season * games_played_this_season
            player_fgm = player_fgm + fgm_this_season * games_played_this_season 
            player_fga = player_fga + fga_this_season * games_played_this_season
            player_2fgm = player_2fgm + two_fgm_this_season * games_played_this_season
            player_2fga = player_2fga + two_fga_this_season * games_played_this_season
            player_3fgm = player_3fgm + three_fgm_this_season * games_played_this_season
            player_3fga = player_3fga + three_fga_this_season * games_played_this_season
            player_ftm = player_ftm + ftm_this_season * games_played_this_season
            player_fta = player_fta + fta_this_season * games_played_this_season
            player_orb = player_orb + orb_this_season * games_played_this_season
            player_drb = player_drb + drb_this_season * games_played_this_season
            player_trb = player_trb + trb_this_season * games_played_this_season
            player_ast = player_ast + ast_this_season * games_played_this_season
            player_stl = player_stl + stl_this_season * games_played_this_season
            player_blk = player_blk + blk_this_season * games_played_this_season
            player_tov = player_tov + tov_this_season * games_played_this_season
            player_pf = player_pf + pf_this_season * games_played_this_season
            player_pts = player_pts + pts_this_season * games_played_this_season

    # Finding per game averages
    player_games_started_per = np.nan
    player_minutes_per = player_minutes / player_games_played
    player_fgm_per = player_fgm / player_games_played
    player_fga_per = player_fga / player_games_played
    if (player_fgm_per == 0) & (player_fga_per == 0):
        player_fgp_per = 0.0
    else: 
        player_fgp_per = player_fgm_per / player_fga_per
    player_2fgm_per = player_2fgm / player_games_played
    player_2fga_per = player_2fga / player_games_played
    if (player_2fgm_per == 0) & (player_2fga_per == 0):
        player_2fgp_per = 0.0
    else: 
        player_2fgp_per = player_2fgm_per / player_2fga_per
    player_3fgm_per = player_3fgm / player_games_played
    player_3fga_per = player_3fga / player_games_played
    if (player_3fgm_per == 0) & (player_3fga_per == 0):
        player_3fgp_per = 0.0
    else: 
        player_3fgp_per = player_3fgm_per / player_3fga_per
    player_ftm_per = player_ftm / player_games_played
    player_fta_per = player_fta / player_games_played
    if (player_ftm_per == 0) & (player_fta_per == 0):
        player_ftp_per = 0.0
    else: 
        player_ftp_per = player_ftm_per / player_fta_per
    player_orb_per = player_orb / player_games_played
    player_drb_per = player_drb / player_games_played
    player_trb_per = player_trb / player_games_played
    player_ast_per = player_ast / player_games_played
    player_stl_per = player_stl / player_games_played
    player_blk_per = player_blk / player_games_played
    player_tov_per = player_tov / player_games_played
    player_pf_per = player_pf / player_games_played
    player_pts_per = player_pts / player_games_played
    player_sos_per = np.nan

    # Adding to lists
    names.append(name)
    colleges.append(player_college) 
    games_played.append(player_games_played) 
    games_started.append(player_games_started_per) 
    minutes.append(player_minutes_per)
    fgm.append(player_fgm_per) 
    fga.append(player_fga_per) 
    fgp.append(player_fgp_per) 
    two_fgm.append(player_2fgm_per)
    two_fga.append(player_2fga_per)
    two_fgp.append(player_2fgp_per)
    three_fgm.append(player_3fgm_per) 
    three_fga.append(player_3fga_per) 
    three_fgp.append(player_3fgp_per) 
    ftm.append(player_ftm_per) 
    fta.append(player_fta_per)
    ftp.append(player_ftp_per) 
    orb.append(player_orb_per) 
    drb.append(player_drb_per) 
    trb.append(player_trb_per)
    ast.append(player_ast_per) 
    stl.append(player_stl_per) 
    blk.append(player_blk_per) 
    tov.append(player_tov_per) 
    pf.append(player_pf_per) 
    pts.append(player_pts_per)
    sos.append(player_sos_per) 

## Part 2.9 - Gathering G-League Links

In [13]:
g_league_player_links = {
    'Jalen Green': 'https://basketball.realgm.com/player/Jalen-Green/D-League/117342',
    'Jonathan Kuminga': 'https://basketball.realgm.com/player/Jonathan-Kuminga/D-League/127826/2021/By_Season/Per_Game/Regular_Season',
    'Isaiah Todd': 'https://basketball.realgm.com/player/Isaiah-Todd/D-League/117340/2021/By_Season/Per_Game/Regular_Season',
    'Jaden Hardy': 'https://basketball.realgm.com/player/Jaden-Hardy/D-League/126838/2022/By_Season/Per_Game/Full%20Season',
    'Dyson Daniels': 'https://basketball.realgm.com/player/Dyson-Daniels/D-League/151792/2022/By_Season/Per_Game/Full%20Season' 
}

## Part 2.10 - Using G-League Links to Get Stats

In [14]:
for name in list(g_league_player_links.keys()):
    url = g_league_player_links[name]   
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content,"lxml")
    table = soup.find_all('tbody')[0]
    player_league_career_by_year = table.find_all('tr')[0]
    player_college = 'GLeague'
    player_games_played = int(player_league_career_by_year.find_all('td')[2].get_text())
    player_games_started_per = int(player_league_career_by_year.find_all('td')[3].get_text())
    player_minutes_per = float(player_league_career_by_year.find_all('td')[4].get_text())
    player_pts_per = float(player_league_career_by_year.find_all('td')[5].get_text())
    player_fgm_per = float(player_league_career_by_year.find_all('td')[6].get_text())
    player_fga_per = float(player_league_career_by_year.find_all('td')[7].get_text())
    player_fgp_per = float(player_league_career_by_year.find_all('td')[8].get_text())
    player_2fgm_per = np.nan
    player_2fga_per = np.nan
    player_2fgp_per = np.nan
    player_3fgm_per = float(player_league_career_by_year.find_all('td')[9].get_text())
    player_3fga_per = float(player_league_career_by_year.find_all('td')[10].get_text())
    player_3fgp_per = float(player_league_career_by_year.find_all('td')[11].get_text())
    player_ftm_per = float(player_league_career_by_year.find_all('td')[12].get_text())
    player_fta_per = float(player_league_career_by_year.find_all('td')[13].get_text())
    player_ftp_per = float(player_league_career_by_year.find_all('td')[14].get_text())
    player_orb_per = float(player_league_career_by_year.find_all('td')[15].get_text())
    player_drb_per = float(player_league_career_by_year.find_all('td')[16].get_text())
    player_trb_per = float(player_league_career_by_year.find_all('td')[17].get_text())
    player_ast_per = float(player_league_career_by_year.find_all('td')[18].get_text())
    player_stl_per = float(player_league_career_by_year.find_all('td')[19].get_text())
    player_blk_per = float(player_league_career_by_year.find_all('td')[20].get_text())
    player_tov_per = float(player_league_career_by_year.find_all('td')[21].get_text())
    player_pf_per = float(player_league_career_by_year.find_all('td')[22].get_text())
    player_sos_per = np.nan
    
    # Adding G league stats 
    names.append(name)
    colleges.append(player_college) 
    games_played.append(player_games_played) 
    games_started.append(player_games_started_per) 
    minutes.append(player_minutes_per)
    fgm.append(player_fgm_per) 
    fga.append(player_fga_per) 
    fgp.append(player_fgp_per) 
    two_fgm.append(player_2fgm_per)
    two_fga.append(player_2fga_per)
    two_fgp.append(player_2fgp_per)
    three_fgm.append(player_3fgm_per) 
    three_fga.append(player_3fga_per) 
    three_fgp.append(player_3fgp_per) 
    ftm.append(player_ftm_per) 
    fta.append(player_fta_per)
    ftp.append(player_ftp_per) 
    orb.append(player_orb_per) 
    drb.append(player_drb_per) 
    trb.append(player_trb_per)
    ast.append(player_ast_per) 
    stl.append(player_stl_per) 
    blk.append(player_blk_per) 
    tov.append(player_tov_per) 
    pf.append(player_pf_per) 
    pts.append(player_pts_per)
    sos.append(player_sos_per)

## Part 2.11 - Combining Pre-Draft Stats

In [15]:
draft_df['Player_Name'] = [unidecode.unidecode(n) for n in draft_df['Player_Name']]
draft_df

Unnamed: 0,Season,Player_Pick,Player_Team,Player_Name
0,2002,1,WAS,Kwame Brown
1,2002,2,LAC,Tyson Chandler
2,2002,3,ATL,Pau Gasol
3,2002,4,CHI,Eddy Curry
4,2002,5,GSW,Jason Richardson
...,...,...,...,...
1304,2023,54,WAS,Yannick Nzosa
1305,2023,55,GSW,Gui Santos
1306,2023,56,CLE,Luke Travers
1307,2023,57,POR,Jabari Walker


In [16]:
draft_history_df = pd.DataFrame({
    'Player_Name': names, 
    'Player_College': colleges, 
    'Player_College_Games_Played': games_played, 
    'Player_College_Games_Started': games_started, 
    'Player_College_MPG': minutes, 
    'Player_College_FGM': fgm, 
    'Player_College_FGA': fga, 
    'Player_College_FG%': fgp, 
    'Player_College_2FGM': two_fgm,
    'Player_College_2FGA': two_fga, 
    'Player_College_2FG%': two_fgp, 
    'Player_College_3FGM': three_fgm,
    'Player_College_3FGA': three_fga,
    'Player_College_3FG%': three_fgp, 
    'Player_College_FTM': ftm, 
    'Player_College_FTA': fta,
    'Player_College_FT%': ftp, 
    'Player_College_ORB': orb, 
    'Player_College_DRB': drb, 
    'Player_College_TRB': trb, 
    'Player_College_AST': ast,
    'Player_College_STL': stl,
    'Player_College_BLK': blk, 
    'Player_College_TO': tov, 
    'Player_College_PF': pf, 
    'Player_College_PTS': pts, 
    'Player_College_SOS': sos 
})
draft_history_df

Unnamed: 0,Player_Name,Player_College,Player_College_Games_Played,Player_College_Games_Started,Player_College_MPG,Player_College_FGM,Player_College_FGA,Player_College_FG%,Player_College_2FGM,Player_College_2FGA,...,Player_College_ORB,Player_College_DRB,Player_College_TRB,Player_College_AST,Player_College_STL,Player_College_BLK,Player_College_TO,Player_College_PF,Player_College_PTS,Player_College_SOS
0,Jason Richardson,Michigan State,70,35.0,21.7,3.7,7.4,0.503,2.9,5.3,...,,,5.0,1.4,0.8,0.5,1.1,1.9,9.6,9.73
1,Shane Battier,Duke,146,120.0,29.7,4.5,8.9,0.500,2.8,4.9,...,,,6.1,1.6,1.8,1.7,1.1,2.0,13.6,10.07
2,Eddie Griffin,Seton Hall,30,30.0,32.6,6.9,16.0,0.429,5.5,11.7,...,3.1,7.6,10.8,1.6,0.9,4.4,2.2,2.4,17.8,6.03
3,Rodney White,Charlotte,28,26.0,30.9,6.7,13.8,0.487,5.5,10.4,...,1.8,4.7,6.5,1.5,1.2,0.7,2.9,2.6,18.7,7.91
4,Joe Johnson,Arkansas,53,41.0,30.3,5.7,12.2,0.466,4.5,9.3,...,,,6.1,2.4,1.7,0.5,2.5,1.7,15.0,6.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1167,Jalen Green,GLeague,15,15.0,32.0,6.3,13.6,0.461,,,...,0.5,3.5,4.1,2.8,1.5,0.3,2.7,1.7,17.9,
1168,Jonathan Kuminga,GLeague,13,13.0,32.8,5.5,14.3,0.387,,,...,1.2,6.0,7.2,2.7,1.0,0.8,2.6,2.1,15.8,
1169,Isaiah Todd,GLeague,15,2.0,24.4,4.9,11.1,0.437,,,...,0.8,4.1,4.9,0.8,0.5,0.7,1.5,2.5,12.3,
1170,Jaden Hardy,GLeague,25,25.0,32.6,6.9,18.3,0.376,,,...,0.6,3.7,4.3,3.6,1.2,0.2,3.4,2.4,19.5,


In [17]:
draft_df_final = draft_df.merge(draft_history_df, how = 'left', left_on = 'Player_Name', right_on = 'Player_Name')
draft_df_final = draft_df_final.dropna(subset = ['Player_College']).drop(['Player_College_Games_Started', 'Player_College_ORB', 'Player_College_DRB', 'Player_College_SOS'], axis = 1)
draft_df_final

Unnamed: 0,Season,Player_Pick,Player_Team,Player_Name,Player_College,Player_College_Games_Played,Player_College_MPG,Player_College_FGM,Player_College_FGA,Player_College_FG%,...,Player_College_FTM,Player_College_FTA,Player_College_FT%,Player_College_TRB,Player_College_AST,Player_College_STL,Player_College_BLK,Player_College_TO,Player_College_PF,Player_College_PTS
2,2002,3,ATL,Pau Gasol,International,64.0,19.781250,3.246875,5.828125,0.557105,...,1.946875,3.268750,0.595602,4.087500,0.659375,0.862500,0.800000,1.025000,1.500000,8.878125
4,2002,5,GSW,Jason Richardson,Michigan State,70.0,21.700000,3.700000,7.400000,0.503000,...,1.400000,2.100000,0.649000,5.000000,1.400000,0.800000,0.500000,1.100000,1.900000,9.600000
5,2002,6,MEM,Shane Battier,Duke,146.0,29.700000,4.500000,8.900000,0.500000,...,3.000000,3.800000,0.777000,6.100000,1.600000,1.800000,1.700000,1.100000,2.000000,13.600000
6,2002,7,BKN,Eddie Griffin,Seton Hall,30.0,32.600000,6.900000,16.000000,0.429000,...,2.700000,3.600000,0.734000,10.800000,1.600000,0.900000,4.400000,2.200000,2.400000,17.800000
8,2002,9,DET,Rodney White,Charlotte,28.0,30.900000,6.700000,13.800000,0.487000,...,4.100000,5.700000,0.713000,6.500000,1.500000,1.200000,0.700000,2.900000,2.600000,18.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,2023,53,BOS,JD Davison,Alabama,33.0,25.800000,3.000000,6.500000,0.463000,...,1.800000,2.500000,0.728000,4.800000,4.300000,1.000000,0.400000,2.900000,1.200000,8.500000
1310,2023,54,WAS,Yannick Nzosa,International,68.0,11.408824,1.516176,2.852941,0.531443,...,0.438235,0.595588,0.735802,2.375000,0.257353,0.404412,0.754412,0.632353,2.001471,3.427941
1312,2023,56,CLE,Luke Travers,International,71.0,15.802817,2.295775,4.925352,0.466114,...,0.888732,1.229577,0.722795,3.957746,1.454930,0.584507,0.491549,1.171831,0.905634,5.800000
1313,2023,57,POR,Jabari Walker,Colorado,59.0,22.000000,3.900000,8.100000,0.479000,...,2.700000,3.500000,0.783000,7.200000,0.900000,0.600000,0.600000,1.800000,2.300000,11.500000


## Part 2.12 - Getting Team Stats 

In [18]:
advanced_team_dict = {
 'Atlanta': 'ATL',
 'Boston': 'BOS',
 'Charlotte': 'CHA',
 'Chicago': 'CHI',
 'Cleveland': 'CLE',
 'Dallas': 'DAL',
 'Denver': 'DEN',
 'Detroit': 'DET',
 'Golden State': 'GSW',
 'Houston': 'HOU',
 'Indiana': 'IND',
 'LA Clippers': 'LAC',
 'LA Lakers': 'LAL',
 'Miami': 'MIA',
 'Milwaukee': 'MIL',
 'Minnesota': 'MIN',
 'New Jersey': 'BKN',
 'New York': 'NYK',
 'Orlando': 'ORL',
 'Philadelphia': 'PHI',
 'Phoenix': 'PHX',
 'Portland': 'POR',
 'Sacramento': 'SAC',
 'San Antonio': 'SAS',
 'Seattle': 'OKC',
 'Utah': 'UTA',
 'Washington': 'WAS',
 'Toronto': 'TOR',
 'Vancouver': 'MEM',
 'Memphis': 'MEM',
 'New Orleans': 'NOP',
 'NO/Oklahoma City\r\n': 'NOP',
 'NO/Oklahoma City': 'NOP',
 'Oklahoma City': 'OKC',
 'Brooklyn': 'BKN'
}

In [19]:
advanced_urls = []
url_base = 'http://www.espn.com/nba/hollinger/teamstats/_/sort/paceFactor/year/' 
for j in range(2001,2023): #2017 --> 2016-2017
    advanced_urls.append(url_base + str(j))
    
advanced_dict = {
    'Season': [],
    'Player_Team': [],
    'Team_Pace': [],
    'Team_AST': [],
    'Team_TO': [],
    'Team_ORR': [],
    'Team_DRR': [],
    'Team_REBR': [],
    'Team_EFG': [],
    'Team_TS%': [],
    'Team_OEF': [],
    'Team_DEF': []   
}
advanced_df = pd.DataFrame(advanced_dict)

for year_url in advanced_urls:
    year_string = int(year_url[-4:])
   
    year_page = requests.get(year_url, headers=headers)
    year_soup = BeautifulSoup(year_page.content,"lxml")
    year_html = year_soup.find('table', {'class':'tablehead'}).find_all('tr')[2:]
    for tr in year_html:
        stats = tr.find_all('td')
        current_team = advanced_team_dict[stats[1].get_text()]
        team_dict = {
            'Season': year_string,
            'Player_Team': current_team,
            'Team_Pace': float(stats[2].get_text()),
            'Team_AST': float(stats[3].get_text()),
            'Team_TO': float(stats[4].get_text()),
            'Team_ORR': float(stats[5].get_text()),
            'Team_DRR': float(stats[6].get_text()),
            'Team_REBR': float(stats[7].get_text()),
            'Team_EFG': float(stats[8].get_text()),
            'Team_TS%': float(stats[9].get_text()),
            'Team_OEF': float(stats[10].get_text()),
            'Team_DEF': float(stats[11].get_text())   
        }
        team_df = pd.DataFrame(team_dict, index = [0])
        advanced_df = pd.concat([advanced_df, team_df], ignore_index = True)
        advanced_df.reset_index()
        
display(advanced_df)        

Unnamed: 0,Season,Player_Team,Team_Pace,Team_AST,Team_TO,Team_ORR,Team_DRR,Team_REBR,Team_EFG,Team_TS%,Team_OEF,Team_DEF
0,2001.0,ATL,0.0,13.3,26.8,0.0,0.0,0.0,45.6,50.0,81.3,0.0
1,2001.0,BOS,0.0,14.6,25.7,0.0,0.0,0.0,47.3,52.1,85.1,0.0
2,2001.0,CHA,0.0,16.2,24.1,0.0,0.0,0.0,45.7,50.6,85.9,0.0
3,2001.0,CHI,0.0,15.5,26.3,0.0,0.0,0.0,45.0,49.5,80.6,0.0
4,2001.0,CLE,0.0,14.4,26.7,0.0,0.0,0.0,45.9,50.9,83.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
651,2022.0,PHI,98.4,18.1,11.6,20.1,0.0,48.9,53.4,57.8,111.0,108.3
652,2022.0,CLE,98.4,18.8,13.2,24.0,0.0,50.8,53.8,57.1,109.4,107.0
653,2022.0,NYK,98.4,16.6,12.0,25.1,0.0,51.4,51.3,55.0,108.0,107.6
654,2022.0,MIA,98.2,19.0,13.4,23.5,0.0,51.2,54.7,58.4,111.2,106.3


In [20]:
# Add empty stats for 2023

for t in list(advanced_df.Player_Team.unique()):
    team_dict = {
        'Season': 2023,
        'Player_Team': t,
        'Team_Pace': 0,
        'Team_AST': 0,
        'Team_TO': 0,
        'Team_ORR': 0,
        'Team_DRR': 0,
        'Team_REBR': 0,
        'Team_EFG': 0,
        'Team_TS%': 0,
        'Team_OEF': 0,
        'Team_DEF': 0   
    }
    team_df = pd.DataFrame(team_dict, index = [0])
    advanced_df = pd.concat([advanced_df, team_df], ignore_index = True)
    advanced_df.reset_index()
display(advanced_df)  

Unnamed: 0,Season,Player_Team,Team_Pace,Team_AST,Team_TO,Team_ORR,Team_DRR,Team_REBR,Team_EFG,Team_TS%,Team_OEF,Team_DEF
0,2001.0,ATL,0.0,13.3,26.8,0.0,0.0,0.0,45.6,50.0,81.3,0.0
1,2001.0,BOS,0.0,14.6,25.7,0.0,0.0,0.0,47.3,52.1,85.1,0.0
2,2001.0,CHA,0.0,16.2,24.1,0.0,0.0,0.0,45.7,50.6,85.9,0.0
3,2001.0,CHI,0.0,15.5,26.3,0.0,0.0,0.0,45.0,49.5,80.6,0.0
4,2001.0,CLE,0.0,14.4,26.7,0.0,0.0,0.0,45.9,50.9,83.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
681,2023.0,UTA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
682,2023.0,WAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
683,2023.0,TOR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
684,2023.0,MEM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Part 3 - Lagging Data For Each Player

In [21]:
# Fix unicode issues
player_info_df['Player_Name'] = [unidecode.unidecode(n) for n in player_info_df['Player_Name']]

In [22]:
# Gathering indexes 1) to remove multiple rows when a player is trades and only look at totals 2) to remove totals and 
# look at individual totals for each team played for

totals = player_info_df[player_info_df['Player_Team'] == 'TOT']
totals_combos = []
for i in range(len(totals)):
    totals_combos.append([totals.iloc[i,0], totals.iloc[i,1]])
    
history_indexes_to_remove = []
team_indexes_to_remove = [] # Not needed for right now
for j in range(len(player_info_df)):
    year = player_info_df.iloc[j,0]
    name = player_info_df.iloc[j,1]
    tuple_1 = [year , name]
    if tuple_1 in totals_combos:
        team = player_info_df.iloc[j,5]
        if team != 'TOT':
            history_indexes_to_remove.append(j)
        else:
            team_indexes_to_remove.append(j) # Not needed for right now
historics_df = player_info_df.iloc[~player_info_df.index.isin(history_indexes_to_remove)]
teams_df = player_info_df.iloc[~player_info_df.index.isin(team_indexes_to_remove)]

In [23]:
## Add in 2 Year Lags 

entities_to_lag = list(historics_df.columns[4:])
entities_to_lag.remove('Player_Team')
for e in entities_to_lag:
    historics_df[f'1_year_ago_{e}'] = historics_df.groupby('Player_Name')[e].shift(1)
    historics_df[f'2_years_ago_{e}'] = historics_df.groupby('Player_Name')[e].shift(2)

# Part 4 - Positional Averages Last Year

In [24]:
Player_Age_F = []
Player_Age_G = []
Player_Age_C = []
Player_Games_Played_F = []
Player_Games_Played_G = []
Player_Games_Played_C = []
Player_Games_Started_F = []
Player_Games_Started_G = []
Player_Games_Started_C = []
Player_Minutes_F = []
Player_Minutes_G = []
Player_Minutes_C = []
Player_Points_F = []
Player_Points_G = []
Player_Points_C = []
Player_FGM_F = []
Player_FGM_G = []
Player_FGM_C = []
Player_FGA_F = []
Player_FGA_G = []
Player_FGA_C = []
Player_FGperc_F = []
Player_FGperc_G = []
Player_FGperc_C = []
Player_3FGM_F = []
Player_3FGM_G = []
Player_3FGM_C = []
Player_3FGA_F = []
Player_3FGA_G = []
Player_3FGA_C = []
Player_3FGperc_F = []
Player_3FGperc_G = []
Player_3FGperc_C = []
Player_2FGM_F = []
Player_2FGM_G = []
Player_2FGM_C = []
Player_2FGA_F = []
Player_2FGA_G = []
Player_2FGA_C = []
Player_2FGperc_F = []
Player_2FGperc_G = []
Player_2FGperc_C = []
Player_EFGperc_F = []
Player_EFGperc_G = []
Player_EFGperc_C = []
Player_FTM_F = []
Player_FTM_G = []
Player_FTM_C = []
Player_FTA_F = []
Player_FTA_G = []
Player_FTA_C = []
Player_FTperc_F = []
Player_FTperc_G = []
Player_FTperc_C = []
Player_ORB_F = []
Player_ORB_G = []
Player_ORB_C = []
Player_DRB_F = []
Player_DRB_G = []
Player_DRB_C = []
Player_TRB_F = []
Player_TRB_G = []
Player_TRB_C = []
Player_Ast_F = []
Player_Ast_G = []
Player_Ast_C = []
Player_STL_F = []
Player_STL_G = []
Player_STL_C = []
Player_BLK_F = []
Player_BLK_G = []
Player_BLK_C = []
Player_TO_F = []
Player_TO_G = []
Player_TO_C = []
Player_PF_F = []
Player_PF_G = []
Player_PF_C = []
Player_Pts_F = []
Player_Pts_G = []
Player_Pts_C = []

In [25]:
for r in range(len(historics_df)):
    sample_row_name = historics_df.iloc[r,1]
    sample_row_current_season = historics_df.iloc[r,0] 
    sample_row_current_team = historics_df.iloc[r,5]
    sample_row_current_players = list(historics_df[(historics_df['Player_Team'] == sample_row_current_team) & \
                                              (historics_df['Season'] == sample_row_current_season)]['Player_Name'])
    sample_row_current_players.remove(sample_row_name)
    sample_row_prior_year_df = historics_df[(historics_df['Player_Name'].isin(sample_row_current_players)) & \
                                           (historics_df['Season'] == sample_row_current_season - 1)].iloc[:,:32]

    sample_row_prior_year_df['Positional_Games'] = sample_row_prior_year_df['Player_Games_Played'].groupby(sample_row_prior_year_df['Player_Position']).transform('sum')
    sample_row_prior_year_df['Player_Weight'] = sample_row_prior_year_df['Player_Games_Played'] / sample_row_prior_year_df['Positional_Games']
    index = 0
    for e in entities_to_lag:
        for p in ('F', 'G', 'C'):
            sample_row_prior_year_df_position = sample_row_prior_year_df[sample_row_prior_year_df['Player_Position'] == p]
            value = sum(sample_row_prior_year_df_position[e] * sample_row_prior_year_df_position['Player_Weight'])
            if index == 0:
                Player_Age_F.append(value)
            elif index == 1:
                Player_Age_G.append(value)
            elif index == 2:
                Player_Age_C.append(value)
            elif index == 3:
                Player_Games_Played_F.append(value)
            elif index == 4:
                Player_Games_Played_G.append(value)
            elif index == 5:
                Player_Games_Played_C.append(value)
            elif index == 6:
                Player_Games_Started_F.append(value)
            elif index == 7:
                Player_Games_Started_G.append(value)
            elif index == 8:
                Player_Games_Started_C.append(value)
            elif index == 9:
                Player_Minutes_F.append(value)
            elif index == 10:
                Player_Minutes_G.append(value)
            elif index == 11:
                Player_Minutes_C.append(value)
            elif index == 12:
                Player_Points_F.append(value)
            elif index == 13:
                Player_Points_G.append(value)
            elif index == 14:
                Player_Points_C.append(value)
            elif index == 15:
                Player_FGM_F.append(value)
            elif index == 16:
                Player_FGM_G.append(value)
            elif index == 17:
                Player_FGM_C.append(value)
            elif index == 18:
                Player_FGA_F.append(value)
            elif index == 19:
                Player_FGA_G.append(value)
            elif index == 20:
                Player_FGA_C.append(value)
            elif index == 21:
                Player_FGperc_F.append(value)
            elif index == 22:
                Player_FGperc_G.append(value)
            elif index == 23:
                Player_FGperc_C.append(value)
            elif index == 24:
                Player_3FGM_F.append(value)
            elif index == 25:
                Player_3FGM_G.append(value)
            elif index == 26:
                Player_3FGM_C.append(value)
            elif index == 27:
                Player_3FGA_F.append(value)
            elif index == 28:
                Player_3FGA_G.append(value)
            elif index == 29:
                Player_3FGA_C.append(value)
            elif index == 30:
                Player_3FGperc_F.append(value)
            elif index == 31:
                Player_3FGperc_G.append(value)
            elif index == 32:
                Player_3FGperc_C.append(value)
            elif index == 33:
                Player_2FGM_F.append(value)
            elif index == 34:
                Player_2FGM_G.append(value)
            elif index == 35:
                Player_2FGM_C.append(value)
            elif index == 36:
                Player_2FGA_F.append(value)
            elif index == 37:
                Player_2FGA_G.append(value)
            elif index == 38:
                Player_2FGA_C.append(value)
            elif index == 39:
                Player_2FGperc_F.append(value)
            elif index == 40:
                Player_2FGperc_G.append(value)
            elif index == 41:
                Player_2FGperc_C.append(value)
            elif index == 42:
                Player_EFGperc_F.append(value)
            elif index == 43:
                Player_EFGperc_G.append(value)
            elif index == 44:
                Player_EFGperc_C.append(value)
            elif index == 45:
                Player_FTM_F.append(value)
            elif index == 46:
                Player_FTM_G.append(value)
            elif index == 47:
                Player_FTM_C.append(value)
            elif index == 48:
                Player_FTA_F.append(value)
            elif index == 49:
                Player_FTA_G.append(value)
            elif index == 50:
                Player_FTA_C.append(value)
            elif index == 51:
                Player_FTperc_F.append(value)
            elif index == 52:
                Player_FTperc_G.append(value)
            elif index == 53:
                Player_FTperc_C.append(value)
            elif index == 54:
                Player_ORB_F.append(value)
            elif index == 55:
                Player_ORB_G.append(value)
            elif index == 56:
                Player_ORB_C.append(value)
            elif index == 57:
                Player_DRB_F.append(value)
            elif index == 58:
                Player_DRB_G.append(value)
            elif index == 59:
                Player_DRB_C.append(value)
            elif index == 60:
                Player_TRB_F.append(value)
            elif index == 61:
                Player_TRB_G.append(value)
            elif index == 62:
                Player_TRB_C.append(value)
            elif index == 63:
                Player_Ast_F.append(value)
            elif index == 64:
                Player_Ast_G.append(value)
            elif index == 65:
                Player_Ast_C.append(value)
            elif index == 66:
                Player_STL_F.append(value)
            elif index == 67:
                Player_STL_G.append(value)
            elif index == 68:
                Player_STL_C.append(value)
            elif index == 69:
                Player_BLK_F.append(value)
            elif index == 70:
                Player_BLK_G.append(value)
            elif index == 71:
                Player_BLK_C.append(value)
            elif index == 72:
                Player_TO_F.append(value)
            elif index == 73:
                Player_TO_G.append(value)
            elif index == 74:
                Player_TO_C.append(value)
            elif index == 75:
                Player_PF_F.append(value)
            elif index == 76:
                Player_PF_G.append(value)
            elif index == 77:
                Player_PF_C.append(value)
            elif index == 78:
                Player_Pts_F.append(value)
            elif index == 79:
                Player_Pts_G.append(value)
            elif index == 80:
                Player_Pts_C.append(value)
            index = index + 1

In [26]:
historics_df['Player_Age_F'] = Player_Age_F
historics_df['Player_Age_G'] = Player_Age_G
historics_df['Player_Age_C'] = Player_Age_C
historics_df['Player_Games_Played_F'] = Player_Games_Played_F
historics_df['Player_Games_Played_G'] = Player_Games_Played_G
historics_df['Player_Games_Played_C'] = Player_Games_Played_C
historics_df['Player_Games_Started_F'] = Player_Games_Started_F
historics_df['Player_Games_Started_G'] = Player_Games_Started_G
historics_df['Player_Games_Started_C'] = Player_Games_Started_C
historics_df['Player_Minutes_F'] = Player_Minutes_F
historics_df['Player_Minutes_G'] = Player_Minutes_G
historics_df['Player_Minutes_C'] = Player_Minutes_C
historics_df['Player_Points_F'] = Player_Points_F
historics_df['Player_Points_G'] = Player_Points_G
historics_df['Player_Points_C'] = Player_Points_C
historics_df['Player_FGM_F'] = Player_FGM_F
historics_df['Player_FGM_G'] = Player_FGM_G
historics_df['Player_FGM_C'] = Player_FGM_C
historics_df['Player_FGA_F'] = Player_FGA_F
historics_df['Player_FGA_G'] = Player_FGA_G
historics_df['Player_FGA_C'] = Player_FGA_C
historics_df['Player_FGperc_F'] = Player_FGperc_F
historics_df['Player_FGperc_G']= Player_FGperc_G
historics_df['Player_FGperc_C'] = Player_FGperc_C
historics_df['Player_3FGM_F'] = Player_3FGM_F
historics_df['Player_3FGM_G'] = Player_3FGM_G
historics_df['Player_3FGM_C'] = Player_3FGM_C
historics_df['Player_3FGA_F'] = Player_3FGA_F
historics_df['Player_3FGA_G'] = Player_3FGA_G
historics_df['Player_3FGA_C'] = Player_3FGA_C
historics_df['Player_3FGperc_F'] = Player_3FGperc_F
historics_df['Player_3FGperc_G'] = Player_3FGperc_G
historics_df['Player_3FGperc_C'] = Player_3FGperc_C
historics_df['Player_2FGM_F'] = Player_2FGM_F
historics_df['Player_2FGM_G'] = Player_2FGM_G
historics_df['Player_2FGM_C'] = Player_2FGM_C
historics_df['Player_2FGA_F'] = Player_2FGA_F
historics_df['Player_2FGA_G'] = Player_2FGA_G
historics_df['Player_2FGA_C'] = Player_2FGA_C
historics_df['Player_2FGperc_F'] = Player_2FGperc_F
historics_df['Player_2FGperc_G'] = Player_2FGperc_G
historics_df['Player_2FGperc_C'] = Player_2FGperc_C
historics_df['Player_EFGperc_F'] = Player_EFGperc_F
historics_df['Player_EFGperc_G'] = Player_EFGperc_G
historics_df['Player_EFGperc_C'] = Player_EFGperc_C
historics_df['Player_FTM_F'] = Player_FTM_F
historics_df['Player_FTM_G'] = Player_FTM_G
historics_df['Player_FTM_C'] = Player_FTM_C
historics_df['Player_FTA_F'] = Player_FTA_F
historics_df['Player_FTA_G'] = Player_FTA_G
historics_df['Player_FTA_C'] = Player_FTA_C
historics_df['Player_FTperc_F'] = Player_FTperc_F
historics_df['Player_FTperc_G'] = Player_FTperc_G
historics_df['Player_FTperc_C'] = Player_FTperc_C
historics_df['Player_ORB_F'] = Player_ORB_F
historics_df['Player_ORB_G'] = Player_ORB_G
historics_df['Player_ORB_C'] = Player_ORB_C
historics_df['Player_DRB_F'] = Player_DRB_F
historics_df['Player_DRB_G'] = Player_DRB_G
historics_df['Player_DRB_C'] = Player_DRB_C
historics_df['Player_TRB_F'] = Player_TRB_F
historics_df['Player_TRB_G'] = Player_TRB_G
historics_df['Player_TRB_C'] = Player_TRB_C
historics_df['Player_Ast_F'] = Player_Ast_F
historics_df['Player_Ast_G'] = Player_Ast_G
historics_df['Player_Ast_C'] = Player_Ast_C
historics_df['Player_STL_F'] = Player_STL_F
historics_df['Player_STL_G'] = Player_STL_G
historics_df['Player_STL_C'] = Player_STL_C
historics_df['Player_BLK_F'] = Player_BLK_F
historics_df['Player_BLK_G'] = Player_BLK_G
historics_df['Player_BLK_C'] = Player_BLK_C
historics_df['Player_TO_F'] = Player_TO_F
historics_df['Player_TO_G'] = Player_TO_G
historics_df['Player_TO_C'] = Player_TO_C
historics_df['Player_PF_F'] = Player_PF_F
historics_df['Player_PF_G'] = Player_PF_G
historics_df['Player_PF_C'] = Player_PF_C
historics_df['Player_Pts_F'] = Player_Pts_F
historics_df['Player_Pts_G'] = Player_Pts_G
historics_df['Player_Pts_C'] = Player_Pts_C

# Part 5 - Combining Historical Dataframe with Team Dataframe

In [27]:
team_stats = list(advanced_df.columns[2:])
for e in team_stats:
    advanced_df[f'1_year_ago_{e}'] = advanced_df.groupby(['Player_Team'])[e].shift(1)

In [28]:
past_team_stats = advanced_df.drop(['Team_Pace', 'Team_AST', 'Team_TO', 'Team_ORR', 'Team_DRR', 'Team_REBR',
       'Team_EFG', 'Team_TS%', 'Team_OEF', 'Team_DEF'], axis = 1)
past_team_stats

Unnamed: 0,Season,Player_Team,1_year_ago_Team_Pace,1_year_ago_Team_AST,1_year_ago_Team_TO,1_year_ago_Team_ORR,1_year_ago_Team_DRR,1_year_ago_Team_REBR,1_year_ago_Team_EFG,1_year_ago_Team_TS%,1_year_ago_Team_OEF,1_year_ago_Team_DEF
0,2001.0,ATL,,,,,,,,,,
1,2001.0,BOS,,,,,,,,,,
2,2001.0,CHA,,,,,,,,,,
3,2001.0,CHI,,,,,,,,,,
4,2001.0,CLE,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
681,2023.0,UTA,99.5,16.9,12.7,25.4,0.0,52.7,55.5,58.9,114.0,107.7
682,2023.0,WAS,99.3,18.7,12.1,20.9,0.0,49.4,53.2,56.8,108.9,111.7
683,2023.0,TOR,98.6,16.3,11.0,28.4,0.0,50.7,51.0,54.3,109.6,107.9
684,2023.0,MEM,103.1,18.1,11.2,30.0,0.0,53.3,52.2,55.3,111.6,106.2


In [29]:
final_data = historics_df.merge(past_team_stats, on = ['Season', 'Player_Team'], how = 'left')

In [30]:
final_data['1_year_ago_Player_Age_Sq'] = final_data['1_year_ago_Player_Age']**2
final_data['2_years_ago_Player_Age_Sq'] = final_data['2_years_ago_Player_Age']**2

# Part 6 - Separate Players into 3 classes

In [31]:
rookies = []
one_year_players = []
two_plus_year_players = []

current_players = list(final_data[final_data['Season'] == 2023]['Player_Name'])

for c in current_players:
    number_seasons = len(final_data[final_data['Player_Name'] == c])
    if number_seasons == 1:
        rookies.append(c)
    elif number_seasons == 2:
        one_year_players.append(c)
    else:
        two_plus_year_players.append(c)

# Part 7 - Modeling - 2+ Years

## Part 7.1 - Filter Data

In [32]:
# Filter to not include this year
training_df = final_data[(final_data['Season'] >= 2004) & (final_data['Season'] <= 2022)]

# Filter to not include multi-team players
training_df = training_df[training_df['Player_Team'] != 'TOT']

# Drop NaN values
training_df = training_df.dropna().reset_index().drop('index', axis = 1)

# Convert Position to Categorical
training_df['Player_Position'] = training_df['Player_Position'].astype('category')
training_df['Player_Position_C'] = pd.get_dummies(training_df['Player_Position']).astype('bool').iloc[:, 0]
training_df['Player_Position_F'] = pd.get_dummies(training_df['Player_Position']).astype('bool').iloc[:, 1]
training_df['Player_Position_G'] = pd.get_dummies(training_df['Player_Position']).astype('bool').iloc[:, 2]
training_df = training_df.drop('Player_Position', axis = 1)

training_df.head(5)

Unnamed: 0,Season,Player_Name,Player_URL,Player_Age,Player_Team,Player_Games_Played,Player_Games_Started,Player_Minutes,Player_Points,Player_FGM,...,1_year_ago_Team_REBR,1_year_ago_Team_EFG,1_year_ago_Team_TS%,1_year_ago_Team_OEF,1_year_ago_Team_DEF,1_year_ago_Player_Age_Sq,2_years_ago_Player_Age_Sq,Player_Position_C,Player_Position_F,Player_Position_G
0,2004,Malik Allen,https://www.basketball-reference.com/players/a...,25.0,MIA,45.0,6.0,13.7,4.2,1.8,...,49.7,43.9,48.3,81.1,86.2,576.0,529.0,False,True,False
1,2004,Ray Allen*,https://www.basketball-reference.com/players/a...,28.0,OKC,56.0,56.0,38.4,23.0,8.0,...,49.6,47.2,51.2,88.0,86.9,729.0,676.0,False,False,True
2,2004,Rafer Alston,https://www.basketball-reference.com/players/a...,27.0,MIA,82.0,28.0,31.5,10.2,3.5,...,49.7,43.9,48.3,81.1,86.2,676.0,625.0,False,False,True
3,2004,Chris Andersen,https://www.basketball-reference.com/players/a...,25.0,DEN,71.0,0.0,14.5,3.4,1.3,...,51.3,42.8,46.9,74.4,83.6,576.0,529.0,False,True,False
4,2004,Derek Anderson,https://www.basketball-reference.com/players/a...,29.0,POR,51.0,46.0,35.5,13.6,4.5,...,51.3,48.9,53.1,88.2,85.3,784.0,729.0,False,False,True


In [33]:
class2_predictions = final_data[(final_data['Player_Name'].isin(two_plus_year_players)) & (final_data['Season'] == 2023)]
class2_predictions['Player_Position'] = class2_predictions['Player_Position'].astype('category')
class2_predictions['Player_Position_C'] = pd.get_dummies(class2_predictions['Player_Position']).astype('bool').iloc[:, 0]
class2_predictions['Player_Position_F'] = pd.get_dummies(class2_predictions['Player_Position']).astype('bool').iloc[:, 1]
class2_predictions['Player_Position_G'] = pd.get_dummies(class2_predictions['Player_Position']).astype('bool').iloc[:, 2]
#class2_predictions = class2_predictions.drop('Player_Position', axis = 1)

## Part 7.2 - Models

### PPG

In [34]:
y = list(training_df['Player_Points'])
training_cols = list(training_df.columns[31:])
x = training_df.loc[:,training_cols]
x_pred = class2_predictions.loc[:,training_cols]



# Random Forest
rf_class2_ppg = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_ppg.fit(x, y)
rf_class2_ppg_results = list(rf_class2_ppg.predict(x_pred))

# XG Boost
xgb_class2_ppg = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_ppg.fit(x, y)
xgb_class2_ppg_results = list(xgb_class2_ppg.predict(x_pred))

# Linear Regression

lr_class2_ppg = LinearRegression().fit(x, y)
lr_class2_ppg_results = list(xgb_class2_ppg.predict(x_pred))

# Ensemble
class2_ppg = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_ppg_results, xgb_class2_ppg_results, lr_class2_ppg_results)]

In [35]:
# Initialize New Projection DataFrame

predictions_2023 = pd.DataFrame({'Player_Name': class2_predictions.Player_Name, \
                                'Player_Team': class2_predictions.Player_Team, \
                                'Player_Position': class2_predictions.Player_Position,
                                'PPG': class2_ppg})
predictions_2023

Unnamed: 0,Player_Name,Player_Team,Player_Position,PPG
10106,Precious Achiuwa,TOR,F,9.23
10107,Steven Adams,MEM,C,5.07
10108,Bam Adebayo,MIA,C,22.12
10111,Nickeil Alexander-Walker,UTA,G,11.38
10112,Grayson Allen,MIL,G,12.14
...,...,...,...,...
10664,Christian Wood,DAL,F,18.46
10665,Delon Wright,WAS,G,5.93
10668,Thaddeus Young,TOR,F,3.58
10669,Trae Young,ATL,G,27.55


### APG

In [36]:
y = list(training_df['Player_Ast'])


# Random Forest
rf_class2_apg = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_apg.fit(x, y)
rf_class2_apg_results = list(rf_class2_apg.predict(x_pred))

# XG Boost
xgb_class2_apg = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_apg.fit(x, y)
xgb_class2_apg_results = list(xgb_class2_apg.predict(x_pred))

# Linear Regression

lr_class2_apg = LinearRegression().fit(x, y)
lr_class2_apg_results = list(xgb_class2_apg.predict(x_pred))

# Ensemble
class2_apg = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_apg_results, xgb_class2_apg_results, lr_class2_apg_results)]
predictions_2023['APG'] = class2_apg

### ORB

In [37]:
y = list(training_df['Player_ORB'])


# Random Forest
rf_class2_orb = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_orb.fit(x, y)
rf_class2_orb_results = list(rf_class2_orb.predict(x_pred))

# XG Boost
xgb_class2_orb = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_orb.fit(x, y)
xgb_class2_orb_results = list(xgb_class2_orb.predict(x_pred))

# Linear Regression

lr_class2_orb = LinearRegression().fit(x, y)
lr_class2_orb_results = list(xgb_class2_orb.predict(x_pred))

# Ensemble
class2_orb = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_orb_results, xgb_class2_orb_results, lr_class2_orb_results)]
predictions_2023['ORB'] = class2_orb

### DRB

In [38]:
y = list(training_df['Player_DRB'])


# Random Forest
rf_class2_drb = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_drb.fit(x, y)
rf_class2_drb_results = list(rf_class2_drb.predict(x_pred))

# XG Boost
xgb_class2_drb = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_drb.fit(x, y)
xgb_class2_drb_results = list(xgb_class2_drb.predict(x_pred))

# Linear Regression

lr_class2_drb = LinearRegression().fit(x, y)
lr_class2_drb_results = list(xgb_class2_drb.predict(x_pred))

# Ensemble
class2_drb = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_drb_results, xgb_class2_drb_results, lr_class2_drb_results)]
predictions_2023['DRB'] = class2_drb
predictions_2023['TRB'] = predictions_2023['DRB'] + predictions_2023['ORB']

### STL

In [39]:
y = list(training_df['Player_STL'])


# Random Forest
rf_class2_stl = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_stl.fit(x, y)
rf_class2_stl_results = list(rf_class2_stl.predict(x_pred))

# XG Boost
xgb_class2_stl = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_stl.fit(x, y)
xgb_class2_stl_results = list(xgb_class2_stl.predict(x_pred))

# Linear Regression

lr_class2_stl = LinearRegression().fit(x, y)
lr_class2_stl_results = list(xgb_class2_stl.predict(x_pred))

# Ensemble
class2_stl = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_stl_results, xgb_class2_stl_results, lr_class2_stl_results)]
predictions_2023['STL'] = class2_stl

### BLK

In [40]:
y = list(training_df['Player_BLK'])


# Random Forest
rf_class2_blk = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_blk.fit(x, y)
rf_class2_blk_results = list(rf_class2_blk.predict(x_pred))

# XG Boost
xgb_class2_blk = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_blk.fit(x, y)
xgb_class2_blk_results = list(xgb_class2_blk.predict(x_pred))

# Linear Regression

lr_class2_blk = LinearRegression().fit(x, y)
lr_class2_blk_results = list(xgb_class2_blk.predict(x_pred))

# Ensemble
class2_blk = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_blk_results, xgb_class2_blk_results, lr_class2_blk_results)]
predictions_2023['BLK'] = class2_blk

### TOV

In [41]:
y = list(training_df['Player_TO'])


# Random Forest
rf_class2_to = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_to.fit(x, y)
rf_class2_to_results = list(rf_class2_to.predict(x_pred))

# XG Boost
xgb_class2_to = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_to.fit(x, y)
xgb_class2_to_results = list(xgb_class2_to.predict(x_pred))

# Linear Regression

lr_class2_to = LinearRegression().fit(x, y)
lr_class2_to_results = list(xgb_class2_to.predict(x_pred))

# Ensemble
class2_to = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_to_results, xgb_class2_to_results, lr_class2_to_results)]
predictions_2023['TO'] = class2_to

### PF

In [42]:
y = list(training_df['Player_PF'])


# Random Forest
rf_class2_pf = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_pf.fit(x, y)
rf_class2_pf_results = list(rf_class2_pf.predict(x_pred))

# XG Boost
xgb_class2_pf = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_pf.fit(x, y)
xgb_class2_pf_results = list(xgb_class2_pf.predict(x_pred))

# Linear Regression

lr_class2_pf = LinearRegression().fit(x, y)
lr_class2_pf_results = list(xgb_class2_pf.predict(x_pred))

# Ensemble
class2_pf = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_pf_results, xgb_class2_pf_results, lr_class2_pf_results)]
predictions_2023['PF'] = class2_pf

### MPG

In [43]:
y = list(training_df['Player_Minutes'])


# Random Forest
rf_class2_min = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_min.fit(x, y)
rf_class2_min_results = list(rf_class2_min.predict(x_pred))

# XG Boost
xgb_class2_min = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_min.fit(x, y)
xgb_class2_min_results = list(xgb_class2_min.predict(x_pred))

# Linear Regression

lr_class2_min = LinearRegression().fit(x, y)
lr_class2_min_results = list(xgb_class2_min.predict(x_pred))

# Ensemble
class2_min = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_min_results, xgb_class2_min_results, lr_class2_min_results)]
predictions_2023['MPG'] = class2_min

### 2FGM

In [44]:
y = list(training_df['Player_2FGM'])


# Random Forest
rf_class2_2FGM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_2FGM.fit(x, y)
rf_class2_2FGM_results = list(rf_class2_2FGM.predict(x_pred))

# XG Boost
xgb_class2_2FGM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_2FGM.fit(x, y)
xgb_class2_2FGM_results = list(xgb_class2_2FGM.predict(x_pred))

# Linear Regression

lr_class2_2FGM = LinearRegression().fit(x, y)
lr_class2_2FGM_results = list(xgb_class2_2FGM.predict(x_pred))

# Ensemble
class2_2FGM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_2FGM_results, xgb_class2_2FGM_results, lr_class2_2FGM_results)]
predictions_2023['2FGM'] = class2_2FGM

### 2FGA

In [45]:
y = list(training_df['Player_2FGA'])


# Random Forest
rf_class2_2FGA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_2FGA.fit(x, y)
rf_class2_2FGA_results = list(rf_class2_2FGA.predict(x_pred))

# XG Boost
xgb_class2_2FGA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_2FGA.fit(x, y)
xgb_class2_2FGA_results = list(xgb_class2_2FGA.predict(x_pred))

# Linear Regression

lr_class2_2FGA = LinearRegression().fit(x, y)
lr_class2_2FGA_results = list(xgb_class2_2FGA.predict(x_pred))

# Ensemble
class2_2FGA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_2FGA_results, xgb_class2_2FGA_results, lr_class2_2FGA_results)]
predictions_2023['2FGA'] = class2_2FGA
predictions_2023['2FG%'] = round(predictions_2023['2FGM'] / predictions_2023['2FGA'], 2)

### 3FGM

In [46]:
y = list(training_df['Player_3FGM'])


# Random Forest
rf_class2_3FGM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_3FGM.fit(x, y)
rf_class2_3FGM_results = list(rf_class2_3FGM.predict(x_pred))

# XG Boost
xgb_class2_3FGM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_3FGM.fit(x, y)
xgb_class2_3FGM_results = list(xgb_class2_3FGM.predict(x_pred))

# Linear Regression

lr_class2_3FGM = LinearRegression().fit(x, y)
lr_class2_3FGM_results = list(xgb_class2_3FGM.predict(x_pred))

# Ensemble
class2_3FGM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_3FGM_results, xgb_class2_3FGM_results, lr_class2_3FGM_results)]
predictions_2023['3FGM'] = class2_3FGM

### 3FGA

In [47]:
y = list(training_df['Player_3FGA'])


# Random Forest
rf_class2_3FGA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_3FGA.fit(x, y)
rf_class2_3FGA_results = list(rf_class2_3FGA.predict(x_pred))

# XG Boost
xgb_class2_3FGA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_3FGA.fit(x, y)
xgb_class2_3FGA_results = list(xgb_class2_3FGA.predict(x_pred))

# Linear Regression

lr_class2_3FGA = LinearRegression().fit(x, y)
lr_class2_3FGA_results = list(xgb_class2_3FGA.predict(x_pred))

# Ensemble
class2_3FGA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_3FGA_results, xgb_class2_3FGA_results, lr_class2_3FGA_results)]
predictions_2023['3FGA'] = class2_3FGA
predictions_2023['3FG%'] = round(predictions_2023['3FGM'] / predictions_2023['3FGA'], 2)

### FTM

In [48]:
y = list(training_df['Player_FTM'])


# Random Forest
rf_class2_FTM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_FTM.fit(x, y)
rf_class2_FTM_results = list(rf_class2_FTM.predict(x_pred))

# XG Boost
xgb_class2_FTM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_FTM.fit(x, y)
xgb_class2_FTM_results = list(xgb_class2_FTM.predict(x_pred))

# Linear Regression

lr_class2_FTM = LinearRegression().fit(x, y)
lr_class2_FTM_results = list(xgb_class2_FTM.predict(x_pred))

# Ensemble
class2_FTM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_FTM_results, xgb_class2_FTM_results, lr_class2_FTM_results)]
predictions_2023['FTM'] = class2_FTM

### FTA

In [49]:
y = list(training_df['Player_FTA'])


# Random Forest
rf_class2_FTA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class2_FTA.fit(x, y)
rf_class2_FTA_results = list(rf_class2_FTA.predict(x_pred))

# XG Boost
xgb_class2_FTA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class2_FTA.fit(x, y)
xgb_class2_FTA_results = list(xgb_class2_FTA.predict(x_pred))

# Linear Regression

lr_class2_FTA = LinearRegression().fit(x, y)
lr_class2_FTA_results = list(xgb_class2_FTA.predict(x_pred))

# Ensemble
class2_FTA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class2_FTA_results, xgb_class2_FTA_results, lr_class2_FTA_results)]
predictions_2023['FTA'] = class2_FTA
predictions_2023['FT%'] = round(predictions_2023['FTM'] / predictions_2023['FTA'], 2)

### PPG - Corrected

In [50]:
predictions_2023['PPGc'] = predictions_2023['FTM'] + 2 * predictions_2023['2FGM']+ 3 * predictions_2023['3FGM']

# Part 8 - Modeling - 1 Year

## Part 8.1 - Filter Data

In [51]:
class1_predictions = final_data[(final_data['Player_Name'].isin(one_year_players)) & (final_data['Season'] == 2023)]
class1_predictions['Player_Position'] = class1_predictions['Player_Position'].astype('category')
class1_predictions['Player_Position_C'] = pd.get_dummies(class1_predictions['Player_Position']).astype('bool').iloc[:, 0]
class1_predictions['Player_Position_F'] = pd.get_dummies(class1_predictions['Player_Position']).astype('bool').iloc[:, 1]
class1_predictions['Player_Position_G'] = pd.get_dummies(class1_predictions['Player_Position']).astype('bool').iloc[:, 2]

## Part 8.2 - Models

### PPG

In [52]:
y = list(training_df['Player_Points'])
training_cols = list(training_df.columns[31:])
training_cols = [x for x in training_cols if not x.startswith('2_years')]
x = training_df.loc[:,training_cols]
x_pred = class1_predictions.loc[:,training_cols]



# Random Forest
rf_class1_ppg = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_ppg.fit(x, y)
rf_class1_ppg_results = list(rf_class1_ppg.predict(x_pred))

# XG Boost
xgb_class1_ppg = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_ppg.fit(x, y)
xgb_class1_ppg_results = list(xgb_class1_ppg.predict(x_pred))

# Linear Regression

lr_class1_ppg = LinearRegression().fit(x, y)
lr_class1_ppg_results = list(xgb_class1_ppg.predict(x_pred))

# Ensemble
class1_ppg = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_ppg_results, xgb_class1_ppg_results, lr_class1_ppg_results)]

In [53]:
# Initialize New Projection DataFrame

predictions_2023_1 = pd.DataFrame({'Player_Name': class1_predictions.Player_Name, \
                                'Player_Team': class1_predictions.Player_Team, \
                                'Player_Position': class1_predictions.Player_Position,
                                'PPG': class1_ppg})
predictions_2023_1

Unnamed: 0,Player_Name,Player_Team,Player_Position,PPG
10110,Santi Aldama,MEM,F,5.51
10114,Jose Alvarado,NOP,G,5.92
10122,Joel Ayayi,ORL,G,6.76
10132,Dalano Banton,TOR,F,6.65
10135,Scottie Barnes,TOR,F,14.07
...,...,...,...,...
10655,Ziaire Williams,MEM,F,7.52
10663,James Wiseman,GSW,C,11.91
10666,McKinley Wright IV,DAL,G,5.65
10667,Gabe York,IND,G,3.36


### APG

In [54]:
y = list(training_df['Player_Ast'])

# Random Forest
rf_class1_apg = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_apg.fit(x, y)
rf_class1_apg_results = list(rf_class1_apg.predict(x_pred))

# XG Boost
xgb_class1_apg = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_apg.fit(x, y)
xgb_class1_apg_results = list(xgb_class1_apg.predict(x_pred))

# Linear Regression

lr_class1_apg = LinearRegression().fit(x, y)
lr_class1_apg_results = list(xgb_class1_apg.predict(x_pred))

# Ensemble
class1_apg = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_apg_results, xgb_class1_apg_results, lr_class1_apg_results)]
predictions_2023_1['APG'] = class1_apg

### ORB

In [55]:
y = list(training_df['Player_ORB'])


# Random Forest
rf_class1_orb = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_orb.fit(x, y)
rf_class1_orb_results = list(rf_class1_orb.predict(x_pred))

# XG Boost
xgb_class1_orb = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_orb.fit(x, y)
xgb_class1_orb_results = list(xgb_class1_orb.predict(x_pred))

# Linear Regression

lr_class1_orb = LinearRegression().fit(x, y)
lr_class1_orb_results = list(xgb_class1_orb.predict(x_pred))

# Ensemble
class1_orb = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_orb_results, xgb_class1_orb_results, lr_class1_orb_results)]
predictions_2023_1['ORB'] = class1_orb

### DRB

In [56]:
y = list(training_df['Player_DRB'])


# Random Forest
rf_class1_drb = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_drb.fit(x, y)
rf_class1_drb_results = list(rf_class1_drb.predict(x_pred))

# XG Boost
xgb_class1_drb = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_drb.fit(x, y)
xgb_class1_drb_results = list(xgb_class1_drb.predict(x_pred))

# Linear Regression

lr_class1_drb = LinearRegression().fit(x, y)
lr_class1_drb_results = list(xgb_class1_drb.predict(x_pred))

# Ensemble
class1_drb = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_drb_results, xgb_class1_drb_results, lr_class1_drb_results)]
predictions_2023_1['DRB'] = class1_drb
predictions_2023_1['TRB'] = predictions_2023_1['DRB'] + predictions_2023_1['ORB']

### STL

In [57]:
y = list(training_df['Player_STL'])


# Random Forest
rf_class1_stl = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_stl.fit(x, y)
rf_class1_stl_results = list(rf_class1_stl.predict(x_pred))

# XG Boost
xgb_class1_stl = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_stl.fit(x, y)
xgb_class1_stl_results = list(xgb_class1_stl.predict(x_pred))

# Linear Regression

lr_class1_stl = LinearRegression().fit(x, y)
lr_class1_stl_results = list(xgb_class1_stl.predict(x_pred))

# Ensemble
class1_stl = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_stl_results, xgb_class1_stl_results, lr_class1_stl_results)]
predictions_2023_1['STL'] = class1_stl

### BLK

In [58]:
y = list(training_df['Player_BLK'])


# Random Forest
rf_class1_blk = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_blk.fit(x, y)
rf_class1_blk_results = list(rf_class1_blk.predict(x_pred))

# XG Boost
xgb_class1_blk = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_blk.fit(x, y)
xgb_class1_blk_results = list(xgb_class1_blk.predict(x_pred))

# Linear Regression

lr_class1_blk = LinearRegression().fit(x, y)
lr_class1_blk_results = list(xgb_class1_blk.predict(x_pred))

# Ensemble
class1_blk = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_blk_results, xgb_class1_blk_results, lr_class1_blk_results)]
predictions_2023_1['BLK'] = class1_blk

### TOV

In [59]:
y = list(training_df['Player_TO'])


# Random Forest
rf_class1_to = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_to.fit(x, y)
rf_class1_to_results = list(rf_class1_to.predict(x_pred))

# XG Boost
xgb_class1_to = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_to.fit(x, y)
xgb_class1_to_results = list(xgb_class1_to.predict(x_pred))

# Linear Regression

lr_class1_to = LinearRegression().fit(x, y)
lr_class1_to_results = list(xgb_class1_to.predict(x_pred))

# Ensemble
class1_to = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_to_results, xgb_class1_to_results, lr_class1_to_results)]
predictions_2023_1['TO'] = class1_to

### PF

In [60]:
y = list(training_df['Player_PF'])


# Random Forest
rf_class1_pf = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_pf.fit(x, y)
rf_class1_pf_results = list(rf_class1_pf.predict(x_pred))

# XG Boost
xgb_class1_pf = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_pf.fit(x, y)
xgb_class1_pf_results = list(xgb_class1_pf.predict(x_pred))

# Linear Regression

lr_class1_pf = LinearRegression().fit(x, y)
lr_class1_pf_results = list(xgb_class1_pf.predict(x_pred))

# Ensemble
class1_pf = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_pf_results, xgb_class1_pf_results, lr_class1_pf_results)]
predictions_2023_1['PF'] = class1_pf

### MPG

In [61]:
y = list(training_df['Player_Minutes'])


# Random Forest
rf_class1_min = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_min.fit(x, y)
rf_class1_min_results = list(rf_class1_min.predict(x_pred))

# XG Boost
xgb_class1_min = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_min.fit(x, y)
xgb_class1_min_results = list(xgb_class1_min.predict(x_pred))

# Linear Regression

lr_class1_min = LinearRegression().fit(x, y)
lr_class1_min_results = list(xgb_class1_min.predict(x_pred))

# Ensemble
class1_min = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_min_results, xgb_class1_min_results, lr_class1_min_results)]
predictions_2023_1['MPG'] = class1_min

### 2FGM

In [62]:
y = list(training_df['Player_2FGM'])


# Random Forest
rf_class1_2FGM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_2FGM.fit(x, y)
rf_class1_2FGM_results = list(rf_class1_2FGM.predict(x_pred))

# XG Boost
xgb_class1_2FGM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_2FGM.fit(x, y)
xgb_class1_2FGM_results = list(xgb_class1_2FGM.predict(x_pred))

# Linear Regression

lr_class1_2FGM = LinearRegression().fit(x, y)
lr_class1_2FGM_results = list(xgb_class1_2FGM.predict(x_pred))

# Ensemble
class1_2FGM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_2FGM_results, xgb_class1_2FGM_results, lr_class1_2FGM_results)]
predictions_2023_1['2FGM'] = class1_2FGM

### 2FGA

In [63]:
y = list(training_df['Player_2FGA'])


# Random Forest
rf_class1_2FGA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_2FGA.fit(x, y)
rf_class1_2FGA_results = list(rf_class1_2FGA.predict(x_pred))

# XG Boost
xgb_class1_2FGA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_2FGA.fit(x, y)
xgb_class1_2FGA_results = list(xgb_class1_2FGA.predict(x_pred))

# Linear Regression

lr_class1_2FGA = LinearRegression().fit(x, y)
lr_class1_2FGA_results = list(xgb_class1_2FGA.predict(x_pred))

# Ensemble
class1_2FGA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_2FGA_results, xgb_class1_2FGA_results, lr_class1_2FGA_results)]
predictions_2023_1['2FGA'] = class1_2FGA
predictions_2023_1['2FG%'] = round(predictions_2023_1['2FGM'] / predictions_2023_1['2FGA'], 2)

### 3FGM

In [64]:
y = list(training_df['Player_3FGM'])


# Random Forest
rf_class1_3FGM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_3FGM.fit(x, y)
rf_class1_3FGM_results = list(rf_class1_3FGM.predict(x_pred))

# XG Boost
xgb_class1_3FGM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_3FGM.fit(x, y)
xgb_class1_3FGM_results = list(xgb_class1_3FGM.predict(x_pred))

# Linear Regression

lr_class1_3FGM = LinearRegression().fit(x, y)
lr_class1_3FGM_results = list(xgb_class1_3FGM.predict(x_pred))

# Ensemble
class1_3FGM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_3FGM_results, xgb_class1_3FGM_results, lr_class1_3FGM_results)]
predictions_2023_1['3FGM'] = class1_3FGM

### 3FGA

In [65]:
y = list(training_df['Player_3FGA'])


# Random Forest
rf_class1_3FGA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_3FGA.fit(x, y)
rf_class1_3FGA_results = list(rf_class1_3FGA.predict(x_pred))

# XG Boost
xgb_class1_3FGA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_3FGA.fit(x, y)
xgb_class1_3FGA_results = list(xgb_class1_3FGA.predict(x_pred))

# Linear Regression

lr_class1_3FGA = LinearRegression().fit(x, y)
lr_class1_3FGA_results = list(xgb_class1_3FGA.predict(x_pred))

# Ensemble
class1_3FGA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_3FGA_results, xgb_class1_3FGA_results, lr_class1_3FGA_results)]
predictions_2023_1['3FGA'] = class1_3FGA
predictions_2023_1['3FG%'] = round(predictions_2023_1['3FGM'] / predictions_2023_1['3FGA'], 2)

### FTM

In [66]:
y = list(training_df['Player_FTM'])


# Random Forest
rf_class1_FTM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_FTM.fit(x, y)
rf_class1_FTM_results = list(rf_class1_FTM.predict(x_pred))

# XG Boost
xgb_class1_FTM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_FTM.fit(x, y)
xgb_class1_FTM_results = list(xgb_class1_FTM.predict(x_pred))

# Linear Regression

lr_class1_FTM = LinearRegression().fit(x, y)
lr_class1_FTM_results = list(xgb_class1_FTM.predict(x_pred))

# Ensemble
class1_FTM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_FTM_results, xgb_class1_FTM_results, lr_class1_FTM_results)]
predictions_2023_1['FTM'] = class1_FTM

### FTA

In [67]:
y = list(training_df['Player_FTA'])


# Random Forest
rf_class1_FTA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_class1_FTA.fit(x, y)
rf_class1_FTA_results = list(rf_class1_FTA.predict(x_pred))

# XG Boost
xgb_class1_FTA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_class1_FTA.fit(x, y)
xgb_class1_FTA_results = list(xgb_class1_FTA.predict(x_pred))

# Linear Regression

lr_class1_FTA = LinearRegression().fit(x, y)
lr_class1_FTA_results = list(xgb_class1_FTA.predict(x_pred))

# Ensemble
class1_FTA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_class1_FTA_results, xgb_class1_FTA_results, lr_class1_FTA_results)]
predictions_2023_1['FTA'] = class1_FTA
predictions_2023_1['FT%'] = round(predictions_2023_1['FTM'] / predictions_2023_1['FTA'], 2)

### PPG - Corrected

In [68]:
predictions_2023_1['PPGc'] = predictions_2023_1['FTM'] + 2 * predictions_2023_1['2FGM']+ 3 * predictions_2023_1['3FGM']

# Part 9 - Modeling - Rookies

## Part 9.1 - Filter Data

In [113]:
# Filter to not include this year
training_df_rookies = final_data[(final_data['Season'] >= 2004)]

# Filter to not include multi-team players
training_df_rookies = training_df_rookies[training_df_rookies['Player_Team'] != 'TOT']


# Convert Position to Categorical
training_df_rookies['Player_Position'] = training_df_rookies['Player_Position'].astype('category')
training_df_rookies['Player_Position_C'] = pd.get_dummies(training_df_rookies['Player_Position']).astype('bool').iloc[:, 0]
training_df_rookies['Player_Position_F'] = pd.get_dummies(training_df_rookies['Player_Position']).astype('bool').iloc[:, 1]
training_df_rookies['Player_Position_G'] = pd.get_dummies(training_df_rookies['Player_Position']).astype('bool').iloc[:, 2]
#training_df_rookies = training_df_rookies.drop('Player_Position', axis = 1)

training_df_rookies.head(5)

Unnamed: 0,Season,Player_Name,Player_URL,Player_Position,Player_Age,Player_Team,Player_Games_Played,Player_Games_Started,Player_Minutes,Player_Points,...,1_year_ago_Team_REBR,1_year_ago_Team_EFG,1_year_ago_Team_TS%,1_year_ago_Team_OEF,1_year_ago_Team_DEF,1_year_ago_Player_Age_Sq,2_years_ago_Player_Age_Sq,Player_Position_C,Player_Position_F,Player_Position_G
869,2004,Malik Allen,https://www.basketball-reference.com/players/a...,F,25.0,MIA,45.0,6.0,13.7,4.2,...,49.7,43.9,48.3,81.1,86.2,576.0,529.0,False,False,True
870,2004,Ray Allen*,https://www.basketball-reference.com/players/a...,G,28.0,OKC,56.0,56.0,38.4,23.0,...,49.6,47.2,51.2,88.0,86.9,729.0,676.0,False,False,False
871,2004,Rafer Alston,https://www.basketball-reference.com/players/a...,G,27.0,MIA,82.0,28.0,31.5,10.2,...,49.7,43.9,48.3,81.1,86.2,676.0,625.0,False,False,False
872,2004,Chris Andersen,https://www.basketball-reference.com/players/a...,F,25.0,DEN,71.0,0.0,14.5,3.4,...,51.3,42.8,46.9,74.4,83.6,576.0,529.0,False,False,True
873,2004,Derek Anderson,https://www.basketball-reference.com/players/a...,G,29.0,POR,51.0,46.0,35.5,13.6,...,51.3,48.9,53.1,88.2,85.3,784.0,729.0,False,False,False


In [114]:
training = draft_df_final[draft_df_final['Season'] > 2003].merge(training_df_rookies, how = 'left', on = ['Season','Player_Name'])
x_cols = [
 'Player_Pick',
 'Player_College_Games_Played',
 'Player_College_MPG',
 'Player_College_FGM',
 'Player_College_FGA',
 'Player_College_FG%',
 #'Player_College_2FGM',
 #'Player_College_2FGA',
 #'Player_College_2FG%',
 'Player_College_3FGM',
 'Player_College_3FGA',
 'Player_College_3FG%',
 'Player_College_FTM',
 'Player_College_FTA',
 'Player_College_FT%',
 'Player_College_TRB',
 'Player_College_AST',
 'Player_College_STL',
 'Player_College_BLK',
 'Player_College_TO',
 'Player_College_PF',
 'Player_College_PTS',
 'Player_Age_F',
 'Player_Age_G',
 'Player_Age_C',
 'Player_Games_Played_F',
 'Player_Games_Played_G',
 'Player_Games_Played_C',
 'Player_Games_Started_F',
 'Player_Games_Started_G',
 'Player_Games_Started_C',
 'Player_Minutes_F',
 'Player_Minutes_G',
 'Player_Minutes_C',
 'Player_Points_F',
 'Player_Points_G',
 'Player_Points_C',
 'Player_FGM_F',
 'Player_FGM_G',
 'Player_FGM_C',
 'Player_FGA_F',
 'Player_FGA_G',
 'Player_FGA_C',
 'Player_FGperc_F',
 'Player_FGperc_G',
 'Player_FGperc_C',
 'Player_3FGM_F',
 'Player_3FGM_G',
 'Player_3FGM_C',
 'Player_3FGA_F',
 'Player_3FGA_G',
 'Player_3FGA_C',
 'Player_3FGperc_F',
 'Player_3FGperc_G',
 'Player_3FGperc_C',
 'Player_2FGM_F',
 'Player_2FGM_G',
 'Player_2FGM_C',
 'Player_2FGA_F',
 'Player_2FGA_G',
 'Player_2FGA_C',
 'Player_2FGperc_F',
 'Player_2FGperc_G',
 'Player_2FGperc_C',
 'Player_EFGperc_F',
 'Player_EFGperc_G',
 'Player_EFGperc_C',
 'Player_FTM_F',
 'Player_FTM_G',
 'Player_FTM_C',
 'Player_FTA_F',
 'Player_FTA_G',
 'Player_FTA_C',
 'Player_FTperc_F',
 'Player_FTperc_G',
 'Player_FTperc_C',
 'Player_ORB_F',
 'Player_ORB_G',
 'Player_ORB_C',
 'Player_DRB_F',
 'Player_DRB_G',
 'Player_DRB_C',
 'Player_TRB_F',
 'Player_TRB_G',
 'Player_TRB_C',
 'Player_Ast_F',
 'Player_Ast_G',
 'Player_Ast_C',
 'Player_STL_F',
 'Player_STL_G',
 'Player_STL_C',
 'Player_BLK_F',
 'Player_BLK_G',
 'Player_BLK_C',
 'Player_TO_F',
 'Player_TO_G',
 'Player_TO_C',
 'Player_PF_F',
 'Player_PF_G',
 'Player_PF_C',
 'Player_Pts_F',
 'Player_Pts_G',
 'Player_Pts_C',
 '1_year_ago_Team_Pace',
 '1_year_ago_Team_AST',
 '1_year_ago_Team_TO',
 '1_year_ago_Team_ORR',
 '1_year_ago_Team_DRR',
 '1_year_ago_Team_REBR',
 '1_year_ago_Team_EFG',
 '1_year_ago_Team_TS%',
 '1_year_ago_Team_OEF',
 '1_year_ago_Team_DEF',
 'Player_Position_C',
 'Player_Position_F',
 'Player_Position_G'
]

training['Player_Position_C'] = training['Player_Position_C'].astype(bool)
training['Player_Position_F'] = training['Player_Position_F'].astype(bool)
training['Player_Position_G'] = training['Player_Position_G'].astype(bool)

train_df = training[training['Season'] < 2023]
train_df = train_df.dropna(subset = ['Player_Team_y',]).reset_index().drop('index', axis = 1)
train_df = train_df.replace([np.inf, -np.inf], np.nan)
test_df = training[training['Season'] == 2023].reset_index().drop('index', axis = 1)
test_df = test_df.dropna(subset = ['Player_Team_y',]).reset_index().drop('index', axis = 1)
test_df = test_df.replace([np.inf, -np.inf], np.nan)
x = train_df.loc[:,x_cols]
x_pred = test_df.loc[:,x_cols]

## Part 9.2 - Models

### PPG

In [115]:
y = list(train_df['Player_Points'])

# Random Forest

rf_classR_ppg = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_ppg.fit(x, y)
rf_classR_ppg_results = list(rf_classR_ppg.predict(x_pred))

# XG Boost
xgb_classR_ppg = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_ppg.fit(x, y)
xgb_classR_ppg_results = list(xgb_classR_ppg.predict(x_pred))

# Linear Regression

lr_classR_ppg = LinearRegression().fit(x, y)
lr_classR_ppg_results = list(xgb_classR_ppg.predict(x_pred))

# Ensemble
classR_ppg = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_ppg_results, xgb_classR_ppg_results, lr_classR_ppg_results)]

In [116]:
# Initialize New Projection DataFrame

predictions_2023_R = pd.DataFrame({'Player_Name': test_df.Player_Name, \
                                'Player_Team': test_df.Player_Team_y, \
                                'Player_Position': test_df.Player_Position,
                                'PPG': classR_ppg})
predictions_2023_R

Unnamed: 0,Player_Name,Player_Team,Player_Position,PPG
0,Paolo Banchero,ORL,F,13.61
1,Chet Holmgren,OKC,C,11.84
2,Jabari Smith Jr.,HOU,F,13.34
3,Keegan Murray,SAC,F,10.39
4,Jaden Ivey,DET,G,10.43
5,Bennedict Mathurin,IND,G,6.59
6,Dyson Daniels,NOP,G,10.09
7,Jeremy Sochan,SAS,F,8.08
8,Johnny Davis,WAS,G,6.1
9,Ousmane Dieng,OKC,F,8.11


### APG

In [117]:
y = list(train_df['Player_Ast'])

# Random Forest
rf_classR_apg = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_apg.fit(x, y)
rf_classR_apg_results = list(rf_classR_apg.predict(x_pred))

# XG Boost
xgb_classR_apg = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_apg.fit(x, y)
xgb_classR_apg_results = list(xgb_classR_apg.predict(x_pred))

# Linear Regression

lr_classR_apg = LinearRegression().fit(x, y)
lr_classR_apg_results = list(xgb_classR_apg.predict(x_pred))

# Ensemble
classR_apg = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_apg_results, xgb_classR_apg_results, lr_classR_apg_results)]
predictions_2023_R['APG'] = classR_apg

### ORB

In [118]:
y = list(train_df['Player_ORB'])


# Random Forest
rf_classR_orb = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_orb.fit(x, y)
rf_classR_orb_results = list(rf_classR_orb.predict(x_pred))

# XG Boost
xgb_classR_orb = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_orb.fit(x, y)
xgb_classR_orb_results = list(xgb_classR_orb.predict(x_pred))

# Linear Regression

lr_classR_orb = LinearRegression().fit(x, y)
lr_classR_orb_results = list(xgb_classR_orb.predict(x_pred))

# Ensemble
classR_orb = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_orb_results, xgb_classR_orb_results, lr_classR_orb_results)]
predictions_2023_R['ORB'] = classR_orb

### DRB

In [119]:
y = list(train_df['Player_DRB'])


# Random Forest
rf_classR_drb = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_drb.fit(x, y)
rf_classR_drb_results = list(rf_classR_drb.predict(x_pred))

# XG Boost
xgb_classR_drb = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_drb.fit(x, y)
xgb_classR_drb_results = list(xgb_classR_drb.predict(x_pred))

# Linear Regression

lr_classR_drb = LinearRegression().fit(x, y)
lr_classR_drb_results = list(xgb_classR_drb.predict(x_pred))

# Ensemble
classR_drb = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_drb_results, xgb_classR_drb_results, lr_classR_drb_results)]
predictions_2023_R['DRB'] = classR_drb
predictions_2023_R['TRB'] = predictions_2023_R['DRB'] + predictions_2023_R['ORB']

### SPG

In [120]:
y = list(train_df['Player_STL'])


# Random Forest
rf_classR_stl = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_stl.fit(x, y)
rf_classR_stl_results = list(rf_classR_stl.predict(x_pred))

# XG Boost
xgb_classR_stl = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_stl.fit(x, y)
xgb_classR_stl_results = list(xgb_classR_stl.predict(x_pred))

# Linear Regression

lr_classR_stl = LinearRegression().fit(x, y)
lr_classR_stl_results = list(xgb_classR_stl.predict(x_pred))

# Ensemble
classR_stl = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_stl_results, xgb_classR_stl_results, lr_classR_stl_results)]
predictions_2023_R['STL'] = classR_stl

### BPG

In [121]:
y = list(train_df['Player_BLK'])


# Random Forest
rf_classR_blk = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_blk.fit(x, y)
rf_classR_blk_results = list(rf_classR_blk.predict(x_pred))

# XG Boost
xgb_classR_blk = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_blk.fit(x, y)
xgb_classR_blk_results = list(xgb_classR_blk.predict(x_pred))

# Linear Regression

lr_classR_blk = LinearRegression().fit(x, y)
lr_classR_blk_results = list(xgb_classR_blk.predict(x_pred))

# Ensemble
classR_blk = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_blk_results, xgb_classR_blk_results, lr_classR_blk_results)]
predictions_2023_R['BLK'] = classR_blk

### TOV

In [122]:
y = list(train_df['Player_TO'])


# Random Forest
rf_classR_to = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_to.fit(x, y)
rf_classR_to_results = list(rf_classR_to.predict(x_pred))

# XG Boost
xgb_classR_to = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_to.fit(x, y)
xgb_classR_to_results = list(xgb_classR_to.predict(x_pred))

# Linear Regression

lr_classR_to = LinearRegression().fit(x, y)
lr_classR_to_results = list(xgb_classR_to.predict(x_pred))

# Ensemble
classR_to = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_to_results, xgb_classR_to_results, lr_classR_to_results)]
predictions_2023_R['TO'] = classR_to

### PF

In [123]:
y = list(train_df['Player_PF'])


# Random Forest
rf_classR_pf = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_pf.fit(x, y)
rf_classR_pf_results = list(rf_classR_pf.predict(x_pred))

# XG Boost
xgb_classR_pf = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_pf.fit(x, y)
xgb_classR_pf_results = list(xgb_classR_pf.predict(x_pred))

# Linear Regression

lr_classR_pf = LinearRegression().fit(x, y)
lr_classR_pf_results = list(xgb_classR_pf.predict(x_pred))

# Ensemble
classR_pf = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_pf_results, xgb_classR_pf_results, lr_classR_pf_results)]
predictions_2023_R['PF'] = classR_pf

### MPG

In [124]:
y = list(train_df['Player_Minutes'])


# Random Forest
rf_classR_min = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_min.fit(x, y)
rf_classR_min_results = list(rf_classR_min.predict(x_pred))

# XG Boost
xgb_classR_min = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_min.fit(x, y)
xgb_classR_min_results = list(xgb_classR_min.predict(x_pred))

# Linear Regression

lr_classR_min = LinearRegression().fit(x, y)
lr_classR_min_results = list(xgb_classR_min.predict(x_pred))

# Ensemble
classR_min = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_min_results, xgb_classR_min_results, lr_classR_min_results)]
predictions_2023_R['MPG'] = classR_min

### 2FGM

In [125]:
y = list(train_df['Player_2FGM'])


# Random Forest
rf_classR_2FGM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_2FGM.fit(x, y)
rf_classR_2FGM_results = list(rf_classR_2FGM.predict(x_pred))

# XG Boost
xgb_classR_2FGM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_2FGM.fit(x, y)
xgb_classR_2FGM_results = list(xgb_classR_2FGM.predict(x_pred))

# Linear Regression

lr_classR_2FGM = LinearRegression().fit(x, y)
lr_classR_2FGM_results = list(xgb_classR_2FGM.predict(x_pred))

# Ensemble
classR_2FGM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_2FGM_results, xgb_classR_2FGM_results, lr_classR_2FGM_results)]
predictions_2023_R['2FGM'] = classR_2FGM

### 2FGA

In [126]:
y = list(train_df['Player_2FGA'])


# Random Forest
rf_classR_2FGA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_2FGA.fit(x, y)
rf_classR_2FGA_results = list(rf_classR_2FGA.predict(x_pred))

# XG Boost
xgb_classR_2FGA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_2FGA.fit(x, y)
xgb_classR_2FGA_results = list(xgb_classR_2FGA.predict(x_pred))

# Linear Regression

lr_classR_2FGA = LinearRegression().fit(x, y)
lr_classR_2FGA_results = list(xgb_classR_2FGA.predict(x_pred))

# Ensemble
classR_2FGA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_2FGA_results, xgb_classR_2FGA_results, lr_classR_2FGA_results)]
predictions_2023_R['2FGA'] = classR_2FGA
predictions_2023_R['2FG%'] = round(predictions_2023_R['2FGM'] / predictions_2023_R['2FGA'], 2)

### 3FGM

In [127]:
y = list(train_df['Player_3FGM'])


# Random Forest
rf_classR_3FGM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_3FGM.fit(x, y)
rf_classR_3FGM_results = list(rf_classR_3FGM.predict(x_pred))

# XG Boost
xgb_classR_3FGM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_3FGM.fit(x, y)
xgb_classR_3FGM_results = list(xgb_classR_3FGM.predict(x_pred))

# Linear Regression

lr_classR_3FGM = LinearRegression().fit(x, y)
lr_classR_3FGM_results = list(xgb_classR_3FGM.predict(x_pred))

# Ensemble
classR_3FGM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_3FGM_results, xgb_classR_3FGM_results, lr_classR_3FGM_results)]
predictions_2023_R['3FGM'] = classR_3FGM

### 3FGA

In [128]:
y = list(train_df['Player_3FGA'])


# Random Forest
rf_classR_3FGA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_3FGA.fit(x, y)
rf_classR_3FGA_results = list(rf_classR_3FGA.predict(x_pred))

# XG Boost
xgb_classR_3FGA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_3FGA.fit(x, y)
xgb_classR_3FGA_results = list(xgb_classR_3FGA.predict(x_pred))

# Linear Regression

lr_classR_3FGA = LinearRegression().fit(x, y)
lr_classR_3FGA_results = list(xgb_classR_3FGA.predict(x_pred))

# Ensemble
classR_3FGA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_3FGA_results, xgb_classR_3FGA_results, lr_classR_3FGA_results)]
predictions_2023_R['3FGA'] = classR_3FGA
predictions_2023_R['3FG%'] = round(predictions_2023_R['3FGM'] / predictions_2023_R['3FGA'], 2)

### FTM

In [129]:
y = list(train_df['Player_FTM'])


# Random Forest
rf_classR_FTM = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_FTM.fit(x, y)
rf_classR_FTM_results = list(rf_classR_FTM.predict(x_pred))

# XG Boost
xgb_classR_FTM = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_FTM.fit(x, y)
xgb_classR_FTM_results = list(xgb_classR_FTM.predict(x_pred))

# Linear Regression

lr_classR_FTM = LinearRegression().fit(x, y)
lr_classR_FTM_results = list(xgb_classR_FTM.predict(x_pred))

# Ensemble
classR_FTM = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_FTM_results, xgb_classR_FTM_results, lr_classR_FTM_results)]
predictions_2023_R['FTM'] = classR_FTM

### FTA

In [130]:
y = list(train_df['Player_FTA'])


# Random Forest
rf_classR_FTA = RandomForestRegressor(n_estimators=10, criterion='squared_error', \
                                max_depth=10, min_samples_split=2, min_samples_leaf=1, \
                                min_weight_fraction_leaf=0.0, max_features=1.0, \
                                max_leaf_nodes=None, min_impurity_decrease=0.0, \
                                bootstrap=True, oob_score=False, n_jobs=-1, \
                                random_state=None, verbose=0, warm_start=False, \
                                ccp_alpha=0.0, max_samples=None)

rf_classR_FTA.fit(x, y)
rf_classR_FTA_results = list(rf_classR_FTA.predict(x_pred))

# XG Boost
xgb_classR_FTA = xgb.XGBRegressor(n_estimators = 10, max_depth = 30, max_leaves = 10, \
                          learning_rate = 0.5, n_jobs = -1, gamma = 2)

xgb_classR_FTA.fit(x, y)
xgb_classR_FTA_results = list(xgb_classR_FTA.predict(x_pred))

# Linear Regression

lr_classR_FTA = LinearRegression().fit(x, y)
lr_classR_FTA_results = list(xgb_classR_FTA.predict(x_pred))

# Ensemble
classR_FTA = [round ((a + b + c) / 3, 2) for a, b, c in zip(rf_classR_FTA_results, xgb_classR_FTA_results, lr_classR_FTA_results)]
predictions_2023_R['FTA'] = classR_FTA
predictions_2023_R['FT%'] = round(predictions_2023_R['FTM'] / predictions_2023_R['FTA'], 2)

### PPG - Corrected

In [131]:
predictions_2023_R['PPGc'] = predictions_2023_R['FTM'] + 2 * predictions_2023_R['2FGM']+ 3 * predictions_2023_R['3FGM']

## Add Missing Rookies - Shaedon Sharpe

In [148]:
rookie_guards = predictions_2023_R[predictions_2023_R['Player_Position']=='G']
shaedon_sharpe = pd.DataFrame({
    'Player_Name': ['Shaedon Sharpe'], 
    'Player_Team': ['POR'], 
    'Player_Position': ['G'], 
    'PPG': [np.mean(list(rookie_guards['PPG']))], 
    'APG': [np.mean(list(rookie_guards['APG']))], 
    'ORB': [np.mean(list(rookie_guards['ORB']))],
    'DRB': [np.mean(list(rookie_guards['DRB']))], 
    'TRB': [np.mean(list(rookie_guards['TRB']))], 
    'STL': [np.mean(list(rookie_guards['STL']))], 
    'BLK': [np.mean(list(rookie_guards['BLK']))], 
    'TO': [np.mean(list(rookie_guards['TO']))], 
    'PF': [np.mean(list(rookie_guards['PF']))], 
    'MPG': [np.mean(list(rookie_guards['MPG']))], 
    '2FGM': [np.mean(list(rookie_guards['2FGM']))], 
    '2FGA': [np.mean(list(rookie_guards['2FGA']))], 
    '2FG%': [np.mean(list(rookie_guards['2FG%']))],
    '3FGM': [np.mean(list(rookie_guards['3FGM']))], 
    '3FGA': [np.mean(list(rookie_guards['3FGA']))], 
    '3FG%': [np.mean(list(rookie_guards['3FG%']))], 
    'FTM': [np.mean(list(rookie_guards['FTM']))], 
    'FTA':[np.mean(list(rookie_guards['FTA']))], 
    'FT%': [np.mean(list(rookie_guards['FT%']))], 
    'PPGc': [np.mean(list(rookie_guards['PPGc']))]
})
predictions_2023_R = pd.concat([predictions_2023_R, shaedon_sharpe], ignore_index = True)

# Part 10 - Combine Dataframes & Add Fantasy Scores

In [149]:
all_predictions_2023 = pd.concat([predictions_2023_R, predictions_2023_1, predictions_2023], ignore_index = True)

In [159]:
### ESPN
all_predictions_2023['ESPN'] = all_predictions_2023['PPG'] + all_predictions_2023['3FGM'] - all_predictions_2023['2FGA'] \
- all_predictions_2023['3FGA'] + all_predictions_2023['2FGM'] + all_predictions_2023['3FGM'] + all_predictions_2023['FTM'] \
- all_predictions_2023['FTA'] + all_predictions_2023['TRB'] + 2 * all_predictions_2023['APG'] + \
4 * all_predictions_2023['STL'] + 4 * all_predictions_2023['BLK'] - 2 * all_predictions_2023['TO']

In [163]:
### Yahoo
all_predictions_2023['Yahoo'] = 0.5 * all_predictions_2023['PPG'] + 3 * all_predictions_2023['3FGM'] \
- 0.45 * all_predictions_2023['2FGA'] - 0.45 * all_predictions_2023['3FGA'] + all_predictions_2023['2FGM'] + \
all_predictions_2023['3FGM'] + all_predictions_2023['FTM'] - 0.75 * all_predictions_2023['FTA'] + \
1.5 * all_predictions_2023['TRB'] + 2 * all_predictions_2023['APG'] + \
3 * all_predictions_2023['STL'] + 3 * all_predictions_2023['BLK'] - 2 * all_predictions_2023['TO']

In [166]:
### Sports Data IO
all_predictions_2023['SportsDataIO'] = 3 * all_predictions_2023['3FGM']  + 2 * all_predictions_2023['2FGM'] + \
1 * all_predictions_2023['FTM'] + 1.2 * all_predictions_2023['TRB'] + 1.5 * all_predictions_2023['APG'] + \
2 * all_predictions_2023['BLK'] + 2 * all_predictions_2023['STL'] - all_predictions_2023['TO']

In [171]:
### Average Fantasy Score
all_predictions_2023['Fantasy'] = round((all_predictions_2023['SportsDataIO'] + all_predictions_2023['Yahoo'] \
                                         + all_predictions_2023['ESPN']) / 3,2)

In [175]:
all_predictions_2023.to_csv('v1_2023_predictions.csv')