# NBA Scoring Analysis

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pprint import pprint
from tqdm import tqdm
import matplotlib.pyplot as plt
from numpy.linalg import inv
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Pick a player

In [2]:
# pick a player (LeBron, Luka, KD, Giannis, Steph)
str_player = 'LeBron'

## Define functions

In [3]:
# define function to get table from url
def get_table(str_url):
    r = requests.get(str_url)
    # get content of page
    soup = BeautifulSoup(r.content, 'html.parser')
    # get the table
    table = soup.find('table', {'id': 'pgl_basic'})
    return table

In [4]:
# linear regression class
class OLSRegression:
    # initialize class
    def __init__(self):
        pass
    # fit the model
    def fit(self, X, y):
        # ordinary least squares b = (XT*X)^-1 * XT*y
        arr_betas = inv(X.T.dot(X)).dot(X.T).dot(y)
        # create dictionary
        dict_col_betas = dict(zip(X.columns, arr_betas))
        # save to class
        self.dict_col_betas = dict_col_betas
        # return self
        return self
    # generate predictions
    def predict(self, X):
        # multiply each cell by its beta
        list_predictions = list(X.dot(pd.Series(self.dict_col_betas)))
        # save to class
        self.list_predictions = list_predictions
        # return self
        return self
    # evaluate performance
    def evaluate(self, y):
        # get mean absolute error
        mae = mean_absolute_error(y_true=y, y_pred=self.list_predictions)
        # get mean squared error
        mse = mean_squared_error(y_true=y, y_pred=self.list_predictions)
        # get root mean square error
        rmse = np.sqrt(mse)
        # get explained variance (R2)
        r2 = r2_score(y_true=y, y_pred=self.list_predictions)
        # create dictionary
        dict_eval_metrics = {'mae': mae, 'mse': mse, 'rmse': rmse, 'r2': r2}
        # return dict_eval_metrics
        return dict_eval_metrics

## Input dictionary

In [5]:
dict_input = {
    'LeBron': {
        'int_start_year': 2004,
        'str_name': '/j/jamesle01',
    },
    'Luka': {
        'int_start_year': 2019,
        'str_name': 'd/doncilu01',
    },
    'KD': {
        'int_start_year': 2008,
        'str_name': 'd/duranke01',
    },
    'Giannis': {
        'int_start_year': 2014,
        'str_name': 'a/antetgi01',
    },
    'Steph': {
        'int_start_year': 2010,
        'str_name': 'c/curryst01',
    },
    
}

## Make output directory

In [6]:
str_dirname_output = f'./{str_player}'
try:
    os.mkdir(str_dirname_output)
except FileExistsError:
    pass

## Scrape all games played

In [7]:
# create list of years
int_start_year = dict_input[str_player]['int_start_year'] # rookie season
int_end_year = 2023 # current season
list_int_year = list(range(int_start_year, int_end_year+1))
#pprint(list_int_year)

In [8]:
# create df
list_dict_row = []
for int_year in tqdm (list_int_year):
    # get table
    str_url = f"https://www.basketball-reference.com/players/{dict_input[str_player]['str_name']}/gamelog/{int_year}"
    table = get_table(str_url=str_url)
    # get the rows
    list_rows = table.findAll('tr')[1:]
    for row in list_rows:
        # get columns
        list_columns = row.findAll('td')
        try:
            # get columns
            list_columns = row.findAll('td')
            # create row
            dict_row = {
                'year': int_year,
                'game_number': int(list_columns[0].text),
                'date': list_columns[1].text,
                'points': int(list_columns[26].text),
            }
            # append
            list_dict_row.append(dict_row)
        except IndexError:
            pass
        except ValueError:
            pass

# create df
df = pd.DataFrame(list_dict_row)
# save locally
str_filename = 'df.csv'
str_local_path = f'{str_dirname_output}/{str_filename}'
df.to_csv(str_local_path, index=False)
# show
df

  0%|                                                    | 0/20 [00:00<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'findAll'

## Prep data

In [None]:
str_target = 'cumulative_points'
str_filename = 'df.csv'
str_local_path = f'{str_dirname_output}/{str_filename}'
df = pd.read_csv(str_local_path)
# get total games played
df['total_games_played'] = list(range(1, df.shape[0]+1))
# get cumulative points
df[str_target] = np.cumsum(df['points'])
# show
df

## Plot

In [None]:
fig, ax = plt.subplots(figsize=(10,7))
ax.set_title(f'Cumulative Points by Total Games Played ({str_player})')
ax.set_xlabel('Total Games Played')
ax.set_ylabel('Cumulative Points')
ax.plot(df['total_games_played'], df[str_target])
str_filename = 'plt_trend.png'
str_local_path = f'{str_dirname_output}/{str_filename}'
plt.savefig(str_local_path, bbox_inches='tight')
plt.show()

## Fit OLS Regressor

In [None]:
# train-test split
int_n_rows_all = df.shape[0]
print(f'There are {int_n_rows_all} rows in df')
flt_prop_train = 0.66
int_n_rows_train = round(int_n_rows_all * flt_prop_train)
df_train = df.copy().iloc[:int_n_rows_train, :]
print(f'There are {int_n_rows_train} rows in df_train')
df_test = df.copy().iloc[int_n_rows_train:, :]
print(f'There are {df_test.shape[0]} rows in df_test')

In [None]:
# X y split
list_cols = [
    'total_games_played',
]
X_train = df_train[list_cols]
y_train = df_train[str_target]
X_test = df_test[list_cols]
y_test = df_test[str_target]

In [None]:
# initialize model
cls_model_inference = OLSRegression()
# fit model on training data
cls_model_inference.fit(
    X=X_train,
    y=y_train,
)
dict_col_betas = cls_model_inference.dict_col_betas
print('Beta coefficients:')
print(dict_col_betas)
print('')

# in-sample predictions
cls_model_inference.predict(X=X_train)
list_predictions_train = cls_model_inference.list_predictions
# in-sample evaluation
dict_eval_metrics = cls_model_inference.evaluate(y=y_train)
print('In-Sample:')
pprint(dict_eval_metrics)
print('')

# out-of-sample predictions
cls_model_inference.predict(X=X_test)
list_predictions_test = cls_model_inference.list_predictions
# out-of-sample evaluation
dict_eval_metrics = cls_model_inference.evaluate(y=y_test)
print('Out-of-sample:')
pprint(dict_eval_metrics)

## Plot predicted and actual

In [None]:
fig, ax = plt.subplots(figsize=(10,7))
y_hat = list_predictions_train + list_predictions_test
ax.set_title(f'Predicted and Actual Cumulative Points by Total Games Played ({str_player})')
ax.plot(df['total_games_played'], y_hat, color='tab:blue', label='Predicted') # predicted
ax.plot(df['total_games_played'], df[str_target], color='tab:red', label='Actual') # actual
ax.axvline(np.max(df_train['total_games_played']), linestyle='--', label='End of Train') # end of training data
plt.legend()
str_filename = 'plt_y_hat.png'
str_local_path = f'{str_dirname_output}/{str_filename}'
plt.savefig(str_local_path, bbox_inches='tight')
plt.show()

## Find the number of career games at or above 38,388 career points

In [None]:
int_pts_goal = 38388
flt_predicted_points = 0.0
int_total_games_played_start = np.max(df['total_games_played'])+1
list_dict_row = []
while flt_predicted_points < int_pts_goal:
    # get predicted points
    flt_predicted_points = dict_col_betas['total_games_played'] * int_total_games_played_start
    # make row
    dict_row = {
        'total_games_played': int_total_games_played_start,
        'cumulative_points': flt_predicted_points,
    }
    list_dict_row.append(dict_row)
    # increase int_total_games_played_start
    int_total_games_played_start += 1

# df
df_extrapolate = pd.DataFrame(list_dict_row)
str_filename = 'df_extrapolate.csv'
str_local_path = f'{str_dirname_output}/{str_filename}'
df_extrapolate.to_csv(str_local_path, index=False)
# message
int_career_game_record = np.max(df_extrapolate['total_games_played'])
print(f'{str_player} is predicted to break the scoring record ({int_pts_goal-1}) in his {int_career_game_record}th career game')
int_n_games_season = 82
flt_seasons = df_extrapolate.shape[0] / int_n_games_season
print(f'This will take about {flt_seasons:0.2f} seasons ({int_n_games_season} games per season)')
# show
df_extrapolate