Author: Aydin Najl Hossaini  
Date: 24/08/2024

In [3]:
# Imports
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType, Team
from tqdm.notebook import  tqdm
import requests
from bs4 import BeautifulSoup

In [2]:
# Dictionary of team names from bball reference to the web scraper API team names
dict_teams = {"ATL" : Team.ATLANTA_HAWKS, "BOS" : Team.BOSTON_CELTICS, "BRK" : Team.BROOKLYN_NETS, "CHI" : Team.CHICAGO_BULLS, "CHO" : Team.CHARLOTTE_HORNETS, "CLE" : Team.CLEVELAND_CAVALIERS,
"DAL" : Team.DALLAS_MAVERICKS, "DEN" : Team.DENVER_NUGGETS, "DET" : Team.DETROIT_PISTONS, "GSW" : Team.GOLDEN_STATE_WARRIORS, "HOU" : Team.HOUSTON_ROCKETS, "IND" : Team.INDIANA_PACERS, "LAC" : Team.LOS_ANGELES_CLIPPERS, "LAL" : Team.LOS_ANGELES_LAKERS, "MEM" : Team.MEMPHIS_GRIZZLIES,
"MIA" : Team.MIAMI_HEAT, "MIL" : Team.MILWAUKEE_BUCKS, "MIN" : Team.MINNESOTA_TIMBERWOLVES, "NOP" : Team.NEW_ORLEANS_PELICANS, "NYK" : Team.NEW_YORK_KNICKS, "OKC" : Team.OKLAHOMA_CITY_THUNDER,
"ORL" : Team.ORLANDO_MAGIC, "PHI" : Team.PHILADELPHIA_76ERS, "PHO" : Team.PHOENIX_SUNS, "POR" : Team.PORTLAND_TRAIL_BLAZERS, "SAC" : Team.SACRAMENTO_KINGS, "SAS" : Team.SAN_ANTONIO_SPURS,
"TOR" : Team.TORONTO_RAPTORS, "UTA" : Team.UTAH_JAZZ, "WAS" : Team.WASHINGTON_WIZARDS}

def get_table_info(rows : list) -> list:
    """ Extracts the date, opponent, and home/away status from given rows """
    
    game_data = []
    for row in rows:
        if 'thead' in row.get('class', []):  # Skip header rows
            continue

        # Extract the date, opponent, and home/away status
        date = row.find('td', {'data-stat': 'date_game'}).text
        opponent = row.find('td', {'data-stat': 'opp_id'}).text
        home_game = 'Home' if not row.find('td', {'data-stat': 'game_location'}).contents else 'Away' # game_location column is empty for home game but has "@" symbol for away games, check if the field is empty to determine home/away status.

        # Append to the list
        game_data.append([date, home_game, opponent])

    return game_data

# Get all the games of Lebron in the 2018 season
url = "https://www.basketball-reference.com/players/j/jamesle01/gamelog/2018/"
soup = BeautifulSoup(requests.get(url).content, "html.parser") # Gets full page from URL and parses it with HTML parser

if soup is None:
    raise Exception("Failed to load page, check URL")
else: 
    table = soup.find('table', {'id': 'pgl_basic'})  # Find table by ID
    rows = table.find('tbody').find_all('tr') # Find all rows in the table

    game_data = get_table_info(rows)

    for row in game_data:
        row[0] = [int(i) for i in row[0].split("-")] # split the string into day, month, year and convert to integers and keep in the same list

game_data[:3]
        

[[[2017, 10, 17], 'Home', 'BOS'],
 [[2017, 10, 20], 'Away', 'MIL'],
 [[2017, 10, 21], 'Home', 'ORL']]

In [64]:
# Output all advanced player season totals for the 2017-2018 season in CSV format to 2018_10_06_BOS_PBP.csv
def scrape_games(game_data : list) -> list:
    """ Scrapes the play by play data for each game in the game_data list with the format [[year, month, day], home/away, opponent]
    and stores it in a CSV file in the pbp_games folder """

    for game in game_data:
        year, month, day = game[0]

        print(f"Writing play-by-play for Cavs game on {year}-{month}-{day} to CSV file")
        try: # Stores all PBP as CSV's in folder pbp_games
            if game[1] == "Home":
                client.play_by_play(home_team=Team.CLEVELAND_CAVALIERS, year=year, month=month, day=day, output_type=OutputType.CSV, output_file_path=f"pbp_games/{year}_{month}_{day}_CLE_PBP_HOME.csv")
            elif game[1] == "Away":
                client.play_by_play(home_team=dict_teams[game[2]], year=year, month=month, day=day, output_type=OutputType.CSV, output_file_path=f"pbp_games/{year}_{month}_{day}_CLE_PBP_AWAY.csv")
            else:
                print("Error in home/away")
        except Exception:
            print("Failed play by play")
            
# scrape_games(game_data)

Writing play-by-play for Cavs game on 2017-10-17 to CSV file
Writing play-by-play for Cavs game on 2017-10-20 to CSV file
Writing play-by-play for Cavs game on 2017-10-21 to CSV file
Writing play-by-play for Cavs game on 2017-10-24 to CSV file
Writing play-by-play for Cavs game on 2017-10-25 to CSV file
Writing play-by-play for Cavs game on 2017-10-28 to CSV file
Writing play-by-play for Cavs game on 2017-10-29 to CSV file
Writing play-by-play for Cavs game on 2017-11-1 to CSV file
Writing play-by-play for Cavs game on 2017-11-3 to CSV file
Writing play-by-play for Cavs game on 2017-11-5 to CSV file
Writing play-by-play for Cavs game on 2017-11-7 to CSV file
Writing play-by-play for Cavs game on 2017-11-9 to CSV file
Writing play-by-play for Cavs game on 2017-11-11 to CSV file
Writing play-by-play for Cavs game on 2017-11-13 to CSV file
Writing play-by-play for Cavs game on 2017-11-15 to CSV file
Writing play-by-play for Cavs game on 2017-11-17 to CSV file
Writing play-by-play for Cavs

# Data processing

In [32]:
# Filter the useful information from the play by play data
import pandas as pd
pd.options.mode.copy_on_write = True # Allows for chained assignment 

LBJ_life_ft_percent = 73.6

game = pd.read_csv("pbp_games/2017_10_17_CLE_PBP_HOME.csv")
game.dtypes

period                           int64
period_type                     object
remaining_seconds_in_period    float64
relevant_team                   object
away_team                       object
home_team                       object
away_score                       int64
home_score                       int64
description                     object
dtype: object

In [33]:
game.head(2)

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description
0,1,QUARTER,704.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,K. Irving makes 2-pt jump shot from 10 ft (ass...
1,1,QUARTER,687.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,D. Rose misses 2-pt layup from 1 ft (block by ...


## Decide on features
 1. Time remaining in the quarter
 2. Score difference
 3. Home/Away team
 4. Period
 5. how many made shots in the game (=FG%)  
     Obtain this through the description column and filter for "makes" and "misses" on player level
        Notable words:  
        misses 2-pt  
        misses 3-pt  
        misses free throw  
        makes free throw  
        makes 2-pt  
        makes 3-pt  
6. minutes played


In [34]:
# Add unique event id for join
game["event_id"] = range(1, len(game) + 1)

In [35]:
# Transfer period type + remaining_seconds_in_period to a time format
# Full quarter has 720 seconds, therefore 720 - remaining_seconds_in_period = seconds played in the quarter
# If quarter is 2, 2*720 - remaining_seconds_in_period ;  3 + 3*720 - ..., 4 + 4*720 - ... 
# Adds game_time passed column
game["game_time"] = game.apply(lambda x: x['period']*720 - x['remaining_seconds_in_period'], axis=1)
game.tail()

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description,event_id,game_time
460,4,QUARTER,6.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,Defensive rebound by J. Tatum,461,2874.0
461,4,QUARTER,2.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,J. Brown misses 3-pt jump shot from 25 ft,462,2878.0
462,4,QUARTER,2.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,Offensive rebound by K. Irving,463,2878.0
463,4,QUARTER,0.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,K. Irving misses 3-pt jump shot from 26 ft,464,2880.0
464,4,QUARTER,0.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,Offensive rebound by Team,465,2880.0


## Add score difference column

In [36]:
# Necessary function due to some games played away
def game_diff(row):
    """ Calculate the score difference for each game """

    if row["home_team"] == "CLEVELAND CAVALIERS":
        return row["home_score"] - row["away_score"]
    elif row["away_team"] == "CLEVELAND CAVALIERS":
        return row["away_score"] - row["home_score"]
    else:
        raise Exception("Cavs not in the game")
    
# Adds score diff column
game["score_diff"] = game.apply(game_diff, axis=1)
game.head(2)

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description,event_id,game_time,score_diff
0,1,QUARTER,704.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,K. Irving makes 2-pt jump shot from 10 ft (ass...,1,16.0,-2
1,1,QUARTER,687.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,D. Rose misses 2-pt layup from 1 ft (block by ...,2,33.0,-2


## Plays at home column

In [37]:
# Adds at_home column
game["at_home"] = game["home_team"] == "CLEVELAND CAVALIERS"
game.head(2)

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description,event_id,game_time,score_diff,at_home
0,1,QUARTER,704.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,K. Irving makes 2-pt jump shot from 10 ft (ass...,1,16.0,-2,True
1,1,QUARTER,687.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,D. Rose misses 2-pt layup from 1 ft (block by ...,2,33.0,-2,True
2,1,QUARTER,683.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,Defensive rebound by A. Horford,3,37.0,-2,True
3,1,QUARTER,681.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,G. Hayward misses 3-pt jump shot from 25 ft,4,39.0,-2,True
4,1,QUARTER,678.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,Defensive rebound by D. Rose,5,42.0,-2,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,4,QUARTER,6.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,Defensive rebound by J. Tatum,461,2874.0,3,True
461,4,QUARTER,2.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,J. Brown misses 3-pt jump shot from 25 ft,462,2878.0,3,True
462,4,QUARTER,2.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,Offensive rebound by K. Irving,463,2878.0,3,True
463,4,QUARTER,0.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,K. Irving misses 3-pt jump shot from 26 ft,464,2880.0,3,True


In [None]:
# Playing time very hard to quantify due to starters and starters of quarters being unknown

# game[game["description"].str.contains("enters the game for L. James")]
# game[game["description"].str.contains("L. James enters the game for")]


## Calculating cum FGM with new DF

In [38]:
# Selects all the field goals made by Lebron
field_goals = game[game["description"].str.contains("L. James") & game["relevant_team"].str.contains("CLE") 
                   &~ game["description"].str.contains("assist by L. James") &~ game["description"].str.contains("free throw")]
     

made_mask = field_goals['description'].str.contains('makes 2-pt|makes 3-pt')
attempted_mask = field_goals['description'].str.contains('makes 2-pt|misses 2-pt|makes 3-pt|misses 3-pt')

# Adds field goals made and attempted columns
field_goals.loc[:, 'field_goals_made'] = made_mask.astype(int)
field_goals.loc[:, 'field_goals_attempted'] = attempted_mask.astype(int)

# Create cumulative totals columns using .loc
field_goals.loc[:, 'cumulative_field_goals_made'] = field_goals['field_goals_made'].cumsum()
field_goals.loc[:, 'cumulative_field_goals_attempted'] = field_goals['field_goals_attempted'].cumsum()

field_goals.tail(3)




Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description,event_id,game_time,score_diff,at_home,field_goals_made,field_goals_attempted,cumulative_field_goals_made,cumulative_field_goals_attempted
448,4,QUARTER,79.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,98,99,L. James makes 2-pt layup from 2 ft,449,2801.0,1,True,1,1,12,18
458,4,QUARTER,31.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,Defensive rebound by L. James,459,2849.0,3,True,0,0,12,18
459,4,QUARTER,9.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,99,102,L. James misses 3-pt jump shot from 26 ft,460,2871.0,3,True,0,1,12,19


## Merge dataframes to contain all features

In [39]:
# Merge the game and field goals dataframes to get the cumulative field goals made and attempted
joined_df = pd.merge(game, 
                     field_goals[['event_id', 'cumulative_field_goals_made', 'cumulative_field_goals_attempted']],
                     on='event_id', 
                     how='left')

# Forward fill the NaN values so highest number is used
joined_df['cumulative_field_goals_made'] = joined_df['cumulative_field_goals_made'].ffill().fillna(0)
joined_df['cumulative_field_goals_attempted'] = joined_df['cumulative_field_goals_attempted'].ffill().fillna(0)


joined_df.iloc[155:160, :]

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description,event_id,game_time,score_diff,at_home,cumulative_field_goals_made,cumulative_field_goals_attempted
155,2,QUARTER,458.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,36,J. Brown enters the game for T. Rozier,156,982.0,11,True,4.0,6.0
156,2,QUARTER,454.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,36,Personal foul by S. Ojeleye (drawn by L. James),157,986.0,11,True,4.0,6.0
157,2,QUARTER,454.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,37,L. James makes free throw 1 of 2,158,986.0,12,True,4.0,6.0
158,2,QUARTER,454.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,38,L. James makes free throw 2 of 2,159,986.0,13,True,4.0,6.0
159,2,QUARTER,442.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,38,J. Brown misses 2-pt layup from 1 ft,160,998.0,13,True,4.0,6.0


## Select free throws rows for prediction

In [40]:
# Select all the free throws attempted by Lebron
joined_df[joined_df["description"].str.contains("L. James") & joined_df["description"].str.contains("free throw")] # 2. Lebron made 8 free throws in the game

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description,event_id,game_time,score_diff,at_home,cumulative_field_goals_made,cumulative_field_goals_attempted
157,2,QUARTER,454.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,37,L. James makes free throw 1 of 2,158,986.0,12,True,4.0,6.0
158,2,QUARTER,454.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,38,L. James makes free throw 2 of 2,159,986.0,13,True,4.0,6.0
384,4,QUARTER,519.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,79,79,L. James makes free throw 1 of 2,385,2361.0,0,True,8.0,13.0
385,4,QUARTER,519.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,79,80,L. James makes free throw 2 of 2,386,2361.0,1,True,8.0,13.0


In [41]:
# If madea free throw is in the description, add a column free_throw_made
joined_df["free_throw_made"] = joined_df["description"].str.contains("makes free throw").astype(int)


In [42]:
# Create free throws df
free_throws = joined_df[joined_df["description"].str.contains("L. James") & joined_df["description"].str.contains("free throw")]

# Machine Learning

## Imports

In [73]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Selecting features 

In [68]:
X = free_throws.drop(columns=['away_team', 'home_team', 'away_score', 'home_score', 'description', 'relevant_team', 'period_type', "event_id", "free_throw_made"])
X["at_home"]  = X["at_home"].astype(int)
y = free_throws["free_throw_made"]

### Scaling columns

In [61]:
# Scale the continuous columns 
cols_to_scale = ['remaining_seconds_in_period', 'score_diff', 
                 'cumulative_field_goals_made', 'cumulative_field_goals_attempted']

X[cols_to_scale] = StandardScaler().fit_transform(X[cols_to_scale])
X

Unnamed: 0,period,remaining_seconds_in_period,game_time,score_diff,at_home,cumulative_field_goals_made,cumulative_field_goals_attempted
157,2,-1.0,986.0,0.9135,1,-1.0,-1.0
158,2,-1.0,986.0,1.079591,1,-1.0,-1.0
384,4,1.0,2361.0,-1.079591,1,1.0,1.0
385,4,1.0,2361.0,-0.9135,1,1.0,1.0


### One hot encode categorical features

In [62]:
# One hot encode the period column
dum_df = pd.get_dummies(X["period"], prefix="period", dtype=int)
dum_df = dum_df.join(X)
X = dum_df.drop(columns=["period"])

X

Unnamed: 0,period,remaining_seconds_in_period,game_time,score_diff,at_home,cumulative_field_goals_made,cumulative_field_goals_attempted
157,2,-1.0,986.0,0.9135,1,-1.0,-1.0
158,2,-1.0,986.0,1.079591,1,-1.0,-1.0
384,4,1.0,2361.0,-1.079591,1,1.0,1.0
385,4,1.0,2361.0,-0.9135,1,1.0,1.0


In [64]:
y

Unnamed: 0,event_id,free_throw_made
157,158,1
158,159,1
384,385,1
385,386,1


## Splitting data

In [69]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [70]:
y_train

385    1
157    1
384    1
Name: free_throw_made, dtype: int64

## Model development and prediction

In [71]:
# TODO does not work because all values are 1 
# logreg = LogisticRegression(random_state=16)

# logreg.fit(X_train, y_train)

# y_pred = logreg.predict(X_test)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)

## Evaluation

In [None]:
# import the metrics class

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix