Author: Aydin Najl Hossaini  
Date: 24/08/2024

# Imports

In [1]:
import pandas as pd
from tqdm.notebook import  tqdm
import os

In [None]:
# All game ids in pbp_games folder
game_names = [name for name in os.listdir("pbp_games") if name.endswith("csv")]

game_ids = [id for id in range(1, len(game_names) + 1)]
game_id_map = dict(zip(game_names, game_ids))



dict_values([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21])

# Data processing

## Options

In [2]:
pd.options.mode.copy_on_write = True # Allows for chained assignment 
LBJ_life_ft_percent = 73.6

## Game ids

In [3]:
# All game ids in pbp_games folder
game_names = [name for name in os.listdir("pbp_games") if name.endswith("csv")]

game_ids = [id for id in range(1, len(game_names) + 1)]
game_id_map = dict(zip(game_names, game_ids))


In [5]:
# Test one game
game = pd.read_csv("pbp_games/2017_10_17_CLE_PBP_HOME.csv")
game.info()

# TODO: Add game id column to each game
# TODO: functionize each column add section so that it can be applied to each game
# for name, id in game_id_map.items():
#     game = pd.read_csv("pbp_games/" + name)
#     game["game_id"] = id
    


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465 entries, 0 to 464
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   period                       465 non-null    int64  
 1   period_type                  465 non-null    object 
 2   remaining_seconds_in_period  465 non-null    float64
 3   relevant_team                465 non-null    object 
 4   away_team                    465 non-null    object 
 5   home_team                    465 non-null    object 
 6   away_score                   465 non-null    int64  
 7   home_score                   465 non-null    int64  
 8   description                  465 non-null    object 
dtypes: float64(1), int64(3), object(5)
memory usage: 32.8+ KB


In [2]:
game.head(2)

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description
0,1,QUARTER,704.0,BOSTON CELTICS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,K. Irving makes 2-pt jump shot from 10 ft (ass...
1,1,QUARTER,687.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,2,0,D. Rose misses 2-pt layup from 1 ft (block by ...


## Decide on features
 1. Time remaining in the quarter
 2. Score difference
 3. Home/Away team
 4. Period
 5. how many made shots in the game (=FG%)  
     Obtain this through the description column and filter for "makes" and "misses" on player level
        Notable words:  
        misses 2-pt  
        misses 3-pt  
        misses free throw  
        makes free throw  
        makes 2-pt  
        makes 3-pt  
6. minutes played


In [7]:
# Add unique event id for join
def add_event_id_col(game : pd.DataFrame):
    game["event_id"] = range(1, len(game) + 1)
    return game

In [8]:
# Transfer period type + remaining_seconds_in_period to a time format
# Full quarter has 720 seconds, therefore 720 - remaining_seconds_in_period = seconds played in the quarter
# If quarter is 2, 2*720 - remaining_seconds_in_period ;  3 + 3*720 - ..., 4 + 4*720 - ... 
# Adds game_time passed column
def add_game_time_col(game: pd.DataFrame):
    game["game_time"] = game.apply(lambda x: x['period']*720 - x['remaining_seconds_in_period'], axis=1)
    return game

## Add score difference column

In [11]:
# Necessary function due to some games played away
def game_diff(row):
    """ Calculate the score difference for each game """

    if row["home_team"] == "CLEVELAND CAVALIERS":
        return row["home_score"] - row["away_score"]
    elif row["away_team"] == "CLEVELAND CAVALIERS":
        return row["away_score"] - row["home_score"]
    else:
        raise Exception("Cavs not in the game")
    
# Adds score diff column
def add_score_diff_col(game: pd.DataFrame):
    game["score_diff"] = game.apply(game_diff, axis=1)
    return game

## Plays at home column

In [12]:
# Adds at_home column
def add_at_home_col(game: pd.DataFrame):
    game["at_home"] = game["home_team"] == "CLEVELAND CAVALIERS"
    return game

In [7]:
# Playing time very hard to quantify due to starters and starters of quarters being unknown

# game[game["description"].str.contains("enters the game for L. James")]
# game[game["description"].str.contains("L. James enters the game for")]


## Calculating cum FGM with new DF

In [14]:
def create_fg_df(game: pd.DataFrame):
    '''
    Selects Lebron FGs based on keywords and adds cumulative count of FGs as column
    '''
    # Selects all the field goals made by Lebron
    field_goals = game[game["description"].str.contains("L. James") & game["relevant_team"].str.contains("CLE") 
                    &~ game["description"].str.contains("assist by L. James") &~ game["description"].str.contains("free throw")]
        

    made_mask = field_goals['description'].str.contains('makes 2-pt|makes 3-pt')
    attempted_mask = field_goals['description'].str.contains('makes 2-pt|misses 2-pt|makes 3-pt|misses 3-pt')

    # Adds field goals made and attempted columns
    field_goals.loc[:, 'field_goals_made'] = made_mask.astype(int)
    field_goals.loc[:, 'field_goals_attempted'] = attempted_mask.astype(int)

    # Create cumulative totals columns using .loc
    field_goals.loc[:, 'cumulative_field_goals_made'] = field_goals['field_goals_made'].cumsum()
    field_goals.loc[:, 'cumulative_field_goals_attempted'] = field_goals['field_goals_attempted'].cumsum()
    
    return field_goals




## Merge dataframes to contain all features

In [4]:
def add_fg_cols(game: pd.DataFrame):
    '''
    Create new Dataframe including cum FGM and cum FGA
    '''
    # Create new DF with FGs 
    field_goals = create_fg_df(game)

    # Merge the game and field goals dataframes to get the cumulative field goals made and attempted
    joined_df = pd.merge(game, 
                        field_goals[['event_id', 'cumulative_field_goals_made', 'cumulative_field_goals_attempted']],
                        on='event_id', 
                        how='left')

    # Forward fill the NaN values so highest number is used
    joined_df['cumulative_field_goals_made'] = joined_df['cumulative_field_goals_made'].ffill().fillna(0)
    joined_df['cumulative_field_goals_attempted'] = joined_df['cumulative_field_goals_attempted'].ffill().fillna(0)
    
    return joined_df


## Select free throws rows for prediction

In [19]:
def create_ft_df(joined_df: pd.DataFrame):

    # Select all the free throws attempted by Lebron
    joined_df[joined_df["description"].str.contains("L. James") & joined_df["description"].str.contains("free throw")] # 2. Lebron made 8 free throws in the game
    
    # If madea free throw is in the description, add a column free_throw_made
    joined_df["free_throw_made"] = joined_df["description"].str.contains("makes free throw").astype(int)

    free_throws = joined_df[joined_df["description"].str.contains("L. James") & joined_df["description"].str.contains("free throw")]

    return free_throws

In [23]:
def preprocess_game(game: pd.DataFrame):
    '''
    Adds multiple columns to the dataframe
    '''
    game = add_event_id_col(game)
    game = add_game_time_col(game)
    game = add_score_diff_col(game)
    game = add_at_home_col(game)
    return game

game = preprocess_game(game)
joined_df = add_fg_cols(game)
free_throws = create_ft_df(joined_df)

free_throws

Unnamed: 0,period,period_type,remaining_seconds_in_period,relevant_team,away_team,home_team,away_score,home_score,description,event_id,game_time,score_diff,at_home,cumulative_field_goals_made,cumulative_field_goals_attempted
157,2,QUARTER,454.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,37,L. James makes free throw 1 of 2,158,986.0,12,True,4.0,6.0
158,2,QUARTER,454.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,25,38,L. James makes free throw 2 of 2,159,986.0,13,True,4.0,6.0
384,4,QUARTER,519.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,79,79,L. James makes free throw 1 of 2,385,2361.0,0,True,8.0,13.0
385,4,QUARTER,519.0,CLEVELAND CAVALIERS,BOSTON CELTICS,CLEVELAND CAVALIERS,79,80,L. James makes free throw 2 of 2,386,2361.0,1,True,8.0,13.0


# Machine Learning

## Imports

In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Selecting features 

In [14]:
X = free_throws.drop(columns=['away_team', 'home_team', 'away_score', 'home_score', 'description', 'relevant_team', 'period_type', "event_id", "free_throw_made"])
X["at_home"]  = X["at_home"].astype(int)
y = free_throws["free_throw_made"]

### Scaling columns

In [15]:
# Scale the continuous columns 
cols_to_scale = ['remaining_seconds_in_period', 'score_diff', 
                 'cumulative_field_goals_made', 'cumulative_field_goals_attempted']

X[cols_to_scale] = StandardScaler().fit_transform(X[cols_to_scale])
X

Unnamed: 0,period,remaining_seconds_in_period,game_time,score_diff,at_home,cumulative_field_goals_made,cumulative_field_goals_attempted
157,2,-1.0,986.0,0.9135,1,-1.0,-1.0
158,2,-1.0,986.0,1.079591,1,-1.0,-1.0
384,4,1.0,2361.0,-1.079591,1,1.0,1.0
385,4,1.0,2361.0,-0.9135,1,1.0,1.0


### One hot encode categorical features

In [16]:
# One hot encode the period column
dum_df = pd.get_dummies(X["period"], prefix="period", dtype=int)
dum_df = dum_df.join(X)
X = dum_df.drop(columns=["period"])

X

Unnamed: 0,period_2,period_4,remaining_seconds_in_period,game_time,score_diff,at_home,cumulative_field_goals_made,cumulative_field_goals_attempted
157,1,0,-1.0,986.0,0.9135,1,-1.0,-1.0
158,1,0,-1.0,986.0,1.079591,1,-1.0,-1.0
384,0,1,1.0,2361.0,-1.079591,1,1.0,1.0
385,0,1,1.0,2361.0,-0.9135,1,1.0,1.0


In [17]:
y

157    1
158    1
384    1
385    1
Name: free_throw_made, dtype: int64

## Splitting data

In [18]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [19]:
y_train

385    1
157    1
384    1
Name: free_throw_made, dtype: int64

## Model development and prediction

In [20]:
# TODO does not work because all values are 1 
# logreg = LogisticRegression(random_state=16)

# logreg.fit(X_train, y_train)

# y_pred = logreg.predict(X_test)

## Evaluation

In [21]:
# import the metrics class

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

NameError: name 'y_pred' is not defined