## 1. Import Required Libraries

In [3]:
import pandas as pd 
import numpy as np 
import joblib
from sklearn.preprocessing import StandardScaler
AdvancedBoxscoreFileName = "C:/Users/aureb/OneDrive - Sport-Data/Documents/TTFL/NBA_project_ML/databases/nba_future_games_df.csv"

## 2. Load the Saved Model

In [22]:
# Load trained LightGBM model
model_path = "../models/best_lgbm_model_v2.pkl"
best_lgbm = joblib.load(model_path)

In [23]:
print(best_lgbm.feature_name_)

['Column_0', 'Column_1', 'Column_2', 'Column_3', 'Column_4', 'Column_5', 'Column_6', 'Column_7', 'Column_8', 'Column_9', 'Column_10', 'Column_11', 'Column_12', 'Column_13', 'Column_14', 'Column_15', 'Column_16', 'Column_17', 'Column_18', 'Column_19', 'Column_20', 'Column_21', 'Column_22', 'Column_23', 'Column_24', 'Column_25', 'Column_26', 'Column_27', 'Column_28', 'Column_29', 'Column_30', 'Column_31', 'Column_32', 'Column_33', 'Column_34', 'Column_35', 'Column_36', 'Column_37', 'Column_38', 'Column_39', 'Column_40', 'Column_41', 'Column_42', 'Column_43', 'Column_44', 'Column_45', 'Column_46', 'Column_47', 'Column_48', 'Column_49', 'Column_50']


## 3. Load Data

In [12]:
# Load data
boxscore_df = pd.read_csv('../../databases/nba_boxscore_basic.csv', low_memory=False)
advanced_boxscore = pd.read_csv('../../databases/nba_boxscore_advanced.csv', low_memory=False)
players_df = pd.read_csv('../../databases/nba_players_df.csv')
future_games_df = pd.read_csv("../../databases/nba_future_games_df.csv") 

## 4. Merge, Select and Prepare the Model Features

In [13]:
# 1. Merge player metadata (keep only relevant columns)
full_df = boxscore_df.merge(
    players_df[['PERSON_ID', 'HEIGHT', 'WEIGHT', 'POSITION']],
    left_on='personId', right_on='PERSON_ID', how='left'
).drop('PERSON_ID', axis=1)

# 2. Merge advanced stats, keeping only new columns
# Find columns in advanced_boxscore that are not in boxscore_df (except keys)
merge_keys = ['game_id', 'personId', 'teamId']
adv_new_cols = [col for col in advanced_boxscore.columns if col not in boxscore_df.columns or col in merge_keys]

full_df = full_df.merge(
    advanced_boxscore[adv_new_cols],
    on=merge_keys, how='left'
)

### Prepare the DataFrame

In [14]:
# Prepare the DataFrame 
#  Create a copy of the DataFrame for processing
df_to_process = full_df.copy()
# Transform minnutes from string to float
df_to_process['minutes'] = df_to_process['minutes'].apply(lambda x: float(x.split(':')[0]) if pd.notnull(x) else 0)
# fill NaN values in 'position' witch 'BENCH'
df_to_process['position'] = df_to_process['position'].fillna('BENCH')
# Create a new column 'position_group' based on 'POSITION' and 'position' 
df_to_process['position_group'] = df_to_process.apply(
    lambda x: 'G' if x['position'] in ('G', 'BENCH') and x['POSITION'] in ('G', 'G-F') else
              'F' if x['position'] in ('F', 'BENCH') and x['POSITION'] in ('F', 'F-G', 'F-C') else
              'C' if x['position'] in ('C', 'BENCH') and x['POSITION'] in ('C', 'C-F') else x['position'],
    axis=1
)
# Remove rows
df_to_process: pd.DataFrame = df_to_process[df_to_process['comment'].isna() | df_to_process['minutes'].notna()]  # Remove DNP rows
# Change column date type to datetime 
df_to_process['game_date'] = pd.to_datetime(df_to_process['game_date'])
# Add a season column based on the game_id 
df_to_process['season'] = df_to_process['game_id'].astype(str).str[1:3].astype(int) + 2000
# Feature engineering
df_to_process['is_home'] = df_to_process['teamId'] == df_to_process['home_team_id']
df_to_process['opponent'] = np.where(df_to_process['is_home'], df_to_process['visitor_team_id'], df_to_process['home_team_id'])

In [15]:
# 1. Make sure your dates are true datetimes, and filter out bench players
df = df_to_process[df_to_process['position'] != 'BENCH']

# 2. compute one avg_points per group/opponent/game_date
df_avg = (
    df
    .groupby(['position_group','opponent','game_date'])['points']
    .mean()
    .reset_index(name='avg_points')
)

# 3. sort so tail() really pulls the last N by date
df_avg = df_avg.sort_values(['position_group','opponent','game_date'])

# 4. aggregate per (position_group, opponent)
result = (
    df_avg
    .groupby(['position_group','opponent'])
    .apply(lambda g: pd.Series({
        'avg_pts_opp_position_last_10': g['avg_points'].tail(10).mean(),
        'avg_pts_opp_position_last_20': g['avg_points'].tail(20).mean(),
        'avg_pts_opp_position_all'   : g['avg_points'].mean()
    }))
    .reset_index()
)

# 5. Put these stats back on final_df 
final_df = (
    df_to_process
    .merge(result[['position_group','opponent','avg_pts_opp_position_last_10','avg_pts_opp_position_last_20','avg_pts_opp_position_all']],
           on=['position_group','opponent'],
           how='left')
)

### 4.3 Derive Per-36-Minute and Per-Possession Rates
Normalizing by playing time and pace gives features comparable across starters and bench.

In [17]:
# First, compute per-36 metrics useful for player points production
#Create a list of stats to calculate per-36 metrics
keys_stats = [
    'usagePercentage',
    'trueShootingPercentage',
    'effectiveFieldGoalPercentage',
    'offensiveRating',
    'freeThrowsMade',
    'threePointersMade',
    'fieldGoalsMade', 
    'avg_pts_opp_position_all',
    'avg_pts_opp_position_last_10',
    'avg_pts_opp_position_last_20'
]
for stat in keys_stats:
    per36 = f"{stat}_per36"
    final_df[per36] = final_df[stat] / final_df['minutes'] * 36

# And per-possession metrics
for stat in keys_stats:
    ppp = f"{stat}_per_poss"
    final_df[ppp] = final_df[stat] / final_df['possessions']

### 4.4 Rolling Stats 
We rolling only the advanced/rate features

In [18]:
# Rolling the per-36 and per-possesion metrics 
rolling_periods = [5, 10, 20]
keys_stats_without_historical = [
    'usagePercentage',
    'trueShootingPercentage',
    'effectiveFieldGoalPercentage',
    'offensiveRating',
    'freeThrowsMade',
    'threePointersMade',
    'fieldGoalsMade'
]
for period in keys_stats_without_historical:
    for rolling_period in rolling_periods:
        per36 = f"{period}_per36"
        per_poss = f"{period}_per_poss"
        final_df[f"{per36}_rolling_{rolling_period}"] = final_df.groupby('personId')[per36].transform(lambda x: x.rolling(rolling_period, min_periods=1).mean())
        final_df[f"{per_poss}_rolling_{rolling_period}"] = final_df.groupby('personId')[per_poss].transform(lambda x: x.rolling(rolling_period, min_periods=1).mean())

### 4.5 Select Numeric Features
We select the numeric features engineered in the previous steps—including all rolling statistics and historical averages—to serve as input variables for our machine learning model

In [19]:
# Select numeric columns
numeric_feats = []
for rolling_period in rolling_periods:  
    numeric_feats.extend([
        f"{s}_per36_rolling_{rolling_period}" for s in keys_stats_without_historical
    ])
    numeric_feats.extend([
        f"{s}_per_poss_rolling_{rolling_period}" for s in keys_stats_without_historical
    ])
# Add historical averages
numeric_feats.extend([
    'avg_pts_opp_position_last_10_per36',
     'avg_pts_opp_position_last_20_per36',
    'avg_pts_opp_position_all_per36',
    'avg_pts_opp_position_last_10_per_poss',
    'avg_pts_opp_position_last_20_per_poss',
    'avg_pts_opp_position_all_per_poss'
])  

### 4.6 Categorical Features & Encoding

In [20]:
# Create categorical features 
categorical_feats = [
    'is_home',
    'season'
]
# encode categorical features
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_categorical = encoder.fit_transform(final_df[categorical_feats])

# put the encoded categorical features back into the DataFrame
final_df = final_df.drop(categorical_feats, axis=1)
final_df = pd.concat([final_df, pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_feats))], axis=1)

## 5. Prepare Input DataFrame for the Model

In [24]:
# Prepare the input DataFrame for prediction for each player in each future game
feature_cols = numeric_feats + list(encoder.get_feature_names_out(categorical_feats))

# 1. Merge player info with future games to get all player-game combinations
future_players = players_df[['PERSON_ID', 'PLAYER_FIRST_NAME', 'PLAYER_LAST_NAME', 'TEAM_ID', 'POSITION']]
future_games = future_games_df.rename(columns={
    'GAME_ID': 'game_id',
    'HOME_TEAM_ID': 'home_team_id',
    'VISITOR_TEAM_ID': 'visitor_team_id',
    'SEASON': 'season'
})

# For each future game, get all players from both teams
future_games_long = pd.concat([
    future_games.merge(future_players, left_on='home_team_id', right_on='TEAM_ID'),
    future_games.merge(future_players, left_on='visitor_team_id', right_on='TEAM_ID')
], ignore_index=True)

future_games_long['opponent'] = np.where(
    future_games_long['TEAM_ID'] == future_games_long['home_team_id'],
    future_games_long['visitor_team_id'],
    future_games_long['home_team_id']
)
future_games_long['position_group'] = future_games_long['POSITION'].map(
    lambda x: 'G' if x in ('G', 'G-F') else 'F' if x in ('F', 'F-G', 'F-C') else 'C' if x in ('C', 'C-F') else x
)

# 2. Get the latest stats for each player from final_df
latest_stats = (
    final_df.sort_values('game_date')
    .groupby('personId')
    .tail(1)
    .set_index('personId')
)

# 3. Merge stats into future_games_long
future_games_long = future_games_long.join(latest_stats[feature_cols], on='PERSON_ID')

# 4. Add opponent/position_group stats
future_games_long = future_games_long.merge(
    result,
    left_on=['position_group', 'opponent'],
    right_on=['position_group', 'opponent'],
    how='left'
)

# 5. Prepare input features for the model
X_pred = future_games_long[feature_cols].fillna(0)

# 6. Predict points using the loaded model
future_games_long['projected_points'] = best_lgbm.predict(X_pred)

# 7. Show projections for each player in each future game
projection_cols = [
    'game_id', 'GAME_DATE_EST', 'PERSON_ID', 'PLAYER_FIRST_NAME', 'PLAYER_LAST_NAME',
    'TEAM_ID', 'opponent', 'position_group', 'projected_points'
]

In [28]:
X_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 51 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   usagePercentage_per36_rolling_5                   534 non-null    float64
 1   trueShootingPercentage_per36_rolling_5            534 non-null    float64
 2   effectiveFieldGoalPercentage_per36_rolling_5      534 non-null    float64
 3   offensiveRating_per36_rolling_5                   534 non-null    float64
 4   freeThrowsMade_per36_rolling_5                    534 non-null    float64
 5   threePointersMade_per36_rolling_5                 534 non-null    float64
 6   fieldGoalsMade_per36_rolling_5                    534 non-null    float64
 7   usagePercentage_per_poss_rolling_5                534 non-null    float64
 8   trueShootingPercentage_per_poss_rolling_5         534 non-null    float64
 9   effectiveFieldGoalPer

In [26]:
future_projections = future_games_long[projection_cols]

In [27]:
# Analyse the reparted prjections 
future_projections

Unnamed: 0,game_id,GAME_DATE_EST,PERSON_ID,PLAYER_FIRST_NAME,PLAYER_LAST_NAME,TEAM_ID,opponent,position_group,projected_points
0,22401186,2025-04-13T00:00:00,1631230,Dominick,Barlow,1610612737,1610612753,F,13.468975
1,22401186,2025-04-13T00:00:00,1641723,Kobe,Bufkin,1610612737,1610612753,G,2.885871
2,22401186,2025-04-13T00:00:00,203991,Clint,Capela,1610612737,1610612753,C,1.563606
3,22401186,2025-04-13T00:00:00,1630700,Dyson,Daniels,1610612737,1610612753,G,12.996141
4,22401186,2025-04-13T00:00:00,1631243,Mouhamed,Gueye,1610612737,1610612753,F,3.219250
...,...,...,...,...,...,...,...,...,...
529,22401200,2025-04-13T00:00:00,1628420,Monté,Morris,1610612756,1610612758,G,8.270045
530,22401200,2025-04-13T00:00:00,1626220,Royce,O'Neale,1610612756,1610612758,F,3.508710
531,22401200,2025-04-13T00:00:00,203486,Mason,Plumlee,1610612756,1610612758,F,5.893828
532,22401200,2025-04-13T00:00:00,1630208,Nick,Richards,1610612756,1610612758,C,7.441093
