In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



5-year split for the training set: each player's first five seasons (ordered by age) with 20+ games played, averaged per player

In [None]:
# Load the 2000-2020 table that the 5-year training split is built from.
df = pd.read_csv("NBA_2000_2020.csv")

# Keep only rows with at least 20 games played (GP).
nba_filtered = df.loc[df['GP'] >= 20].copy()

# Sorting by player and then AGE is what makes head(5) meaningful: it returns
# the first five rows of each group in the current row order, i.e. the five
# lowest-AGE qualifying rows for every player.
nba_filtered_5 = nba_filtered.sort_values(['PLAYER_NAME', "AGE"]).groupby('PLAYER_NAME').head(5)

# Collapse those rows to one row per player using the plain (unweighted) mean
# of each stat, then expose the player name under the column "PLAYER".
player_stats = (
    nba_filtered_5
    .groupby('PLAYER_NAME')
    .agg({
        stat: 'mean'
        for stat in (
            'GP', 'MIN', 'PTS', 'REB', 'AST', 'STL',
            'BLK', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'TOV',
        )
    })
    .reset_index()
    .rename(columns={"PLAYER_NAME": "PLAYER"})
)

# Persist the table, then leave it as the last expression so the notebook renders it.
player_stats.to_csv("5YearSplitTraining.csv", index=False)
player_stats


Unnamed: 0,PLAYER,GP,MIN,PTS,REB,AST,STL,BLK,FG_PCT,FG3_PCT,FT_PCT,TOV
0,A.C. Green,82.000000,17.200000,4.500000,3.800000,0.50,0.400000,0.100000,0.444000,0.0000,0.712000,0.500000
1,A.J. Guyton,39.000000,16.200000,5.700000,1.050000,1.85,0.250000,0.200000,0.383500,0.3825,0.824000,0.750000
2,AJ Hammons,22.000000,7.400000,2.200000,1.600000,0.20,0.000000,0.600000,0.405000,0.5000,0.450000,0.500000
3,AJ Price,47.000000,14.040000,5.400000,1.360000,2.04,0.460000,0.040000,0.381600,0.3076,0.611400,0.860000
4,Aaron Brooks,65.000000,22.620000,10.760000,1.700000,3.22,0.580000,0.140000,0.415400,0.3528,0.840000,1.660000
...,...,...,...,...,...,...,...,...,...,...,...,...
1550,Zeljko Rebraca,43.000000,14.720000,5.560000,2.960000,0.36,0.240000,0.700000,0.521800,0.0000,0.789000,0.880000
1551,Zendon Hamilton,50.000000,12.950000,4.850000,3.950000,0.30,0.300000,0.250000,0.478500,0.0000,0.675000,0.850000
1552,Zion Williamson,24.000000,27.800000,22.500000,6.300000,2.10,0.700000,0.400000,0.583000,0.4290,0.640000,2.500000
1553,Zoran Planinic,49.333333,10.666667,3.833333,1.333333,1.10,0.433333,0.066667,0.406667,0.2960,0.675667,0.833333


Training set grouped by player for clustering (all 2000–2020 seasons with 20+ games played)

In [None]:
# Re-read the 2000-2020 table so this cell does not depend on earlier cells.
df = pd.read_csv("NBA_2000_2020.csv")

# Keep only rows with at least 20 games played (GP).
nba_filtered = df.loc[df['GP'] >= 20].copy()

# One row per player: the plain (unweighted) mean of each stat over all of
# that player's remaining rows, with the name column exposed as "PLAYER".
player_stats = (
    nba_filtered
    .groupby('PLAYER_NAME')
    .agg({
        stat: 'mean'
        for stat in (
            'GP', 'MIN', 'PTS', 'REB', 'AST', 'STL',
            'BLK', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'TOV',
        )
    })
    .reset_index()
    .rename(columns={"PLAYER_NAME": "PLAYER"})
)

# Persist the table, then leave it as the last expression so the notebook renders it.
player_stats.to_csv("NBATrainGrouped.csv", index=False)
player_stats


Unnamed: 0,PLAYER,GP,MIN,PTS,REB,AST,STL,BLK,FG_PCT,FG3_PCT,FT_PCT,TOV
0,A.C. Green,82.000000,17.200000,4.500000,3.800000,0.500000,0.400000,0.100000,0.444000,0.000000,0.712000,0.500000
1,A.J. Guyton,39.000000,16.200000,5.700000,1.050000,1.850000,0.250000,0.200000,0.383500,0.382500,0.824000,0.750000
2,AJ Hammons,22.000000,7.400000,2.200000,1.600000,0.200000,0.000000,0.600000,0.405000,0.500000,0.450000,0.500000
3,AJ Price,43.500000,13.766667,5.350000,1.333333,2.000000,0.433333,0.033333,0.380000,0.300167,0.620667,0.800000
4,Aaron Brooks,64.500000,19.340000,8.880000,1.550000,2.760000,0.530000,0.130000,0.410900,0.362500,0.820000,1.430000
...,...,...,...,...,...,...,...,...,...,...,...,...
1550,Zeljko Rebraca,43.000000,14.720000,5.560000,2.960000,0.360000,0.240000,0.700000,0.521800,0.000000,0.789000,0.880000
1551,Zendon Hamilton,50.000000,12.950000,4.850000,3.950000,0.300000,0.300000,0.250000,0.478500,0.000000,0.675000,0.850000
1552,Zion Williamson,24.000000,27.800000,22.500000,6.300000,2.100000,0.700000,0.400000,0.583000,0.429000,0.640000,2.500000
1553,Zoran Planinic,49.333333,10.666667,3.833333,1.333333,1.100000,0.433333,0.066667,0.406667,0.296000,0.675667,0.833333


Test data grouped by player (2020–2025 seasons with 20+ games played)

In [None]:

# Build the per-player test table from the 2020-2025 file.
df = pd.read_csv("NBA_2020_2025.csv")

# Keep only rows with at least 20 games played (GP).
nba_filtered = df[df['GP'] >= 20].copy()

# Collapse to one row per player by taking the plain (unweighted) mean of each stat.
player_stats = (
    nba_filtered
    .groupby('PLAYER_NAME')
    .agg({
        'GP': 'mean',
        'MIN': 'mean',
        'PTS': 'mean',
        'REB': 'mean',
        'AST': 'mean',
        'STL': 'mean',
        'BLK': 'mean',
        'FG_PCT': 'mean',
        'FG3_PCT': 'mean',
        'FT_PCT': 'mean',
        'TOV': 'mean',
    })
    .reset_index()
)

player_stats = player_stats.rename(columns={"PLAYER_NAME": "PLAYER"})
player_stats.to_csv("NBATestGrouped.csv", index=False)

# to_csv returns None when given a path, so without a final expression this
# cell displays nothing even though a bare count is saved as its output.
# End on the row count (one row per player) so the cell shows a result on
# Restart & Run All.
# NOTE(review): assumes the saved count was the number of players - confirm.
len(player_stats)


736