In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

seasons_stats = pd.read_csv("Seasons_Stats.csv")
nba_positions_full = seasons_stats[seasons_stats['Year'] == 2017][[
    'Player', 'Tm', 'G', 'Pos', 'TOV%', 'TRB%', 'AST%', 'FG%'
]].copy()
nba_positions_full.columns = ['Player', 'Team', 'Games', 'Position', 'TurnoverPct', 'ReboundPct', 'AssistPct', 'FieldGoalPct']
nba_positions_full['FieldGoalPct'] = nba_positions_full['FieldGoalPct'] * 100
position_map = {
    "C": "Center",
    "PF": "PowerForward",
    "PF-C": "PowerForward_Center",
    "PG": "PointGuard",
    "SF": "SmallForward",
    "SG": "ShootingGuard"
}
nba_positions_full['Position'] = nba_positions_full['Position'].map(position_map)
nba_positions = nba_positions_full.groupby(['Player', 'Position']).agg({
    'Games': 'sum',
    'TurnoverPct': 'mean',
    'ReboundPct': 'mean',
    'AssistPct': 'mean',
    'FieldGoalPct': 'mean'
}).reset_index()
nba_positions = nba_positions[
    (nba_positions['Position'].isin(['Center', 'PointGuard', 'ShootingGuard'])) & 
    (nba_positions['Games'] > 10)
].dropna().reset_index(drop=True)
stats_matrix = nba_positions[['TurnoverPct', 'ReboundPct', 'AssistPct', 'FieldGoalPct']].values
k_means = KMeans(n_clusters=3, max_iter=100, n_init=1000, random_state=2)
k_means.fit(stats_matrix)
center_cluster = k_means.predict(stats_matrix[nba_positions['Position'] == 'Center'])[0]
dists_from_centre = np.sqrt(np.sum((stats_matrix - k_means.cluster_centers_[k_means.labels_])**2, axis=1))
nba_positions['Cluster'] = k_means.labels_
nba_positions['Dist'] = dists_from_centre
nba_positions = nba_positions[
    ((nba_positions['Cluster'] == center_cluster) & (nba_positions['Position'] == 'Center')) |
    ((nba_positions['Cluster'] != center_cluster) & (nba_positions['Position'] != 'Center'))
]
centers = nba_positions[nba_positions['Position'] == 'Center'].nsmallest(50, 'Dist')
guards = nba_positions[nba_positions['Position'] != 'Center'].groupby('Position').apply(lambda x: x.sample(min(len(x), 50), random_state=2)).reset_index(drop=True)
nba_positions = pd.concat([centers, guards]).sort_values('Position').reset_index(drop=True)
nba_positions = nba_positions.drop(columns=['Cluster', 'Dist'])
nba_positions_full.to_csv("nba_positions_full.tsv", sep='\t', index=False)
nba_positions.to_csv("nba_positions.tsv", sep='\t', index=False)



In [5]:
all_seasons = pd.read_csv("all_seasons.csv")
stats_columns = ['player_name', 'gp', 'ts_pct', 'ast_pct', 'reb', 'pts']
all_seasons_stats = all_seasons[stats_columns].dropna().copy()
stats_matrix = all_seasons_stats[['ts_pct', 'ast_pct', 'reb', 'pts']].values
k_means = KMeans(n_clusters=3, max_iter=100, n_init=1000, random_state=2)
k_means.fit(stats_matrix)
all_seasons_stats['PositionCluster'] = k_means.labels_
position_map = {0: 'Guard', 1: 'Forward', 2: 'Center'}
all_seasons_stats['Position'] = all_seasons_stats['PositionCluster'].map(position_map)
all_seasons = all_seasons.merge(all_seasons_stats[['player_name', 'Position']], on='player_name', how='left')
all_seasons.to_csv("all_seasons_with_positions.csv", index=False)
all_seasons.head()

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season,Position
0,0,Randy Livingston,HOU,22,193.04,94.800728,Louisiana State,USA,1996,2,...,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,Center
1,0,Randy Livingston,HOU,22,193.04,94.800728,Louisiana State,USA,1996,2,...,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,Center
2,0,Randy Livingston,HOU,22,193.04,94.800728,Louisiana State,USA,1996,2,...,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,Forward
3,0,Randy Livingston,HOU,22,193.04,94.800728,Louisiana State,USA,1996,2,...,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,Center
4,0,Randy Livingston,HOU,22,193.04,94.800728,Louisiana State,USA,1996,2,...,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97,Center
