In [49]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from joblib import dump, load

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans

In [51]:
def scale_data(year):
    yearMinutes = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_players/wnba_players_{year}.csv')
    yearMinutes = yearMinutes.drop_duplicates(subset = ['Name', 'Team'], keep = 'first')
    tot_min = yearMinutes['Total Minutes']
    yearMinutes = yearMinutes[['Name', 'Team']]
    
    yearShooting = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_shooting/wnba_players_{year}_shooting.csv')
    yearShooting = yearShooting.drop_duplicates(subset = ['Player', 'Team'], keep = 'first')
    
    yearUsage = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_usage/wnba_players_{year}_usage.csv')
    yearUsage = yearUsage.drop_duplicates(subset = ['player', 'team'], keep = 'first')
    
    yearShootingLocation = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_shooting_location/wnba_players_{year}_shooting_location.csv')
    yearShootingLocation = yearShootingLocation.drop_duplicates(subset = ['Player', 'Team'], keep = 'first')

    dfYear = pd.merge(yearShooting, yearUsage, left_on = ['Player', 'Team'], right_on = ['player', 'team'], how = 'inner', validate='one_to_one')
    dfYear = pd.merge(dfYear, yearShootingLocation, on = ['Player', 'Team'], how = 'inner', validate='one_to_one')
    dfYear = pd.merge(dfYear, yearMinutes, left_on = ['Player', 'Team'], right_on = ['Name', 'Team'], how = 'inner', validate='one_to_one')

    dfYear = dfYear.drop(['player', 'team', 'Age', 'RA_FGM', 'RA_FG%', 'ITP_FGM', 'ITP_FG%', 'MR_FGM', 'MR_FG%', 'LC3_FGM', 'LC3_FG%', 'RC3_FGM', 'RC3_FG%', 'AB3_FGM', 'AB3_FG%', 'Name'], axis = 1)

    cols_to_convert = [col for col in dfYear.columns if col not in ['Player', 'Team']]
    dfYear[cols_to_convert] = dfYear[cols_to_convert].apply(pd.to_numeric, errors='coerce').fillna(0)

    dfYear.iloc[:, 2:] = scale(dfYear.iloc[:, 2:])
    dfYear['year'] = year
    dfYear['Total Minutes'] = tot_min

    dfYear.to_csv(f'WNBA_scaled_means_data/WNBA_scaled_{year}.csv', index = False)

In [53]:
for year in list(range(1997, 2025)):
    scale_data(year)

 -1.32142857 -1.10714286 -1.07142857 -0.96428571 -0.89285714 -0.85714286
 -0.82142857 -0.71428571 -0.42857143 -0.39285714 -0.35714286 -0.32142857
 -0.28571429 -0.17857143 -0.14285714 -0.10714286  0.          0.07142857
  0.46428571  0.5         0.60714286  0.64285714  0.67857143  0.71428571
  0.82142857  0.89285714  0.92857143  1.          1.07142857  1.10714286
  1.17857143  1.32142857  1.35714286  1.39285714  1.46428571  1.5
  1.57142857  1.64285714 -1.17857143 -0.5        -0.25        0.10714286
  0.39285714  0.42857143  1.03571429 -0.64285714 -0.57142857  0.17857143
  1.21428571  1.25        1.53571429 -1.71428571 -1.03571429 -0.78571429
  0.35714286  1.60714286  0.75        1.28571429 -1.14285714 -0.92857143
  0.32142857 -0.53571429  0.14285714 -1.39285714  0.28571429 -1.25
 -0.21428571 -1.46428571 -1.         -0.60714286  1.67857143  1.14285714
 -1.21428571 -0.67857143 -1.60714286 -0.75        0.85714286  0.57142857
 -1.28571429  0.96428571 -0.07142857  0.03571429  0.53571429  1.

In [55]:
# Generate the list of file paths
file_paths = [f'WNBA_scaled_means_data/WNBA_scaled_{year}.csv' for year in range(1997, 2025)]

# Concatenate all files into one DataFrame
wnba_df = pd.concat((pd.read_csv(f) for f in file_paths), ignore_index=True)
wnba_df = wnba_df.drop(['Unnamed: 0', 'Year'], axis = 1)

print(wnba_df.shape)
wnba_df.head()

(4508, 31)


Unnamed: 0,Player,Team,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_MR,pct_PTS_FB,pct_PTS_FT,pct_PTS_IP,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,ITP_FGA,MR_FGA,LC3_FGA,RC3_FGA,AB3_FGA,year,Total Minutes
0,ANDREA CONGREAVES,CHA,-0.516092,0.669738,0.66362,-0.604152,-0.037217,-0.102525,0.109132,0.146817,1.007227,-0.569614,-1.119218,-0.800698,0.386819,0.161039,0.447429,0.392534,-0.335621,-1.010934,-0.087672,-0.177785,-0.309595,-0.041556,-0.72006,-0.662048,2.085147,0.67653,0.317209,1997,660.8
1,ANDREA STINSON,CHA,0.115579,-0.035311,0.081656,3.014046,-0.493994,0.555107,-0.459022,0.765442,0.541718,0.437175,1.352397,1.224769,0.642066,0.317052,0.101643,-0.069853,1.01163,0.286134,-0.087672,-0.669536,-0.309595,2.292166,1.419823,1.891564,0.526909,1.31365,0.866659,1997,1010.8
2,BRIDGET PETTIS,PHO,-0.785062,0.969953,0.934875,2.314528,0.198539,-0.61349,-0.139435,0.417466,0.716284,0.059629,1.123929,0.867334,1.439715,0.560822,-0.190946,-0.345507,0.61952,0.53866,1.063029,-0.567794,-0.309595,1.584978,-0.292083,0.189156,1.461852,-0.172964,2.593504,1997,831.6
3,BRIDGETTE GORDON,SAC,0.392699,-0.344623,-0.342487,0.264215,-0.012659,0.148227,0.326135,-0.089463,0.774473,-0.06622,0.313903,0.37586,-0.244919,0.580324,0.536092,-0.14099,0.408384,-0.414054,-0.733956,-0.889975,-0.309595,1.089945,0.991847,2.080721,-0.408033,1.31365,-0.075256,1997,954.8
4,CHANTEL TREMITIERE,SAC,0.13188,-0.053506,-0.21919,-0.254393,0.336064,-0.736501,-0.691807,1.018907,-0.447489,2.576602,-0.620741,-0.90495,-0.30235,0.17079,-0.660861,-0.372184,1.74558,0.423875,-0.544799,-0.53388,-0.309595,0.1706,-0.377678,1.418673,-0.096386,0.251783,0.317209,1997,1052.8


In [57]:
high_minutes_wnba = wnba_df[wnba_df['Total Minutes'] >= 100]
X = high_minutes_wnba.drop(['Player', 'Team', 'year', 'Total Minutes'], axis = 1)
print(X.shape)
X.head()

(3839, 27)


Unnamed: 0,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_MR,pct_PTS_FB,pct_PTS_FT,pct_PTS_IP,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,ITP_FGA,MR_FGA,LC3_FGA,RC3_FGA,AB3_FGA
0,-0.516092,0.669738,0.66362,-0.604152,-0.037217,-0.102525,0.109132,0.146817,1.007227,-0.569614,-1.119218,-0.800698,0.386819,0.161039,0.447429,0.392534,-0.335621,-1.010934,-0.087672,-0.177785,-0.309595,-0.041556,-0.72006,-0.662048,2.085147,0.67653,0.317209
1,0.115579,-0.035311,0.081656,3.014046,-0.493994,0.555107,-0.459022,0.765442,0.541718,0.437175,1.352397,1.224769,0.642066,0.317052,0.101643,-0.069853,1.01163,0.286134,-0.087672,-0.669536,-0.309595,2.292166,1.419823,1.891564,0.526909,1.31365,0.866659
2,-0.785062,0.969953,0.934875,2.314528,0.198539,-0.61349,-0.139435,0.417466,0.716284,0.059629,1.123929,0.867334,1.439715,0.560822,-0.190946,-0.345507,0.61952,0.53866,1.063029,-0.567794,-0.309595,1.584978,-0.292083,0.189156,1.461852,-0.172964,2.593504
3,0.392699,-0.344623,-0.342487,0.264215,-0.012659,0.148227,0.326135,-0.089463,0.774473,-0.06622,0.313903,0.37586,-0.244919,0.580324,0.536092,-0.14099,0.408384,-0.414054,-0.733956,-0.889975,-0.309595,1.089945,0.991847,2.080721,-0.408033,1.31365,-0.075256
4,0.13188,-0.053506,-0.21919,-0.254393,0.336064,-0.736501,-0.691807,1.018907,-0.447489,2.576602,-0.620741,-0.90495,-0.30235,0.17079,-0.660861,-0.372184,1.74558,0.423875,-0.544799,-0.53388,-0.309595,0.1706,-0.377678,1.418673,-0.096386,0.251783,0.317209


In [59]:
kmeans = KMeans(n_clusters=6, random_state=1)
kmeans.fit(X)
clusters = kmeans.predict(wnba_df[list(X.columns)])
wnba_df['Cluster'] = clusters
print(wnba_df.shape)
wnba_df.head()

(4508, 32)


Unnamed: 0,Player,Team,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_MR,pct_PTS_FB,pct_PTS_FT,pct_PTS_IP,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,ITP_FGA,MR_FGA,LC3_FGA,RC3_FGA,AB3_FGA,year,Total Minutes,Cluster
0,ANDREA CONGREAVES,CHA,-0.516092,0.669738,0.66362,-0.604152,-0.037217,-0.102525,0.109132,0.146817,1.007227,-0.569614,-1.119218,-0.800698,0.386819,0.161039,0.447429,0.392534,-0.335621,-1.010934,-0.087672,-0.177785,-0.309595,-0.041556,-0.72006,-0.662048,2.085147,0.67653,0.317209,1997,660.8,3
1,ANDREA STINSON,CHA,0.115579,-0.035311,0.081656,3.014046,-0.493994,0.555107,-0.459022,0.765442,0.541718,0.437175,1.352397,1.224769,0.642066,0.317052,0.101643,-0.069853,1.01163,0.286134,-0.087672,-0.669536,-0.309595,2.292166,1.419823,1.891564,0.526909,1.31365,0.866659,1997,1010.8,0
2,BRIDGET PETTIS,PHO,-0.785062,0.969953,0.934875,2.314528,0.198539,-0.61349,-0.139435,0.417466,0.716284,0.059629,1.123929,0.867334,1.439715,0.560822,-0.190946,-0.345507,0.61952,0.53866,1.063029,-0.567794,-0.309595,1.584978,-0.292083,0.189156,1.461852,-0.172964,2.593504,1997,831.6,0
3,BRIDGETTE GORDON,SAC,0.392699,-0.344623,-0.342487,0.264215,-0.012659,0.148227,0.326135,-0.089463,0.774473,-0.06622,0.313903,0.37586,-0.244919,0.580324,0.536092,-0.14099,0.408384,-0.414054,-0.733956,-0.889975,-0.309595,1.089945,0.991847,2.080721,-0.408033,1.31365,-0.075256,1997,954.8,4
4,CHANTEL TREMITIERE,SAC,0.13188,-0.053506,-0.21919,-0.254393,0.336064,-0.736501,-0.691807,1.018907,-0.447489,2.576602,-0.620741,-0.90495,-0.30235,0.17079,-0.660861,-0.372184,1.74558,0.423875,-0.544799,-0.53388,-0.309595,0.1706,-0.377678,1.418673,-0.096386,0.251783,0.317209,1997,1052.8,1


In [61]:
wnba_df[wnba_df['Player'] == '0']

Unnamed: 0,Player,Team,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_MR,pct_PTS_FB,pct_PTS_FT,pct_PTS_IP,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,ITP_FGA,MR_FGA,LC3_FGA,RC3_FGA,AB3_FGA,year,Total Minutes,Cluster


In [63]:
cluster_means = wnba_df.groupby("Cluster")[list(X.columns)].mean()

# Print sorted by usage rate (example)
cluster_means.sort_values("Usage", ascending=False)

Unnamed: 0_level_0,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_MR,pct_PTS_FB,pct_PTS_FT,pct_PTS_IP,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,ITP_FGA,MR_FGA,LC3_FGA,RC3_FGA,AB3_FGA
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
4,0.524128,-0.505067,-0.460321,-0.043427,-0.071226,0.268234,0.279598,-0.137627,0.416004,0.119514,1.368449,1.31486,-0.152847,0.879681,0.301684,0.585715,-0.117781,0.242324,0.255363,-0.327437,0.624808,1.29867,1.975644,1.766464,-0.228907,-0.261686,0.231462
0,-0.399541,0.45389,0.435898,0.381604,-0.080915,-0.451818,-0.671832,0.855208,0.341385,0.932313,0.986767,0.952543,0.97961,0.52834,-0.610451,-0.547357,0.662362,0.36182,-0.061713,-0.5416,0.340826,0.413566,0.537656,0.986275,0.713221,0.592729,1.437441
5,0.953183,-0.952777,-0.85454,-0.338722,0.084738,1.012574,0.524397,-0.40973,-0.604971,-0.34085,0.573474,0.38523,-0.933593,0.933228,1.089256,1.061053,-0.536256,0.274424,0.872334,0.449982,0.788089,1.256055,0.74692,-0.10061,-0.698023,-0.653076,-0.780551
1,-0.086975,0.035687,-0.077662,0.316933,0.155234,-0.170345,-0.30686,0.25538,0.394246,-0.066994,-0.340031,-0.357431,-0.033928,-0.28772,-0.417508,-0.449219,0.446386,0.081395,-0.213889,-0.172525,-0.205757,-0.377749,-0.377335,-0.221774,-0.070299,-0.079603,-0.207691
3,-1.352234,1.449001,1.469827,0.008667,-0.40161,-0.990807,-0.210019,0.138572,0.600768,0.096966,-0.451816,-0.251816,1.061787,-0.744398,-0.676599,-0.683375,0.111182,-0.36574,-0.560077,-0.289763,-0.545438,-0.735901,-0.679609,-0.473845,0.907763,0.930503,0.596126
2,0.861442,-0.933072,-0.882118,-0.411877,0.216892,0.719036,0.583468,-0.694712,-1.023416,-0.467635,-0.653946,-0.667245,-0.979595,-0.18105,0.764696,0.681723,-0.697866,-0.239031,0.207159,0.673991,-0.160751,-0.342885,-0.424475,-0.525444,-0.731413,-0.676839,-0.878705


In [65]:
wnba_df = wnba_df.sort_values(by = 'year', ascending = False)
for cluster_id in range(6):
    print(f"\nCluster {cluster_id} Players:")
    print(wnba_df[wnba_df["Cluster"] == cluster_id]["Player"].head(14))


Cluster 0 Players:
4400         KELSEY PLUM
4407        JACKIE YOUNG
4405     SABRINA IONESCU
4404       NATASHA CLOUD
4409         JEWELL LOYD
4398    ARIKE OGUNBOWALE
4410      KAHLEAH COPPER
4419       DIANA TAURASI
4366     KELSEY MITCHELL
4360      DEWANNA BONNER
4357       CAITLIN CLARK
4355        ARIEL ATKINS
4352        ALLISHA GRAY
4385          DANA EVANS
Name: Player, dtype: object

Cluster 1 Players:
4507            EZINNE KALU
4401          KENNEDY BURKE
4408         JADE MELBOURNE
4389         JORDAN HORSTON
4427     MICHAELA ONYENWERE
4418       STEPHANIE TALBOT
4417            RAE BURRELL
4363           JACY SHELDON
4362            HALEY JONES
4358    CECILIA ZANDALASINI
4369          LINDSAY ALLEN
4387          ERICA WHEELER
4379             SEVGI UZUN
4383           ALANNA SMITH
Name: Player, dtype: object

Cluster 2 Players:
4402               LI YUERU
4424           DORKA JUHÁSZ
4415    OLIVIA NELSON-ODODA
4413       MONIQUE BILLINGS
4412       MERCEDES RUSSELL
43

In [67]:
wnba_df['Archetype'] = ''
wnba_df.loc[wnba_df['Cluster'] == 0, 'Archetype'] = 'Primary On-Ball Creator'
wnba_df.loc[wnba_df['Cluster'] == 1, 'Archetype'] = 'Secondary Creator/Distributor'
wnba_df.loc[wnba_df['Cluster'] == 2, 'Archetype'] = 'Roller/Cutter'
wnba_df.loc[wnba_df['Cluster'] == 3, 'Archetype'] = 'Spot-up Shooter'
wnba_df.loc[wnba_df['Cluster'] == 4, 'Archetype'] = 'Perimeter-Oriented Big/Wing'
wnba_df.loc[wnba_df['Cluster'] == 5, 'Archetype'] = 'Interior Primary Option'

In [69]:
wnba_df = wnba_df[['Player', 'Team', 'Archetype', 'year']]
wnba_df.head()

Unnamed: 0,Player,Team,Archetype,year
4507,EZINNE KALU,ATL,Secondary Creator/Distributor,2024
4400,KELSEY PLUM,LVA,Primary On-Ball Creator,2024
4407,JACKIE YOUNG,LVA,Primary On-Ball Creator,2024
4406,EZI MAGBEGOR,SEA,Interior Primary Option,2024
4405,SABRINA IONESCU,NYL,Primary On-Ball Creator,2024


In [71]:
wnba_df[wnba_df['Player'] == "DIANA TAURASI"]

Unnamed: 0,Player,Team,Archetype,year
4419,DIANA TAURASI,PHO,Primary On-Ball Creator,2024
4301,DIANA TAURASI,PHO,Primary On-Ball Creator,2023
4101,DIANA TAURASI,PHO,Primary On-Ball Creator,2022
3994,DIANA TAURASI,PHO,Primary On-Ball Creator,2021
3807,DIANA TAURASI,PHO,Primary On-Ball Creator,2020
3714,DIANA TAURASI,PHO,Primary On-Ball Creator,2019
3452,DIANA TAURASI,PHO,Primary On-Ball Creator,2018
3333,DIANA TAURASI,PHO,Primary On-Ball Creator,2017
3149,DIANA TAURASI,PHO,Primary On-Ball Creator,2016
2844,DIANA TAURASI,PHO,Primary On-Ball Creator,2014


In [73]:
wnba_df.to_csv('Player_Archetypes.csv')

In [75]:
dump(kmeans, 'kmeans.joblib') # Save the model

['kmeans.joblib']