In [668]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans

In [670]:
def scale_data(year):
    yearMinutes = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_players/wnba_players_{year}.csv')
    yearMinutes = yearMinutes.drop_duplicates(subset = ['Name', 'Team'], keep = 'first')
    tot_min = yearMinutes['Total Minutes']
    yearMinutes = yearMinutes[['Name', 'Team', 'EFG%', 'TS%']]
    
    yearShooting = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_shooting/wnba_players_{year}_shooting.csv')
    yearShooting = yearShooting.drop_duplicates(subset = ['Player', 'Team'], keep = 'first')
    
    yearUsage = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_usage/wnba_players_{year}_usage.csv')
    yearUsage = yearUsage.drop_duplicates(subset = ['player', 'team'], keep = 'first')
    
    yearShootingLocation = pd.read_csv(f'/Users/austincoffelt/Documents/WNBA_shooting_location/wnba_players_{year}_shooting_location.csv')
    yearShootingLocation = yearShootingLocation.drop_duplicates(subset = ['Player', 'Team'], keep = 'first')

    dfYear = pd.merge(yearShooting, yearUsage, left_on = ['Player', 'Team'], right_on = ['player', 'team'], how = 'inner', validate='one_to_one')
    dfYear = pd.merge(dfYear, yearShootingLocation, on = ['Player', 'Team'], how = 'inner', validate='one_to_one')
    dfYear = pd.merge(dfYear, yearMinutes, left_on = ['Player', 'Team'], right_on = ['Name', 'Team'], how = 'inner', validate='one_to_one')

    dfYear = dfYear.drop(['player', 'team', 'Age', 'pct_PTS_IP', 'pct_PTS_MR', 'RA_FGM', 'ITP_FGM', 'MR_FGM', 'LC3_FGM', 'RC3_FGM', 'AB3_FGM', 'Name'], axis = 1)

    cols_to_convert = [col for col in dfYear.columns if col not in ['Player', 'Team']]
    dfYear[cols_to_convert] = dfYear[cols_to_convert].apply(pd.to_numeric, errors='coerce').fillna(0)

    dfYear.iloc[:, 2:] = scale(dfYear.iloc[:, 2:])
    dfYear['year'] = year
    dfYear['Total Minutes'] = tot_min

    dfYear.to_csv(f'WNBA_scaled_means_data/WNBA_scaled_{year}.csv', index = False)

In [672]:
for year in list(range(1997, 2025)):
    scale_data(year)

In [674]:
# Generate the list of file paths
file_paths = [f'WNBA_scaled_means_data/WNBA_scaled_{year}.csv' for year in range(1997, 2025)]

# Concatenate all files into one DataFrame
wnba_df = pd.concat((pd.read_csv(f) for f in file_paths), ignore_index=True)

print(wnba_df.shape)
wnba_df.head()

(4508, 37)


Unnamed: 0,Player,Team,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_FB,pct_PTS_FT,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,RA_FG%,ITP_FGA,ITP_FG%,MR_FGA,MR_FG%,LC3_FGA,LC3_FG%,RC3_FGA,RC3_FG%,AB3_FGA,AB3_FG%,EFG%,TS%,year,Total Minutes
0,ANDREA CONGREAVES,CHA,-0.516092,0.669738,-0.604152,-0.037217,0.109132,0.146817,1.007227,-0.569614,-1.119218,-0.800698,0.386819,0.161039,0.447429,0.392534,-0.335621,-1.010934,-0.087672,-0.177785,-0.309595,-0.041556,0.808964,-0.72006,0.753345,-0.662048,0.042366,2.085147,1.10156,0.67653,3.317402,0.317209,0.662691,0.918887,0.983949,1997,660.8
1,ANDREA STINSON,CHA,0.115579,-0.035311,3.014046,-0.493994,-0.459022,0.765442,0.541718,0.437175,1.352397,1.224769,0.642066,0.317052,0.101643,-0.069853,1.01163,0.286134,-0.087672,-0.669536,-0.309595,2.292166,0.7465,1.419823,0.308431,1.891564,0.00133,0.526909,1.882828,1.31365,0.164139,0.866659,0.355623,0.362077,0.288716,1997,1010.8
2,BRIDGET PETTIS,PHO,-0.785062,0.969953,2.314528,0.198539,-0.139435,0.417466,0.716284,0.059629,1.123929,0.867334,1.439715,0.560822,-0.190946,-0.345507,0.61952,0.53866,1.063029,-0.567794,-0.309595,1.584978,-0.190464,-0.292083,-0.896974,0.189156,-0.274203,1.461852,0.32497,-0.172964,-0.468027,2.593504,0.329673,-0.122106,0.146104,1997,831.6
3,BRIDGETTE GORDON,SAC,0.392699,-0.344623,0.264215,-0.012659,0.326135,-0.089463,0.774473,-0.06622,0.313903,0.37586,-0.244919,0.580324,0.536092,-0.14099,0.408384,-0.414054,-0.733956,-0.889975,-0.309595,1.089945,0.635986,0.991847,0.236004,2.080721,0.282724,-0.408033,-0.456298,1.31365,0.792521,-0.075256,0.074504,0.180508,0.312484,1997,954.8
4,CHANTEL TREMITIERE,SAC,0.13188,-0.053506,-0.254393,0.336064,-0.691807,1.018907,-0.447489,2.576602,-0.620741,-0.90495,-0.30235,0.17079,-0.660861,-0.372184,1.74558,0.423875,-0.544799,-0.53388,-0.309595,0.1706,-0.344222,-0.377678,-0.126135,1.418673,0.54067,-0.096386,1.882828,0.251783,-0.468027,0.317209,-0.063893,-0.2371,0.003492,1997,1052.8


In [676]:
high_minutes_wnba = wnba_df[wnba_df['Total Minutes'] >= 100]
X = high_minutes_wnba.drop(['Player', 'Team', 'year', 'Total Minutes'], axis = 1)
print(X.shape)
X.head()

(3839, 33)


Unnamed: 0,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_FB,pct_PTS_FT,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,RA_FG%,ITP_FGA,ITP_FG%,MR_FGA,MR_FG%,LC3_FGA,LC3_FG%,RC3_FGA,RC3_FG%,AB3_FGA,AB3_FG%,EFG%,TS%
0,-0.516092,0.669738,-0.604152,-0.037217,0.109132,0.146817,1.007227,-0.569614,-1.119218,-0.800698,0.386819,0.161039,0.447429,0.392534,-0.335621,-1.010934,-0.087672,-0.177785,-0.309595,-0.041556,0.808964,-0.72006,0.753345,-0.662048,0.042366,2.085147,1.10156,0.67653,3.317402,0.317209,0.662691,0.918887,0.983949
1,0.115579,-0.035311,3.014046,-0.493994,-0.459022,0.765442,0.541718,0.437175,1.352397,1.224769,0.642066,0.317052,0.101643,-0.069853,1.01163,0.286134,-0.087672,-0.669536,-0.309595,2.292166,0.7465,1.419823,0.308431,1.891564,0.00133,0.526909,1.882828,1.31365,0.164139,0.866659,0.355623,0.362077,0.288716
2,-0.785062,0.969953,2.314528,0.198539,-0.139435,0.417466,0.716284,0.059629,1.123929,0.867334,1.439715,0.560822,-0.190946,-0.345507,0.61952,0.53866,1.063029,-0.567794,-0.309595,1.584978,-0.190464,-0.292083,-0.896974,0.189156,-0.274203,1.461852,0.32497,-0.172964,-0.468027,2.593504,0.329673,-0.122106,0.146104
3,0.392699,-0.344623,0.264215,-0.012659,0.326135,-0.089463,0.774473,-0.06622,0.313903,0.37586,-0.244919,0.580324,0.536092,-0.14099,0.408384,-0.414054,-0.733956,-0.889975,-0.309595,1.089945,0.635986,0.991847,0.236004,2.080721,0.282724,-0.408033,-0.456298,1.31365,0.792521,-0.075256,0.074504,0.180508,0.312484
4,0.13188,-0.053506,-0.254393,0.336064,-0.691807,1.018907,-0.447489,2.576602,-0.620741,-0.90495,-0.30235,0.17079,-0.660861,-0.372184,1.74558,0.423875,-0.544799,-0.53388,-0.309595,0.1706,-0.344222,-0.377678,-0.126135,1.418673,0.54067,-0.096386,1.882828,0.251783,-0.468027,0.317209,-0.063893,-0.2371,0.003492


In [678]:
kmeans = KMeans(n_clusters=6, random_state=1)
kmeans.fit(X)
clusters = kmeans.predict(wnba_df[list(X.columns)])
wnba_df['Cluster'] = clusters
print(wnba_df.shape)
wnba_df.head()

(4508, 38)


Unnamed: 0,Player,Team,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_FB,pct_PTS_FT,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,RA_FG%,ITP_FGA,ITP_FG%,MR_FGA,MR_FG%,LC3_FGA,LC3_FG%,RC3_FGA,RC3_FG%,AB3_FGA,AB3_FG%,EFG%,TS%,year,Total Minutes,Cluster
0,ANDREA CONGREAVES,CHA,-0.516092,0.669738,-0.604152,-0.037217,0.109132,0.146817,1.007227,-0.569614,-1.119218,-0.800698,0.386819,0.161039,0.447429,0.392534,-0.335621,-1.010934,-0.087672,-0.177785,-0.309595,-0.041556,0.808964,-0.72006,0.753345,-0.662048,0.042366,2.085147,1.10156,0.67653,3.317402,0.317209,0.662691,0.918887,0.983949,1997,660.8,5
1,ANDREA STINSON,CHA,0.115579,-0.035311,3.014046,-0.493994,-0.459022,0.765442,0.541718,0.437175,1.352397,1.224769,0.642066,0.317052,0.101643,-0.069853,1.01163,0.286134,-0.087672,-0.669536,-0.309595,2.292166,0.7465,1.419823,0.308431,1.891564,0.00133,0.526909,1.882828,1.31365,0.164139,0.866659,0.355623,0.362077,0.288716,1997,1010.8,0
2,BRIDGET PETTIS,PHO,-0.785062,0.969953,2.314528,0.198539,-0.139435,0.417466,0.716284,0.059629,1.123929,0.867334,1.439715,0.560822,-0.190946,-0.345507,0.61952,0.53866,1.063029,-0.567794,-0.309595,1.584978,-0.190464,-0.292083,-0.896974,0.189156,-0.274203,1.461852,0.32497,-0.172964,-0.468027,2.593504,0.329673,-0.122106,0.146104,1997,831.6,0
3,BRIDGETTE GORDON,SAC,0.392699,-0.344623,0.264215,-0.012659,0.326135,-0.089463,0.774473,-0.06622,0.313903,0.37586,-0.244919,0.580324,0.536092,-0.14099,0.408384,-0.414054,-0.733956,-0.889975,-0.309595,1.089945,0.635986,0.991847,0.236004,2.080721,0.282724,-0.408033,-0.456298,1.31365,0.792521,-0.075256,0.074504,0.180508,0.312484,1997,954.8,0
4,CHANTEL TREMITIERE,SAC,0.13188,-0.053506,-0.254393,0.336064,-0.691807,1.018907,-0.447489,2.576602,-0.620741,-0.90495,-0.30235,0.17079,-0.660861,-0.372184,1.74558,0.423875,-0.544799,-0.53388,-0.309595,0.1706,-0.344222,-0.377678,-0.126135,1.418673,0.54067,-0.096386,1.882828,0.251783,-0.468027,0.317209,-0.063893,-0.2371,0.003492,1997,1052.8,4


In [680]:
wnba_df[wnba_df['Player'] == '0']

Unnamed: 0,Player,Team,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_FB,pct_PTS_FT,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,RA_FG%,ITP_FGA,ITP_FG%,MR_FGA,MR_FG%,LC3_FGA,LC3_FG%,RC3_FGA,RC3_FG%,AB3_FGA,AB3_FG%,EFG%,TS%,year,Total Minutes,Cluster


In [682]:
cluster_means = wnba_df.groupby("Cluster")[list(X.columns)].mean()

# Print sorted by usage rate (example)
cluster_means.sort_values("Usage", ascending=False)

Unnamed: 0_level_0,pct_FGA_2PT,pct_FGA_3PT,pct_PTS_FB,pct_PTS_FT,pct_AST_2PT,pct_UAST_2PT,pct_AST_3PT,pct_UAST_3PT,Usage,pct_FGA,pct_3PA,pct_FTA,pct_OREB,pct_REB,pct_AST,pct_TOV,pct_BLA,pct_PF,pct_PFD,RA_FGA,RA_FG%,ITP_FGA,ITP_FG%,MR_FGA,MR_FG%,LC3_FGA,LC3_FG%,RC3_FGA,RC3_FG%,AB3_FGA,AB3_FG%,EFG%,TS%
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,-0.275036,0.325384,0.355338,-0.083092,-0.43782,0.611134,0.43341,0.695439,1.127582,1.118289,0.889191,0.638617,-0.474409,-0.382484,0.406856,0.228754,-0.032068,-0.558644,0.414481,0.605839,0.249171,0.814214,0.293924,1.319586,0.326108,0.712496,0.662389,0.578137,0.692199,1.429213,0.577764,0.282296,0.353262
3,0.848111,-0.842914,-0.275556,-0.027513,0.532127,-0.382814,-0.05841,-0.083711,1.010634,0.875711,-0.726998,1.00196,0.902792,1.061148,-0.383542,0.272836,0.649711,0.09873,0.753135,1.70973,0.430551,1.629864,0.373814,0.709754,0.256195,-0.59399,-0.437684,-0.573624,-0.392077,-0.474709,-0.030489,0.462678,0.497427
1,0.169349,-0.135823,-0.072715,-0.238507,0.592378,-0.458578,0.739756,-0.308489,-0.253485,-0.166118,-0.179692,-0.272684,0.271439,0.231396,-0.394146,-0.212121,0.016961,0.18646,-0.202733,-0.226988,0.051289,-0.283201,0.034791,-0.14744,0.108809,-0.265193,-0.119844,-0.227312,-0.20278,-0.276241,0.50384,-0.014602,-0.056217
4,-0.375433,0.29422,0.355226,0.340505,-1.055425,0.874605,0.112899,0.463589,-0.309585,-0.422146,0.164949,-0.305512,-0.808364,-0.788018,1.173812,0.451954,-0.310558,-0.295405,-0.139348,-0.435275,-0.407254,-0.382517,-0.262014,-0.279351,-0.186638,-0.160414,0.063957,-0.136773,-0.093552,-0.052053,0.056444,-0.554736,-0.510706
5,-1.24513,1.335701,0.112783,-0.451536,-0.089009,0.098382,0.668287,0.005232,-0.42311,-0.20251,1.015897,-0.689657,-0.62751,-0.661516,0.009292,-0.468453,-0.534361,-0.363862,-0.545782,-0.675428,-0.068952,-0.629079,-0.064064,-0.414676,0.001628,1.08693,0.606238,1.069011,0.699715,0.58962,0.556739,0.245725,0.139175
2,0.942521,-1.013656,-0.403387,0.346181,0.47141,-0.621067,-1.401512,-0.540801,-0.453357,-0.539253,-1.04474,0.091984,0.796629,0.702588,-0.687073,-0.086392,0.364371,0.696532,0.064236,-0.171239,-0.040951,-0.304505,-0.143845,-0.538052,-0.270642,-0.74747,-0.67163,-0.700772,-0.611421,-0.936204,-1.207816,-0.175244,-0.154692


In [684]:
wnba_df = wnba_df.sort_values(by = 'year', ascending = False)
for cluster_id in range(6):
    print(f"\nCluster {cluster_id} Players:")
    print(wnba_df[wnba_df["Cluster"] == cluster_id]["Player"].head(14))


Cluster 0 Players:
4400          KELSEY PLUM
4407         JACKIE YOUNG
4405      SABRINA IONESCU
4409          JEWELL LOYD
4398     ARIKE OGUNBOWALE
4410       KAHLEAH COPPER
4419        DIANA TAURASI
4359    COURTNEY WILLIAMS
4366      KELSEY MITCHELL
4360       DEWANNA BONNER
4357        CAITLIN CLARK
4355         ARIEL ATKINS
4352         ALLISHA GRAY
4381       SKYLAR DIGGINS
Name: Player, dtype: object

Cluster 1 Players:
4401          KENNEDY BURKE
4392            KIAH STOKES
4389         JORDAN HORSTON
4424           DORKA JUHÁSZ
4421       VICTORIA VIVIANS
4418       STEPHANIE TALBOT
4417            RAE BURRELL
4413       MONIQUE BILLINGS
4388          JONQUEL JONES
4362            HALEY JONES
4358    CECILIA ZANDALASINI
4383           ALANNA SMITH
4377         RICKEA JACKSON
4376             NIA COFFEY
Name: Player, dtype: object

Cluster 2 Players:
4402               LI YUERU
4422        AALIYAH EDWARDS
4415    OLIVIA NELSON-ODODA
4412       MERCEDES RUSSELL
4390           K

In [686]:
wnba_df['Archetype'] = ''
wnba_df.loc[wnba_df['Cluster'] == 0, 'Archetype'] = 'Primary On-Ball Creator'
wnba_df.loc[wnba_df['Cluster'] == 1, 'Archetype'] = 'Perimeter-Oriented Big'
wnba_df.loc[wnba_df['Cluster'] == 2, 'Archetype'] = 'Roller/Cutter'
wnba_df.loc[wnba_df['Cluster'] == 3, 'Archetype'] = 'Interior Primary Option'
wnba_df.loc[wnba_df['Cluster'] == 4, 'Archetype'] = 'Secondary Creator/Distributor'
wnba_df.loc[wnba_df['Cluster'] == 5, 'Archetype'] = 'Spot-up Shooter'

In [688]:
wnba_df = wnba_df[['Player', 'Team', 'Archetype', 'year']]
wnba_df.head()

Unnamed: 0,Player,Team,Archetype,year
4507,EZINNE KALU,ATL,Secondary Creator/Distributor,2024
4400,KELSEY PLUM,LVA,Primary On-Ball Creator,2024
4407,JACKIE YOUNG,LVA,Primary On-Ball Creator,2024
4406,EZI MAGBEGOR,SEA,Interior Primary Option,2024
4405,SABRINA IONESCU,NYL,Primary On-Ball Creator,2024


In [704]:
wnba_df[wnba_df['Player'] == "DIANA TAURASI"]

Unnamed: 0,Player,Team,Archetype,year
4419,DIANA TAURASI,PHO,Primary On-Ball Creator,2024
4301,DIANA TAURASI,PHO,Primary On-Ball Creator,2023
4101,DIANA TAURASI,PHO,Primary On-Ball Creator,2022
3994,DIANA TAURASI,PHO,Primary On-Ball Creator,2021
3807,DIANA TAURASI,PHO,Primary On-Ball Creator,2020
3714,DIANA TAURASI,PHO,Secondary Creator/Distributor,2019
3452,DIANA TAURASI,PHO,Primary On-Ball Creator,2018
3333,DIANA TAURASI,PHO,Primary On-Ball Creator,2017
3149,DIANA TAURASI,PHO,Primary On-Ball Creator,2016
2844,DIANA TAURASI,PHO,Primary On-Ball Creator,2014


In [706]:
wnba_df.to_csv('Player_Archetypes.csv')