In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from google.colab import files

# Note: this code is guided by the PCA analysis code on https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [3]:
# Read-in data
#new data set (made from scratch in Excel):
df_mnba_general = pd.read_csv("https://raw.githubusercontent.com/aakap/nba_wnba_salarycomparisons/main/mnba-player-and-salary-stats-2020.csv")
df_wnba_general = pd.read_csv("https://raw.githubusercontent.com/aakap/nba_wnba_salarycomparisons/main/wnba-player-and-salary-stats-2020.csv")

#old data: 
#df_wnba_general = pd.read_csv("https://raw.githubusercontent.com/Bengis/nba-wnba-salary-gap/master/data/wnba-stats_out.csv")
#df_mnba_general = pd.read_csv("https://raw.githubusercontent.com/Bengis/nba-wnba-salary-gap/master/data/nba-stats_out.csv")
df_wnba_general
df_mnba_general

Unnamed: 0,player,games,minutes,points,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,rebds.,assists,steals,blocks,TOV,PF,EFF,AST/,STL/,salary
0,Stephen Curry,56,1905,1751,572,1178,48.6,291,683,42.6,316,345,91.6,25,285,310,323,67,7,188,103,1635,1.72,0.36,43006362
1,Bradley Beal,55,1952,1707,607,1254,48.4,119,343,34.7,374,415,90.1,64,194,258,246,64,19,175,130,1431,1.41,0.37,28751775
2,Nikola Jokic,64,2243,1679,654,1152,56.8,89,216,41.2,282,328,86.0,182,518,700,546,89,43,200,168,2313,2.73,0.45,28542009
3,Damian Lillard,59,2116,1668,517,1187,43.6,238,630,37.8,396,427,92.7,29,222,251,446,55,16,184,96,1551,2.42,0.30,31626953
4,Luka Doncic,58,2040,1661,590,1214,48.6,171,484,35.3,310,426,72.8,47,416,463,515,58,34,250,138,1741,2.06,0.23,8049360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,Ignas Brazdeikis,5,15,2,0,4,0.0,0,1,0.0,2,2,100.0,1,3,4,1,0,0,1,0,2,1.00,0.00,1517981
416,Iman Shumpert,2,11,2,1,4,25.0,0,3,0.0,0,0,0.0,1,0,1,0,1,0,1,0,0,0.00,1.00,447155
417,T.J. Leaf,2,13,2,1,2,50.0,0,0,0.0,0,0,0.0,0,0,0,0,1,1,1,0,2,0.00,1.00,4326825
418,Terrance Ferguson,13,49,2,1,7,14.3,0,5,0.0,0,0,0.0,0,1,1,2,1,0,4,7,-4,0.50,0.25,3944013


In [4]:
# Modify df columns

df_wnba_general['points/min'] = df_wnba_general['points'] / df_wnba_general['minutes']
df_wnba_general['assists/min'] = df_wnba_general['assists'] / df_wnba_general['minutes']
df_wnba_general['rebounds/min'] = df_wnba_general['rebds.'] / df_wnba_general['minutes']
df_wnba_general['minutes/game'] = df_wnba_general['minutes'] / df_wnba_general['games'] * 48/40 #Scale to account for difference in length of WNBA and NBA games
df_wnba_general = df_wnba_general.replace([np.inf, -np.inf], np.nan)
df_wnba_general.dropna()
print(df_wnba_general)

df_mnba_general['points/min'] = df_mnba_general['points'] / df_mnba_general['minutes']
df_mnba_general['assists/min'] = df_mnba_general['assists'] / df_mnba_general['minutes']
df_mnba_general['rebounds/min'] = df_mnba_general['rebds.'] / df_mnba_general['minutes']
df_mnba_general['minutes/game'] = df_mnba_general['minutes'] / df_mnba_general['games']
df_mnba_general = df_mnba_general.replace([np.inf, -np.inf], np.nan)
df_mnba_general.dropna()
print(df_mnba_general)

               player team position  ...  assists/min  rebounds/min  minutes/game
0    Arike Ogunbowale  Dal        G  ...     0.100267      0.081551     40.800000
1         A'ja Wilson  LVA        F  ...     0.064378      0.268956     38.127273
2      DeWanna Bonner  Con        G  ...     0.088798      0.234973     39.927273
3     Breanna Stewart  Sea        F  ...     0.118616      0.271829     36.420000
4     Kelsey Mitchell  Ind        G  ...     0.087819      0.067989     38.509091
..                ...  ...      ...  ...          ...           ...           ...
137       Emma Cannon  LVA        F  ...          NaN           NaN      0.000000
138    Alisia Jenkins  Chi        F  ...     0.000000      0.000000      1.200000
139    Alisia Jenkins  Ind        F  ...     0.000000      0.500000      2.400000
140       Kaela Davis  Atl        G  ...     0.000000      0.000000      1.200000
141      Erica McCall  Atl        F  ...     0.000000      0.400000      6.000000

[142 rows x 27 

In [5]:
from sklearn.preprocessing import StandardScaler as wnba_StandardScaler
from sklearn.decomposition import PCA as wnba_PCA

# Standardizing Data
wnba_features = ['minutes/game', 'points/min', 'assists/min', 'rebounds/min']
wnba_x = df_wnba_general.loc[:, wnba_features].dropna().values
wnba_y = df_wnba_general.loc[:, ['salary']].dropna().values
wnba_x = wnba_StandardScaler().fit_transform(wnba_x)

# PCA Processing
wnba_pca = wnba_PCA(n_components = 4)
wnba_principalComponents = wnba_pca.fit_transform(wnba_x)
wnba_df_principal = pd.DataFrame(data = wnba_principalComponents, columns=['pc_1', 'pc_2', 'pc_3', 'pc_4'])
wnba_df_final = pd.concat([wnba_df_principal, df_wnba_general[['salary']]], axis = 1)
wnba_results = wnba_pca.explained_variance_ratio_
print(wnba_results)

[0.52000132 0.27190841 0.12897109 0.07911918]


In [6]:
from sklearn.preprocessing import StandardScaler as mnba_StandardScaler
from sklearn.decomposition import PCA as mnba_PCA

# Standardizing Data
mnba_features = ['minutes/game', 'points/min', 'assists/min', 'rebounds/min']
mnba_x = df_mnba_general.loc[:, mnba_features].dropna().values
mnba_y = df_mnba_general.loc[:, ['salary']].dropna().values
mnba_x = mnba_StandardScaler().fit_transform(mnba_x)

# PCA Processing
mnba_pca = mnba_PCA(n_components = 4)
mnba_principalComponents = mnba_pca.fit_transform(mnba_x)
mnba_df_principal = pd.DataFrame(data = mnba_principalComponents, columns=['pc_1', 'pc_2', 'pc_3', 'pc_4'])
mnba_df_final = pd.concat([mnba_df_principal, df_mnba_general[['salary']]], axis = 1)
mnba_results = mnba_pca.explained_variance_ratio_
print(mnba_results)

[0.4856832  0.27166118 0.1420279  0.10062772]


In [7]:
# Using results to form all-encompassing player performance metric
df_wnba_general['Stat'] = wnba_results[0] * df_wnba_general['points/min'] + wnba_results[1] * df_wnba_general['assists/min'] + wnba_results[2] * df_wnba_general['rebounds/min'] + wnba_results[3] * df_wnba_general['minutes']
df_mnba_general['Stat'] = mnba_results[0] * df_mnba_general['points/min'] + mnba_results[1] * df_mnba_general['assists/min'] + mnba_results[2] * df_mnba_general['rebounds/min'] + mnba_results[3] * df_mnba_general['minutes']

df_wnba_general = df_wnba_general.sort_values(by=['Stat'], ascending=False)
df_mnba_general = df_mnba_general.sort_values(by=['Stat'], ascending=False)

df_wnba_general.to_csv('WNBA Stats.csv')
files.download('WNBA Stats.csv')

df_mnba_general.to_csv('MNBA Stats.csv')
files.download('MNBA Stats.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
# Comining MNBA and WNBA PCA results by averaging each weight
pca_weighted_results = list()
sum = 0
for i in range(0, 4):
  pca_weighted_results.append((wnba_results[i] + mnba_results[i])/2)
  sum += ((wnba_results[i] + mnba_results[i])/2)
print(pca_weighted_results)
print(sum) # Make sure sum is very close to 1

[0.5028422613056367, 0.271784794225057, 0.1354994985248121, 0.08987344594449426]
1.0000000000000002


In [9]:
# Interpreting PCA results
# Combine NBA and WNBA data, taking note of which data set each entry belongs to
df_overall_general = pd.DataFrame(columns=['player', 'League', 'minutes/game', 'points/min', 'assists/min', 'rebounds/min', 'salary'])
df_wnba_general['League'] = "WNBA"
df_mnba_general['League'] = "NBA"
df_wnba_new = df_wnba_general[['player', 'League', 'minutes/game', 'points/min', 'assists/min', 'rebounds/min', 'salary']]
df_mnba_new = df_mnba_general[['player', 'League', 'minutes/game', 'points/min', 'assists/min', 'rebounds/min', 'salary']]
df_wnba_new.dropna()
df_mnba_new.dropna()

minutes_mean = df_wnba_new['minutes/game'].mean()
minutes_std = df_wnba_new['minutes/game'].std(ddof=0)
pointsPerMin_mean = df_wnba_new['points/min'].mean()
pointsPerMin_std = df_wnba_new['points/min'].std()
assistsPerMin_mean = df_wnba_new['assists/min'].mean()
assistsPerMin_std = df_wnba_new['assists/min'].std()
reboundsPerMin_mean = df_wnba_new['rebounds/min'].mean()
reboundsPerMin_std = df_wnba_new['rebounds/min'].std()
for index, row in df_wnba_new.iterrows():
  df_wnba_new.at[index, 'minutes/game'] = (row['minutes/game'] - minutes_mean)/(minutes_std)
  df_wnba_new.at[index, 'points/min'] = (row['points/min'] - pointsPerMin_mean)/(pointsPerMin_std)
  df_wnba_new.at[index, 'assists/min'] = (row['assists/min'] - assistsPerMin_mean)/(assistsPerMin_std)
  df_wnba_new.at[index, 'rebounds/min'] = (row['rebounds/min'] - reboundsPerMin_mean)/(reboundsPerMin_std)
minutes_mean = df_mnba_new['minutes/game'].mean()
minutes_std = df_mnba_new['minutes/game'].std(ddof=0)
pointsPerMin_mean = df_mnba_new['points/min'].mean()
pointsPerMin_std = df_mnba_new['points/min'].std()
assistsPerMin_mean = df_mnba_new['assists/min'].mean()
assistsPerMin_std = df_mnba_new['assists/min'].std()
reboundsPerMin_mean = df_mnba_new['rebounds/min'].mean()
reboundsPerMin_std = df_mnba_new['rebounds/min'].std()
for index, row in df_mnba_new.iterrows():
  df_mnba_new.at[index, 'minutes/game'] = (row['minutes/game'] - minutes_mean)/(minutes_std)
  df_mnba_new.at[index, 'points/min'] = (row['points/min'] - pointsPerMin_mean)/(pointsPerMin_std)
  df_mnba_new.at[index, 'assists/min'] = (row['assists/min'] - assistsPerMin_mean)/(assistsPerMin_std)
  df_mnba_new.at[index, 'rebounds/min'] = (row['rebounds/min'] - reboundsPerMin_mean)/(reboundsPerMin_std)
df_overall_general = pd.concat([df_wnba_new, df_mnba_new])
df_overall_general['projected_value'] = pca_weighted_results[0] * df_overall_general['minutes/game'] + pca_weighted_results[1] * df_overall_general['points/min'] + pca_weighted_results[2] * df_overall_general['assists/min'] + pca_weighted_results[3] * df_overall_general['rebounds/min']
df_overall_general = df_overall_general.replace([np.inf, -np.inf], np.nan)
df_overall_general.dropna()
df_overall_general.sort_values(by=['projected_value'], ascending=False)
df_overall_general.to_csv('players-projected-salary-rankings.csv')
files.download('players-projected-salary-rankings.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
df_overall_general_top_wnba = df_overall_general[df_overall_general['League'] == "WNBA"].sort_values(by='projected_value', ascending = False).head(15)
px.bar(df_overall_general_top_wnba, y='projected_value', x='player', color = 'League')

df_overall_general_top_mnba = df_overall_general[df_overall_general['League'] == "NBA"].sort_values(by='projected_value', ascending = False).head(15)
px.bar(df_overall_general_top_mnba, y='projected_value', x='player', color = 'League')

In [11]:
# Visuals for PCA Analysis
projectedValueVersusSalary = px.scatter(df_overall_general, x = 'projected_value', y = 'salary', hover_name = 'player', color = 'League', title = "Actual Player Salary vs. Projected Player Value Differentiated by League")
projectedValueVersusSalary.write_html("actual-player-salary-versus-projected-player-value-differentiated-by-league")
files.download("actual-player-salary-versus-projected-player-value-differentiated-by-league")
projectedValueVersusSalary

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
projected_value_ranges = [-1.5, -1, -0.5, 0, 0.5, 1, 1.5]
wnbaAverageByBin = list()
mnbaAverageByBin = list()
for rangeValue in projected_value_ranges:
  wnbaSum = 0
  wnbaCount = 0
  mnbaSum = 0
  mnbaCount = 0
  for index, row in df_overall_general.iterrows():
    if (row['projected_value'] >= rangeValue) and (row['projected_value'] < rangeValue + 0.5):
      if row['League'] == "WNBA":
        wnbaSum += row['salary']
        wnbaCount += 1
      elif row['League'] == "NBA":
        mnbaSum += row['salary']
        mnbaCount += 1
  if (wnbaCount > 0):
    wnbaAverageByBin.append(wnbaSum / wnbaCount)
  else:
    wnbaAverageByBin.append(0)
  if mnbaCount > 0:
    mnbaAverageByBin.append(mnbaSum / mnbaCount)
  else:
    mnbaAverageByBin.append(0)

df_discrepancies_by_bin = pd.DataFrame(columns=['bin', 'discrepancy'])
df_discrepancies_by_bin_percentage = pd.DataFrame(columns=['bin', 'discrepancy_percentage'])

print(wnbaAverageByBin)

binValue = -1.25
for index in range(7):
  temp = [binValue, abs(mnbaAverageByBin[index] - wnbaAverageByBin[index])]
  print(abs(mnbaAverageByBin[index] - wnbaAverageByBin[index]))
  df_discrepancies_by_bin.loc[len(df_discrepancies_by_bin)] = temp

  if(wnbaAverageByBin[index] != 0):
    tempPercentage = [binValue, abs((mnbaAverageByBin[index] - wnbaAverageByBin[index])/(wnbaAverageByBin[index]))]
    df_discrepancies_by_bin_percentage.loc[len(df_discrepancies_by_bin)] = tempPercentage

  binValue += 0.5
df_discrepancies_by_bin
salary_discrepancies = px.bar(df_discrepancies_by_bin, x='bin', y='discrepancy')
salary_discrepancies_percentages = px.bar(df_discrepancies_by_bin_percentage, x='bin', y='discrepancy_percentage', labels = {'bin': 'projected value grouping'})
salary_discrepancies_percentages.layout.yaxis.tickformat = ',.0%'
salary_discrepancies_percentages

[68568.57142857143, 66145.92592592593, 81301.52777777778, 83381.13043478261, 120981.13333333333, 125428.57142857143, 0]
1975803.6285714286
2983603.1530214427
4453278.691269841
7845063.057684029
14203854.462820513
22805077.77142857
29736342.0


In [13]:
#Create a series of linear regressions to predict salary based on metrics


In [14]:
#compare games played vs salary (mens)
import statsmodels.api as sm # import statsmodels 
import numpy as np
#df_wnba_general = df_wnba_general.astype(np.float)
#df_nba_general=df_nba_general.astype(np.float)

X = df_mnba_general[["games"]] #initialize independent variables
y = df_mnba_general["salary"] #initialize dependent variable
model_gamevsalary_nba = sm.OLS(y, X).fit() #define least of squares regression model
predictions_gamevsalary_nba = model_gamevsalary_nba.predict(X) #run model
model_gamevsalary_nba.summary() #print results 





pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



0,1,2,3
Dep. Variable:,salary,R-squared (uncentered):,0.462
Model:,OLS,Adj. R-squared (uncentered):,0.461
Method:,Least Squares,F-statistic:,360.1
Date:,"Sat, 08 May 2021",Prob (F-statistic):,2.09e-58
Time:,19:06:42,Log-Likelihood:,-7332.8
No. Observations:,420,AIC:,14670.0
Df Residuals:,419,BIC:,14670.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
games,1.849e+05,9745.087,18.977,0.000,1.66e+05,2.04e+05

0,1,2,3
Omnibus:,123.963,Durbin-Watson:,1.976
Prob(Omnibus):,0.0,Jarque-Bera (JB):,257.483
Skew:,1.59,Prob(JB):,1.23e-56
Kurtosis:,5.146,Cond. No.,1.0


In [16]:
gamesvsalary_nba = px.scatter(df_mnba_general, x = "games",
                            y = "salary",
                            title = "Salary vs Games Played for Men's Basketball")
gamesvsalary_nba

In [17]:

gamesvsalary_nba_Box = px.box(df_mnba_general, x = "games",
                            y = "salary",
                            title = "Salary vs Games Played for Men's Basketball")
gamesvsalary_nba_Box


In [None]:
#compare games played vs salary (womens)

X = df_wnba_general[["games"]] #initialize independent variables
y = df_wnba_general["salary"] #initialize dependent variable
model_gamevsalary_wnba = sm.OLS(y, X).fit() #define least of squares regression model
predictions_gamevsalary_wnba = model_gamevsalary_wnba.predict(X) #run model
model_gamevsalary_wnba.summary() #print results 




In [18]:
gamesvsalary_wnba = px.scatter(df_wnba_general, x = "games",
                            y = "salary",
                            title = "Salary vs Games Played for Women's Basketball")
gamesvsalary_wnba

In [19]:

gamesvsalary_wnba_Box = px.box(df_wnba_general, x = "games",
                            y = "salary",
                            title = "Salary vs Games Played for Women's Basketball")
gamesvsalary_wnba_Box


In [None]:
#predict mens salary through games played, assists, points, rebounds

X = df_mnba_general[["games",'rebds.','points','assists']] #initialize independent variables
y = df_mnba_general["salary"] #initialize dependent variable
model_gamevsalary_nba = sm.OLS(y, X).fit() #define least of squares regression model
predictions_gamevsalary_nba = model_gamevsalary_nba.predict(X) #run model
model_gamevsalary_nba.summary() #print results 


In [None]:
#predict womens salary through games played, assists, points, rebounds

X = df_wnba_general[["games",'rebds.','points','assists']] #initialize independent variables
y = df_wnba_general["salary"] #initialize dependent variable
model_gamevsalary_wnba = sm.OLS(y, X).fit() #define least of squares regression model
predictions_gamevsalary_wnba = model_gamevsalary_wnba.predict(X) #run model
model_gamevsalary_wnba.summary() #print results 
