In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pandas as pd
pd.options.mode.chained_assignment = None

pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 50)

In [2]:
salary_file = pd.ExcelFile('../contract_data/salaries_1985to2020_final.xlsx')
salaries = pd.read_excel(salary_file, 0)
players_info = pd.read_excel(salary_file, 1)

In [3]:
### add years of the contract
years = list(salaries.groupby('player_id').apply(lambda x: [i for i in range(1, len(x)+1)]))
flatten = lambda l: [item for sublist in l for item in sublist]
salaries['years'] = flatten(years)

In [4]:
salaries_98_20 = salaries.loc[salaries['season_end'] >= 1998]
unique_ids = list(pd.unique(salaries_98_20['player_id']))
filtered_players_info = players_info.loc[[True if pid in unique_ids else False for pid in players_info['_id']]]

In [5]:
# Deal with rookie contracts of first round picks and second round picks
# separately. 
first_round_picks = filtered_players_info.loc[filtered_players_info['draft_round'] == '1st round']
second_round_picks = filtered_players_info.loc[filtered_players_info['draft_round'] == '2nd round']

In [6]:
salaries_first_round = salaries_98_20.loc[[True if pid in list(first_round_picks['_id']) else False for pid in salaries_98_20['player_id']]].drop(columns=['team'])
# Drop entries with 'years' being 4 or less
# Prior to 2011 CBA, first round rookie contract is a 3+1 contract with the fourth year being team option
# After 2011 CBA, first round rookie contract is a 2+2 contract with both the 3rd and 4th year being team options
# Dropping values will potentially over-dropping some datapoints
salaries_first_round = salaries_first_round.loc[salaries_first_round['years'] > 4]

salaries_second_round = salaries_98_20.loc[[True if pid in list(second_round_picks['_id']) else False for pid in salaries_98_20['player_id']]].drop(columns=['team'])
# Drop entries with 'years' being 3 or less
salaries_second_round = salaries_second_round.loc[salaries_second_round['years'] > 3]

In [None]:
# Following part is for fixing data in all stats

In [7]:
stats = pd.read_excel('../season_data/advanced_stats/FINAL_ADVANCED_STATS/all_seasons_advanced_stats.xlsx')

In [8]:
stats['error_in_name'] = stats['slug'].apply(lambda x: x is np.nan)
stats_need_fix = stats.loc[stats['error_in_name'] == True]

In [9]:
stats_need_fix['Year'] = '2019-20'
name = stats_need_fix['name'].apply(lambda x: re.sub(r"\\.+", "", x))
slug = stats_need_fix['name'].apply(lambda x: re.sub(r".+\\", "", x))
stats_need_fix['name'], stats_need_fix['slug'] = name, slug

In [10]:
stats = stats.loc[stats['error_in_name'] == False].append(stats_need_fix).drop(columns=['error_in_name'])

In [11]:
# Create a primary key in two salaries table for joins
salaries_first_round['key'] = salaries_first_round['player_id'] + " " + salaries_first_round['season']
salaries_second_round['key'] = salaries_second_round['player_id'] + " " + salaries_second_round['season']
stats['key'] = stats['slug'] + " " + stats['Year']

In [12]:
salaries_stats_first_round = salaries_first_round.join(stats.set_index('key'), on='key', how='left').dropna()
salaries_stats_second_round = salaries_second_round.join(stats.set_index('key'), on='key', how='left').dropna()

In [13]:
first_round_counts = salaries_stats_first_round[['names', 'key']].groupby('key').count()
second_round_counts = salaries_stats_second_round[['names', 'key']].groupby('key').count()
not_traded_mid_season_first_round = list(first_round_counts.loc[first_round_counts['names'] == 1].index)
not_traded_mid_season_second_round = list(second_round_counts.loc[second_round_counts['names'] == 1].index)

In [14]:
salaries_stats_first_round = salaries_stats_first_round.reset_index().set_index('key').loc[not_traded_mid_season_first_round].reset_index().set_index('index')
salaries_stats_second_round = salaries_stats_second_round.reset_index().set_index('key').loc[not_traded_mid_season_second_round].reset_index().set_index('index')

In [15]:
salary_caps_file = pd.ExcelFile('salary_caps.xlsx')
salary_caps = pd.read_excel(salary_caps_file, 0)
salary_caps = salary_caps.drop(columns=salary_caps.columns[0])
salary_caps_99_20 = salary_caps.set_index('season').loc[np.unique(salaries_stats_first_round['season'])]

In [16]:
salaries_stats_first_round = salaries_stats_first_round.join(salary_caps_99_20, on='season', how='left')
salaries_stats_first_round = salaries_stats_first_round[salaries_stats_first_round['names'].notna()]
salaries_stats_second_round = salaries_stats_second_round.join(salary_caps_99_20, on='season', how='left')
salaries_stats_second_round = salaries_stats_second_round[salaries_stats_second_round['names'].notna()]

In [17]:
salaries_stats_first_round['cap_space_usage'] = salaries_stats_first_round['salary'] / salaries_stats_first_round['salary_cap']
salaries_stats_second_round['cap_space_usage'] = salaries_stats_second_round['salary'] / salaries_stats_second_round['salary_cap']
salaries_stats_first_round['minutes_per_game'] = salaries_stats_first_round['minutes_played'] / salaries_stats_first_round['games_played']
salaries_stats_second_round['minutes_per_game'] = salaries_stats_second_round['minutes_played'] / salaries_stats_second_round['games_played']
salaries_stats_first_round['ast/tov_percentage'] = salaries_stats_first_round['assist_percentage'] / salaries_stats_first_round['turnover_percentage']
salaries_stats_second_round['ast/tov_percentage'] = salaries_stats_second_round['assist_percentage'] / salaries_stats_second_round['turnover_percentage']
salaries_stats_first_round['true_shooting*usage'] = salaries_stats_first_round['true_shooting_percentage'] * salaries_stats_first_round['usage_percentage']
salaries_stats_second_round['true_shooting*usage'] = salaries_stats_second_round['true_shooting_percentage'] * salaries_stats_second_round['usage_percentage']

In [18]:
salaries_stats_first_round['ast/tov_percentage'] = salaries_stats_first_round['ast/tov_percentage'].replace([0, np.inf], np.nan)
salaries_stats_second_round['ast/tov_percentage'] = salaries_stats_second_round['ast/tov_percentage'].replace([0, np.inf], np.nan)
salaries_stats_first_round['true_shooting*usage'] = salaries_stats_first_round['true_shooting*usage'].replace([0], np.nan)
salaries_stats_second_round['true_shooting*usage'] = salaries_stats_second_round['true_shooting*usage'].replace([0], np.nan)

salaries_stats_first_round = salaries_stats_first_round.dropna()
salaries_stats_second_round = salaries_stats_second_round.dropna()

In [19]:
salaries_stats_first_round = salaries_stats_first_round.drop(columns=['player_id', 'season', 'season_end',
                                        'name', 'team',
                                        'slug', 'is_combined_totals', 'Year'])
salaries_stats_second_round = salaries_stats_second_round.drop(columns=['player_id', 'season', 'season_end',
                                        'name', 'team',
                                        'slug', 'is_combined_totals', 'Year'])

In [20]:
salaries_stats_first_round.columns

Index(['key', 'names', 'salary', 'season_start', 'years', 'positions', 'age',
       'games_played', 'minutes_played', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'turnover_percentage', 'usage_percentage', 'offensive_win_shares',
       'defensive_win_shares', 'win_shares', 'win_shares_per_48_minutes',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player', 'salary_cap',
       'cap_space_usage', 'minutes_per_game', 'ast/tov_percentage',
       'true_shooting*usage'],
      dtype='object')

## Rewards and Punishments

In [21]:
def find_percentile_for_feature(data, feature, percentile, position):
    return np.percentile(data.loc[(data['positions'] == position)][feature], percentile)

types = ['total_rebound_percentage',
        'assist_percentage',
        'three_point_attempt_rate',
        'block_percentage',
        'defensive_box_plus_minus',
        'defensive_win_shares']

### To use the following function to obtain players rewards and punishments
### iterate through players and store them in an array to form an array of arrays
### Sum and divided by 6, then add them to the predicted salary cap usage.


def archtype_rewards_punishments(dataset, data, upper_p, lower_p, archtypes):
    # data: a Series containing data of a player
    # upper_p: upper percentile for specific statline
    # lower_p: lower percentile for specific statline
    # types: archtypes
    # return a list of weights
    pos = data.positions
    weights = []
    position_data = dataset.loc[dataset['positions'] == pos]
    cap_usage_mean = position_data['cap_space_usage'].mean()
    
    for archtype in archtypes:
        upper_value = find_percentile_for_feature(dataset, archtype, upper_p, pos)
        lower_value = find_percentile_for_feature(dataset, archtype, lower_p, pos)
        if data[archtype] >= upper_value:
            upper_mean = position_data.loc[position_data[archtype] >= upper_value]['cap_space_usage'].mean()
            weights.append(upper_mean - cap_usage_mean)
        elif data[archtype] <= lower_value:
            lower_mean = position_data.loc[position_data[archtype] <= lower_value]['cap_space_usage'].mean()
            weights.append(cap_usage_mean - lower_mean)
        else:
            weights.append(0)
                
    return weights

## Prediction

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

### Second Rounders Start here

In [24]:
seed = 123

test = salaries_stats_second_round.loc[salaries_stats_second_round['season_start'] == 2019]
test_x = test[['age', 'player_efficiency_rating',
       'true_shooting*usage', 'offensive_box_plus_minus', 
       'value_over_replacement_player', 'ast/tov_percentage', 'offensive_win_shares']]
test_y = test[['cap_space_usage']]

train = salaries_stats_second_round.loc[salaries_stats_second_round['season_start'] != 2019]
train = shuffle(train, random_state = seed)
train_x = train[['age', 'player_efficiency_rating',
       'true_shooting*usage', 'offensive_box_plus_minus', 
       'value_over_replacement_player', 'ast/tov_percentage', 'offensive_win_shares']]
train_y = train[['cap_space_usage']]

#### Train and Validation

In [25]:
train_length = int(len(train))
train_size = int(train_length*0.78)

t = train[:train_size]
t_x, t_y = train_x[:train_size], train_y[:train_size]

v = train[train_size:]
v_x, v_y = train_x[train_size:], train_y[train_size:]

model = LinearRegression(n_jobs=-1).fit(t_x, t_y)

prediction = np.ndarray.flatten(model.predict(v_x))

p_r = []
for i in v.index:
    p_r.append(sum(archtype_rewards_punishments(v, v.loc[i], 90, 10, types))/6)

predicted_salary = np.rint((prediction+p_r)*v['salary_cap'].values)

print(mean_absolute_error(v['salary'].values, predicted_salary))
print(mean_absolute_error(v['cap_space_usage'].values, prediction))

v['predicted_salary'] = predicted_salary
v['predicted_cap_usage'] = prediction
v['differential'] = np.abs(v['cap_space_usage'] - v['predicted_cap_usage'])
result = v[['names', 'salary', 'predicted_salary', 'cap_space_usage', 'predicted_cap_usage','differential', 'season_start']]

print(mean_absolute_error(result.loc[result['cap_space_usage'] >= 0.05]['predicted_cap_usage'].values, 
                          result.loc[result['cap_space_usage'] >= 0.05]['cap_space_usage'].values))

2735399.0
0.04685691779437234
0.045356165987952106


In [26]:
result.sort_values('differential', ascending=False).head(150)

Unnamed: 0_level_0,names,salary,predicted_salary,cap_space_usage,predicted_cap_usage,differential,season_start
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11361,Michael Redd,18300000,3429243.0,0.315278,0.074034,0.241244,2010
8092,Rashard Lewis,18876000,8259949.0,0.32714,0.119289,0.207851,2009
11360,Michael Redd,17040000,5084154.0,0.295321,0.088114,0.207207,2009
9373,Paul Millsap,31269231,11405467.0,0.315554,0.115099,0.200456,2017
3243,Antonio Davis,12000000,4089509.0,0.297981,0.097539,0.200442,2002
10565,Chandler Parsons,22116750,4975715.0,0.234927,0.052853,0.182074,2016
13290,Isaiah Thomas,6587132,23090703.0,0.069969,0.236472,0.166503,2016
14405,Hassan Whiteside,981348,14654042.0,0.014019,0.174617,0.160598,2015
4804,Marc Gasol,19700000,10949749.0,0.281429,0.137336,0.144092,2015
12388,Bobby Simmons,10560000,3582403.0,0.183016,0.05483,0.128185,2009


#### Used 2019-2020 as test set

In [27]:
p_r = []
for i in test.index:
    p_r.append(sum(archtype_rewards_punishments(test, test.loc[i], 90, 10, types))/6)

prediction = np.ndarray.flatten(model.predict(test_x))
predicted_salary = np.rint((prediction+p_r)*test['salary_cap'].values)

print(mean_absolute_error(test['salary'].values, predicted_salary))
print(mean_absolute_error(test['cap_space_usage'].values, prediction))

test['predicted_salary'] = predicted_salary
test['predicted_cap_usage'] = prediction
test['differential'] = np.abs(test['cap_space_usage'] - test['predicted_cap_usage'])
result = test[['names', 'salary', 'predicted_salary', 'cap_space_usage', 
               'predicted_cap_usage', 'differential','season_start']]

print(mean_absolute_error(result.loc[result['cap_space_usage'] >= 0.05]['predicted_cap_usage'].values, 
                          result.loc[result['cap_space_usage'] >= 0.05]['cap_space_usage'].values))

5600923.658536585
0.049837632685052695
0.04923943297674881


In [28]:
result.sort_values('differential', ascending=False).head(150)

Unnamed: 0_level_0,names,salary,predicted_salary,cap_space_usage,predicted_cap_usage,differential,season_start
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10568,Chandler Parsons,25102511,5707031.0,0.230003,0.052291,0.177712,2019
9375,Paul Millsap,30500000,9505894.0,0.279458,0.117401,0.162057,2019
4808,Marc Gasol,25595700,12334999.0,0.234522,0.102411,0.13211,2019
9199,Khris Middleton,30603448,21375570.0,0.280405,0.16724,0.113165,2019
5679,Montrezl Harrell,6000000,16550883.0,0.054975,0.151648,0.096673,2019
14409,Hassan Whiteside,27093018,21703520.0,0.248241,0.163318,0.084923,2019
5315,Draymond Green,18539130,12059897.0,0.169866,0.086424,0.083441,2019
6001,Willy HernangÃ³mez,1676735,12597490.0,0.015363,0.094934,0.079571,2019
10335,Kyle O'Quinn,2028594,12308943.0,0.018587,0.094979,0.076391,2019
14776,Lou Williams,8000000,15845105.0,0.0733,0.140274,0.066974,2019


### First Rounders Start Here

In [29]:
seed = 123

test = salaries_stats_first_round.loc[salaries_stats_first_round['season_start'] == 2019]
test_x = test[['age', 'player_efficiency_rating',
       'true_shooting*usage', 'offensive_box_plus_minus', 
       'value_over_replacement_player', 'ast/tov_percentage', 'offensive_win_shares']]
test_y = test[['cap_space_usage']]

train = salaries_stats_first_round.loc[salaries_stats_first_round['season_start'] != 2019]
train = shuffle(train, random_state = seed)
train_x = train[['age', 'player_efficiency_rating',
       'true_shooting*usage', 'offensive_box_plus_minus', 
       'value_over_replacement_player', 'ast/tov_percentage', 'offensive_win_shares']]
train_y = train[['cap_space_usage']]

#### Train and Validation

In [30]:
train_length = int(len(train))
train_size = int(train_length*0.78)

t = train[:train_size]
t_x, t_y = train_x[:train_size], train_y[:train_size]

v = train[train_size:]
v_x, v_y = train_x[train_size:], train_y[train_size:]

model = LinearRegression(n_jobs=-1).fit(t_x, t_y)
prediction = np.ndarray.flatten(model.predict(v_x))

p_r = []
for i in v.index:
    p_r.append(sum(archtype_rewards_punishments(v, v.loc[i], 90, 10, types))/6)

predicted_salary = np.rint((prediction+p_r)*v['salary_cap'].values)

print(mean_absolute_error(v['salary'].values, predicted_salary))
print(mean_absolute_error(v['cap_space_usage'].values, prediction))

v['predicted_salary'] = predicted_salary
v['predicted_cap_usage'] = prediction
v['differential'] = np.abs(v['cap_space_usage'] - v['predicted_cap_usage'])
result = v[['names', 'salary', 'predicted_salary', 'cap_space_usage', 'predicted_cap_usage','differential', 'season_start']]

print(mean_absolute_error(result.loc[result['cap_space_usage'] >= 0.05]['predicted_cap_usage'].values, 
                          result.loc[result['cap_space_usage'] >= 0.05]['cap_space_usage'].values))

3411437.772036474
0.06092548680505809
0.05789978515782391


In [31]:
result.sort_values('differential', ascending=False).head(150)

Unnamed: 0_level_0,names,salary,predicted_salary,cap_space_usage,predicted_cap_usage,differential,season_start
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8555,Stephon Marbury,20840625,2993686.0,0.355157,0.042587,0.31257,2008
10314,Shaquille O'Neal,24749999,12815913.0,0.564553,0.275263,0.28929,2003
6364,Juwan Howard,13125000,4744307.0,0.4375,0.149169,0.288331,1998
5588,Anfernee Hardaway,15750000,2476184.0,0.318182,0.033211,0.284971,2005
4756,Kevin Garnett,19610000,11751766.0,0.552394,0.293144,0.25925,2000
10121,Charles Oakley,10186000,3002645.0,0.339533,0.082663,0.25687,1998
10233,Hakeem Olajuwon,16700000,7680468.0,0.470423,0.216351,0.254071,2000
3126,Stephen Curry,11370786,30149203.0,0.16244,0.407204,0.244764,2015
9636,Alonzo Mourning,16880000,8191820.0,0.475493,0.233071,0.242422,2000
11680,David Robinson,14841000,8250224.0,0.4947,0.267759,0.226941,1998


#### Used 2019-2020 as test set

In [32]:
p_r = []
for i in test.index:
    p_r.append(sum(archtype_rewards_punishments(test, test.loc[i], 90, 10, types))/6)

prediction = np.ndarray.flatten(model.predict(test_x))
predicted_salary = np.rint((prediction+p_r)*test['salary_cap'].values)

print(mean_absolute_error(test['salary'].values, predicted_salary))
print(mean_absolute_error(test['cap_space_usage'].values, prediction))

test['predicted_salary'] = predicted_salary
test['predicted_cap_usage'] = prediction
test[['names', 'salary', 'predicted_salary', 'cap_space_usage', 'predicted_cap_usage', 'season_start']]

6645443.241666666
0.05906706989784573


Unnamed: 0_level_0,names,salary,predicted_salary,cap_space_usage,predicted_cap_usage,season_start
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
81,Steven Adams,25842697,19767762.0,0.236785,0.181123,2019
169,LaMarcus Aldridge,26000000,20771178.0,0.238226,0.190875,2019
290,Al-Farouq Aminu,9258000,8496886.0,0.084827,0.062771,2019
384,Kyle Anderson,9073050,12074096.0,0.083132,0.102904,2019
455,Giannis Antetokounmpo,25842697,46986747.0,0.236785,0.39094,2019
474,Carmelo Anthony,2159029,13427401.0,0.019782,0.109876,2019
670,D.J. Augustin,7250000,17812728.0,0.066428,0.107104,2019
862,Harrison Barnes,24147727,9226934.0,0.221255,0.099927,2019
1016,Nicolas Batum,25565217,6927667.0,0.234242,0.059827,2019
1057,Bradley Beal,27093019,26858007.0,0.248241,0.229978,2019


### Below is work for feature engineering

In [None]:
pg_second_round = salaries_stats_second_round.loc[salaries_stats_second_round['positions'] == "POINT GUARD"].dropna()
sg_second_round = salaries_stats_second_round.loc[salaries_stats_second_round['positions'] == "SHOOTING GUARD"].dropna()
sf_second_round = salaries_stats_second_round.loc[salaries_stats_second_round['positions'] == "SMALL FORWARD"].dropna()
pf_second_round = salaries_stats_second_round.loc[salaries_stats_second_round['positions'] == "POWER FORWARD"].dropna()
c_second_round = salaries_stats_second_round.loc[salaries_stats_second_round['positions'] == "CENTER"].dropna()

In [None]:
pg_first_round = salaries_stats_first_round.loc[salaries_stats_first_round['positions'] == "POINT GUARD"].dropna()
sg_first_round = salaries_stats_first_round.loc[salaries_stats_first_round['positions'] == "SHOOTING GUARD"].dropna()
sf_first_round = salaries_stats_first_round.loc[salaries_stats_first_round['positions'] == "SMALL FORWARD"].dropna()
pf_first_round = salaries_stats_first_round.loc[salaries_stats_first_round['positions'] == "POWER FORWARD"].dropna()
c_first_round = salaries_stats_first_round.loc[salaries_stats_first_round['positions'] == "CENTER"].dropna()

In [None]:
# Example 1
sns.distplot(salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == "CENTER") & (salaries_stats_first_round['minutes_played'] >= 700)]['assist_percentage'], bins=np.arange(0, 1, 0.1))
p = find_percentile_for_feature(salaries_stats_first_round, 
                                 'assist_percentage', 80, 'CENTER')
print(p)
c = salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == "CENTER") & 
                               (salaries_stats_first_round['minutes_played'] >= 700) & 
                               (salaries_stats_first_round['assist_percentage'] >= p)]
c['cap_space_usage'].mean()

In [None]:
# Example 2
sns.distplot(salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == "SHOOTING GUARD") &
                                                (salaries_stats_first_round['minutes_played'] >= 700)]['three_point_attempt_rate'],
                                                 bins=np.arange(0, 1, 0.1))
p = find_percentile_for_feature(salaries_stats_first_round, 
                                 'three_point_attempt_rate', 15, 'SHOOTING GUARD')
print(p)
sg = salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == "SHOOTING GUARD") & 
                               (salaries_stats_first_round['minutes_played'] >= 700) & 
                               (salaries_stats_first_round['three_point_attempt_rate'] <= p)]
sg['cap_space_usage'].mean()

In [None]:
# Example 3
pos = 'SMALL FORWARD'

sns.distplot(salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == pos) &
                                                (salaries_stats_first_round['minutes_played'] >= 700)]['three_point_attempt_rate'],
                                                 bins=np.arange(0, 1, 0.1))
p = find_percentile_for_feature(salaries_stats_first_round, 
                                 'three_point_attempt_rate', 15, pos)
print(p)
sf = salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == pos) & 
                               (salaries_stats_first_round['minutes_played'] >= 700) & 
                               (salaries_stats_first_round['three_point_attempt_rate'] <= p)]
sf['cap_space_usage'].mean()

In [None]:
# Example 4

p = find_percentile_for_feature(salaries_stats_first_round, 
                                 'defensive_rebound_percentage', 80, 'POWER FORWARD')
print(p)
pf = salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == "POWER FORWARD") & 
                               (salaries_stats_first_round['minutes_played'] >= 700) & 
                               (salaries_stats_first_round['defensive_rebound_percentage'] >= p)]
sns.distplot(pf['cap_space_usage'], bins=np.arange(0, 1, 0.1))

In [None]:
# Example 4

p = find_percentile_for_feature(salaries_stats_first_round, 
                                 'steal_percentage', 80, 'POINT GUARD')
print(p)
pg = salaries_stats_first_round.loc[(salaries_stats_first_round['positions'] == 'POINT GUARD') & 
                               (salaries_stats_first_round['minutes_played'] >= 700) & 
                               (salaries_stats_first_round['steal_percentage'] >= p)]
sns.distplot(pg['cap_space_usage'], bins=np.arange(0, 1, 0.1))
pg['cap_space_usage'].mean()

In [None]:
def standardize(df, column):
    series = []
    for i in df[column]:
        series.append((i - df[column].mean())/df[column].std())
    return series

def correlation(df, column_1, column_2):
    return np.mean(df[column_1] * df[column_2])

def scatter_and_correlation(df, column_1, column_2, ax, order):
    copy = df.copy()
    copy[column_1 + "_standard"] = standardize(df, column_1)
    copy[column_2 + "_standard"] = standardize(df, column_2)
    copy.plot.scatter(x = column_1 + "_standard", y = column_2 + '_standard', ax=ax[order])
    return correlation(copy, column_1 + '_standard', column_2 + '_standard')

In [None]:
def side_by_side_correlation(filtered_data, unfiltered_data, feature):
    fig, ax = plt.subplots(1,2,figsize=(10, 5))
    corr_filtered = scatter_and_correlation(filtered_data, feature, 'cap_space_usage', ax, 0)
    corr_unfiltered = scatter_and_correlation(unfiltered_data, feature, 'cap_space_usage', ax, 1)
    return (corr_filtered, corr_unfiltered, feature)

In [None]:
# Blocks below are for visualizations of unified 6 features

# No need to run them

In [None]:
features = ['player_efficiency_rating',
              'ast/tov_percentage', 'true_shooting*usage', 'offensive_box_plus_minus', 'value_over_replacement_player', 'win_shares']

In [None]:
correlations = {"C":{}, "PF":{}, "SF":{}, "SG":{}, "PG":{}}

In [None]:
for feature in features:
    filtered, unfiltered, feature = side_by_side_correlation(c_first_round.loc[c_first_round["minutes_played"] >= 700], c_first_round, feature)
    print(filtered, unfiltered, feature)
    correlations["C"][feature] = filtered
    
for feature in features:
    filtered, unfiltered, feature = side_by_side_correlation(pf_first_round.loc[pf_first_round["minutes_played"] >= 700], pf_first_round, feature)
    correlations["PF"][feature] = filtered
    
for feature in features:
    filtered, unfiltered, feature = side_by_side_correlation(sf_first_round.loc[sf_first_round["minutes_played"] >= 700], sf_first_round, feature)
    correlations["SF"][feature] = filtered
    
for feature in features:
    filtered, unfiltered, feature = side_by_side_correlation(sg_first_round.loc[sg_first_round["minutes_played"] >= 700], sg_first_round, feature)
    print(filtered, unfiltered, feature)
    correlations["SG"][feature] = filtered

for feature in features:
    filtered, unfiltered, feature = side_by_side_correlation(pg_first_round.loc[pg_first_round["minutes_played"] >= 700], pg_first_round, feature)
    correlations["PG"][feature] = filtered

In [None]:
df = pd.DataFrame.from_dict({(i,j): correlations[i][j] 
                            for i in correlations.keys() 
                            for j in correlations[i].keys()},
                            orient='index')
df.index = pd.MultiIndex.from_tuples(df.index)
df = df.reset_index()
df = df.rename(columns={'level_0': "Position", "level_1": "Feature", 0: "Correlation with cap space usage"})

In [None]:
for feature in features:
    print(side_by_side_correlation(c_second_round.loc[c_second_round["minutes_played"] >= 350], c_second_round, feature))

for feature in features:
    print(side_by_side_correlation(pf_second_round.loc[pf_second_round["minutes_played"] >= 350], pf_second_round, feature))

for feature in features:
    print(side_by_side_correlation(sf_second_round.loc[sf_second_round["minutes_played"] >= 350], sf_second_round, feature))

for feature in features:
    print(side_by_side_correlation(sg_second_round.loc[sg_second_round["minutes_played"] >= 350], sg_second_round, feature))

for feature in features:
    print(side_by_side_correlation(pg_second_round.loc[pg_second_round["minutes_played"] >= 350], pg_second_round, feature))