In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
salaries = pd.read_excel('salaries_1985to2020_final.xlsx')
p_stats = pd.read_excel('Complete_Train_Set_With_Cap_Usage.xlsx')
stats_2020 = pd.read_excel('Clean_2019_2020_Data.xlsx')

In [4]:
salary_caps = pd.read_excel('salary_caps.xlsx')

In [5]:
salaries = salaries.rename(columns={'names':'name'})
stats_2020 = stats_2020.merge(salaries.loc[salaries['season_end']==2020], on=['name'])
salary_cap_2020 = salary_caps[salary_caps['season']=='2019-20']['salary_cap']
individual_salaries = stats_2020['salary']
cap_usage = individual_salaries/int(salary_cap_2020)
stats_2020['cap_usage'] = cap_usage

In [7]:
def extract_numerical_data(df):
    columns_to_remove = []
    for feature in df.columns.values:
        if type(df.iloc[0][feature])==str:
            columns_to_remove.append(feature)
    return df.drop(columns=columns_to_remove)

def clean_datasets(p_stats, stats_2020, filter_year):
    p_stats = p_stats.dropna()
    stats_2020 = stats_2020.dropna()
    p_stats = p_stats.loc[p_stats['season_end'] >= filter_year]
    stats_2020 = stats_2020.loc[stats_2020['season_end'] >= filter_year]
    X_train = extract_numerical_data(p_stats)
    X_test = extract_numerical_data(stats_2020)
    X_train = X_train.drop(columns=['Unnamed: 0', 'salary'])
    X_test = X_test.drop(columns=['Unnamed: 0', 'salary'])
    X_train = X_train.dropna()
    X_test = X_test.dropna()
    X_train = X_train.loc[X_train['season_end'] >= filter_year]
    X_test = X_test.loc[X_test['season_end'] >= filter_year]
    y_train = X_train[['cap_usage']]
    y_test = X_test[['cap_usage']]
    X_train = X_train.drop(columns=['cap_usage'])
    X_test = X_test.drop(columns=['cap_usage'])
    X_train = X_train.drop(columns=['salary_cap_x', 'salary_cap_y'])
    print("is the cleaning is successful?:", X_train.columns.values==X_test.columns.values)
    return X_train, X_test, y_train, y_test, p_stats, stats_2020



In [8]:
X_train, X_test, y_train, y_test, p_stats, stats_2020 = clean_datasets(p_stats, stats_2020, 2015)

is the cleaning is successful?: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]


In [14]:
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge

In [15]:
features = ['points', 'made_field_goals', 'minutes_played', 'defensive_rebounds', 'win_shares', 'value_over_replacement_player', 'turnovers']
X_train_new = X_train[features]
X_test_new = X_test[features]

In [17]:
def model_performance(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    print("score for the training set: ", model.score(X_train, y_train))
    print("score for the test set: ", model.score(X_test, y_test))
    print("mean squared error for the training set: ", mean_squared_error(model.predict(X_train), y_train))
    print("mean squared error for the test set: ", mean_squared_error(model.predict(X_test), y_test))

## model building starts here

#### model 1: boosting

In [39]:
import sklearn.ensemble as ske

In [19]:
from sklearn.ensemble import AdaBoostRegressor

In [42]:
model = AdaBoostRegressor(base_estimator = ske.RandomForestRegressor(), n_estimators=5)
model_performance(model, X_train_new, y_train, X_test_new, y_test)

  return f(**kwargs)


score for the training set:  0.8618448849280507
score for the test set:  0.4156135586541623
mean squared error for the training set:  0.0007486531695624428
mean squared error for the test set:  0.0034437716246513653


In [25]:
test_with_name = pd.concat([p_stats[['name']], X_train_new], axis=1)
test_with_name = pd.concat([stats_2020[['name']], X_test_new], axis=1)

In [30]:
def make_dataframe_of_results(test_df, y_test, model, features):
    #pass in a fitted model
    prediction = pd.DataFrame(columns=['name', 'prediction', 'actual'])
    prediction['name'] = test_df['name']
    prediction['prediction'] = model.predict(test_df[features])
    prediction['actual'] = y_test
    return prediction

In [38]:
prediction1 = make_dataframe_of_results(test_with_name, y_test, model, features)
prediction1.loc[prediction1['name']=='Mike Conley']

Unnamed: 0,name,prediction,actual
139,Mike Conley,0.113835,0.297889


In [37]:
model_linear = LinearRegression()
model_linear.fit(train_with_name[features], y_train)
prediction_linear = make_dataframe_of_results(test_with_name, y_test, model_linear, features)
prediction_linear.loc[prediction_linear['name']=='Mike Conley']

Unnamed: 0,name,prediction,actual
139,Mike Conley,0.094374,0.297889


#### model 2: random forest regressor

In [52]:

model_rf = ske.RandomForestRegressor(max_depth = 5, min_samples_split=3, min_samples_leaf=1)
model_rf.fit(train_with_name[features], y_train)
prediction_rf = make_dataframe_of_results(test_with_name, y_test, model_rf, features)
model_performance(model_rf, train_with_name[features], y_train, test_with_name[features], y_test)

  model_rf.fit(train_with_name[features], y_train)
  model.fit(X_train, y_train)


score for the training set:  0.5512960913636552
score for the test set:  0.43932254851400165
mean squared error for the training set:  0.002431495954534233
mean squared error for the test set:  0.0033040552644626755


In [53]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

#stadnardiztion
#featuesecltion
#training
#validation
#test
#predict

In [54]:
pipe = Pipeline([('regressor', ske.RandomForestRegressor())])
samples_split = np.arange(2, 5)
samples_leaf = np.arange(1, 4)

params = [{'regressor__min_samples_split': samples_split, 'regressor__min_samples_leaf':samples_leaf, }]

gridsearch = GridSearchCV(pipe, params, verbose=1).fit(train_with_name[features], np.ravel(y_train))
print('The best score is: ', gridsearch.best_score_)
print('The best parameters are: ', gridsearch.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  1.3min finished


The best score is:  0.4433892261648409
The best parameters are:  {'regressor__min_samples_leaf': 3, 'regressor__min_samples_split': 3}


In [60]:
pipe = Pipeline([('regressor', ske.RandomForestRegressor())])
samples_split = np.arange(2, 5)
samples_leaf = np.arange(1, 4)
                 
params = [{'regressor__min_samples_split': samples_split, 'regressor__min_samples_leaf': samples_leaf, 'regressor__max_features':np.arange(1, 8), 'regressor__max_depth':np.arange(2, 8), 'regressor__n_estimators': np.arange(10, 200, 20)}]

gridsearch = GridSearchCV(pipe, params, verbose=1).fit(train_with_name[features], np.ravel(y_train))
print('The best score is: ', gridsearch.best_score_)
print('The best parameters are: ', gridsearch.best_params_)

Fitting 5 folds for each of 3780 candidates, totalling 18900 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
pipe = Pipeline([('regressor', ske.RandomForestRegressor())])
samples_split = np.arange(2, 5)
samples_leaf = np.arange(1, 4)
                 
params = [{'regressor__min_samples_split': [3], 'regressor__min_samples_leaf': [3], 'regressor__max_features':np.arange(1, 8, 2), 'regressor__max_depth':np.arange(3, 6), 'regressor__n_estimators': np.arange(10, 200, 40)}]

gridsearch = GridSearchCV(pipe, params, verbose=1).fit(train_with_name[features], np.ravel(y_train))
print('The best score is: ', gridsearch.best_score_)
print('The best parameters are: ', gridsearch.best_params_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
