In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Load the three Excel workbooks used by the rest of the notebook.
# The paths are relative, so they are resolved against the kernel's working
# directory; the recorded output of this cell is a FileNotFoundError for the
# first file, meaning the workbooks were not present when it was last run.
salaries = pd.read_excel('salaries_1985to2020_final.xlsx')
p_stats = pd.read_excel('Complete_Train_Set_With_Cap_Usage.xlsx')
stats_2020 = pd.read_excel('Clean_2019_2020_Data.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: 'salaries_1985to2020_final.xlsx'

In [None]:
# Load the salary-cap workbook; later cells read its 'season' and 'salary_cap' columns.
salary_caps = pd.read_excel('salary_caps.xlsx')

In [None]:
# Display the salary rows whose season_end equals 2020.
salaries.loc[salaries['season_end']==2020]

In [None]:
# Rename the 'names' column to 'name' so it can serve as the merge key used in the next cell.
salaries = salaries.rename(columns={'names':'name'})

In [None]:
# Attach the 2020 salary rows to the 2019-20 stats, matching on 'name'
# (DataFrame.merge defaults to an inner join, so unmatched players are dropped).
# The cell rebinds stats_2020, so running it twice merges the salary columns again.
stats_2020 = stats_2020.merge(salaries.loc[salaries['season_end']==2020], on=['name'])

In [None]:
# Preview the first rows of the merged stats/salary frame.
stats_2020.head()

In [None]:
# Select the 'salary_cap' value(s) of the rows whose 'season' is '2019-20'.
# The result is a pandas Series, not a scalar.
salary_cap_2020 = salary_caps[salary_caps['season']=='2019-20']['salary_cap']
# Display the cap as an int.
# NOTE(review): int() on a Series relies on it holding exactly one element —
# confirm salary_caps has a single '2019-20' row.
int(salary_cap_2020)

In [None]:
# Express every player's salary as a fraction of the 2019-20 salary cap and
# store it on stats_2020 as the 'cap_usage' column.
individual_salaries = stats_2020['salary']
cap_usage = individual_salaries/int(salary_cap_2020)
stats_2020['cap_usage'] = cap_usage

In [None]:
def extract_numerical_data(df):
    """Return a copy of ``df`` without its text columns.

    A column counts as text when the value in the FIRST row is exactly of type
    ``str``; no other row is inspected. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least one row (the first row is read positionally).

    Returns
    -------
    pandas.DataFrame
        ``df`` minus every column whose first-row value is a ``str``.
    """
    first_row = df.iloc[0]
    text_columns = [
        column for column in df.columns.values
        if type(first_row[column]) is str
    ]
    return df.drop(columns=text_columns)

In [None]:
def clean_datasets(p_stats, stats_2020, filter_year):
    """Turn the raw training and 2019-20 frames into model-ready splits.

    Both frames lose every row containing a missing value and every row whose
    ``season_end`` is earlier than ``filter_year``. The feature matrices are
    then built from the non-text columns (see ``extract_numerical_data``)
    minus the bookkeeping columns 'Unnamed: 0' and 'salary'; 'cap_usage' is
    split off as the target. 'salary_cap_x' and 'salary_cap_y' are removed
    from the training features only.

    Parameters
    ----------
    p_stats : pandas.DataFrame
        Training frame; must contain 'season_end', 'Unnamed: 0', 'salary',
        'cap_usage', 'salary_cap_x' and 'salary_cap_y'.
    stats_2020 : pandas.DataFrame
        Test frame; must contain 'season_end', 'Unnamed: 0', 'salary' and
        'cap_usage'.
    filter_year : int
        Earliest ``season_end`` to keep.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, p_stats, stats_2020)`` where the
        last two items are the row-filtered (but otherwise complete) frames.
    """
    # Row-level cleaning. Doing it once on the full frames is sufficient: a row
    # with no missing value in any column has none in a subset of the columns,
    # so repeating dropna / the year filter on the feature matrices is a no-op.
    p_stats = p_stats.dropna()
    p_stats = p_stats.loc[p_stats['season_end'] >= filter_year]
    stats_2020 = stats_2020.dropna()
    stats_2020 = stats_2020.loc[stats_2020['season_end'] >= filter_year]

    # Column-level cleaning: numeric columns only, minus bookkeeping columns.
    X_train = extract_numerical_data(p_stats).drop(columns=['Unnamed: 0', 'salary'])
    X_test = extract_numerical_data(stats_2020).drop(columns=['Unnamed: 0', 'salary'])

    # Split the response off the features.
    y_train = X_train[['cap_usage']]
    y_test = X_test[['cap_usage']]
    X_train = X_train.drop(columns=['cap_usage', 'salary_cap_x', 'salary_cap_y'])
    X_test = X_test.drop(columns=['cap_usage'])

    # Compare the column lists as plain lists: this yields one bool and still
    # works when the two frames have a different number of columns, whereas an
    # element-wise `==` on differently sized arrays is an error in recent NumPy.
    same_columns = list(X_train.columns) == list(X_test.columns)
    print("is the cleaning is successful?:", same_columns)
    return X_train, X_test, y_train, y_test, p_stats, stats_2020

# Build the modelling splits from seasons ending in 2015 or later.
# p_stats and stats_2020 are rebound to the row-filtered frames that clean_datasets returns.
X_train, X_test, y_train, y_test, p_stats, stats_2020 = clean_datasets(p_stats, stats_2020, 2015)

# model building starts here


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge

In [None]:
def identify_features_linear(X, y):
    """Rank features by how well each one alone explains ``y``.

    For every column of ``X`` a default ``Ridge`` model is fitted on that
    single column and scored on the same data with ``model.score``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Response passed unchanged to ``Ridge.fit`` / ``Ridge.score``.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'r_squared_value', sorted by ascending
        'r_squared_value' (the strongest single feature is the last row).
    """
    # Collect the rows in a list and build the frame once: DataFrame.append no
    # longer exists in pandas 2.x, and growing a frame row by row is quadratic.
    records = []
    for feature in X.columns.values:
        single_feature = X[[feature]]
        model = Ridge().fit(single_feature, y)
        records.append({'feature': feature, 'r_squared_value': model.score(single_feature, y)})
    # Passing `columns` keeps both columns present even when X has no columns.
    rankings_df = pd.DataFrame(records, columns=['feature', 'r_squared_value'])
    return rankings_df.sort_values(by=['r_squared_value'])

In [None]:
# Display the single-feature ranking for the training split (sorted ascending by score).
identify_features_linear(X_train, y_train)

In [None]:
# Hand-picked feature subset used by every model below
# (presumably chosen from the ranking above — TODO confirm the selection rule).
features = ['points', 'made_field_goals', 'minutes_played', 'defensive_rebounds', 'win_shares', 'value_over_replacement_player', 'turnovers']
# Restrict both splits to those columns.
X_train_new = X_train[features]
X_test_new = X_test[features]

In [None]:
def model_performance(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its fit quality.

    Four lines are printed, in this order: ``model.score`` on the training
    split, ``model.score`` on the test split, then the mean squared error of
    the predictions on the training split and on the test split.

    The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    print("score for the training set: ", train_score)
    test_score = model.score(X_test, y_test)
    print("score for the test set: ", test_score)

    train_predictions = model.predict(X_train)
    train_mse = mean_squared_error(train_predictions, y_train)
    print("mean squared error for the training set: ", train_mse)
    test_predictions = model.predict(X_test)
    test_mse = mean_squared_error(test_predictions, y_test)
    print("mean squared error for the test set: ", test_mse)

# linear regression model 1

In [None]:
# Baseline: ordinary least squares on the selected features.
model = LinearRegression()
model_performance(model, X_train_new, y_train, X_test_new, y_test)

# linear regression model 2 -- with boosting

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Boosted linear regression (50 boosting rounds).
# The base learner is passed positionally: the keyword for it was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form is accepted under both names.
# random_state pins the boosting resampling so a re-run prints the same scores.
model = AdaBoostRegressor(LinearRegression(), n_estimators=50, random_state=0)
model_performance(model, X_train_new, y_train, X_test_new, y_test)

# linear regression model 3 -- with bagging

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
# Bagged linear regression: 100 learners, each trained on half of the rows and
# on 7 features (i.e. all of the selected features).
# The base learner is passed positionally: the keyword for it was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form is accepted under both names.
model = BaggingRegressor(LinearRegression(), n_estimators=100, random_state=0, max_features=7, max_samples=0.5)
model_performance(model, X_train_new, y_train, X_test_new, y_test)

# support vector regression model 4 -- with hyper-parameter tuning

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Grid-search an SVR over three kernels and the C values
# 0.1, 0.35, ... (step 0.25, stopping before 2.5).
kernels = ['linear', 'poly', 'rbf']
params = [
    {
        'regressor__kernel': kernels,
        'regressor__C': np.arange(0.1, 2.5, 0.25),
    }
]
pipe = Pipeline(steps=[('regressor', SVR())])

# SVR expects a 1-D target, hence the ravel of the single-column y_train.
gridsearch = GridSearchCV(pipe, params, verbose=1)
gridsearch.fit(X_train_new, np.ravel(y_train))
print('The best score is: ', gridsearch.best_score_)
print('The best parameters are: ', gridsearch.best_params_)

In [None]:
# Linear-kernel SVR with otherwise default settings.
# NOTE(review): the kernel is hard-coded here rather than taken from
# gridsearch.best_params_ — confirm it matches the search result.
model = SVR(kernel='linear')
model_performance(model, X_train_new, y_train, X_test_new, y_test)

# train model without outliers -- model 5

In [None]:
# Reference fit before any outlier removal: same estimator and same data as
# linear regression model 1. The outlier filtering itself happens in the
# "train model (2)" / "train model (3)" cells further down.
model = LinearRegression()
model_performance(model, X_train_new, y_train, X_test_new, y_test)

In [None]:
def model_builder(df, features, response = 'cap_usage'):
    """Fit an ordinary least-squares model on ``df``.

    df: dataframe holding the feature columns and the response column.
    features: list of column names used as regressors.
    response: name of the column to predict (default 'cap_usage').

    Prints the fitted coefficients and the score on the training data, then
    returns the fitted LinearRegression. Features are used unscaled.
    """
    design = df[features]
    target = df[[response]]
    fitted = LinearRegression()
    fitted.fit(design, target)
    print('The coefficients are {}'.format(fitted.coef_))
    print("the R^2 score is ", fitted.score(design, target))
    return fitted

def predict_player(test_df, model, name_of_player, features):
    """Predict the response for a single player.

    test_df: dataframe with a 'name' column and the ``features`` columns.
    model: fitted estimator whose ``predict`` returns a 2-D result
        (one row per input row).
    name_of_player: value looked up in the 'name' column.
    features: the same metrics that were used to build the model.

    Returns ``([name_of_player, prediction], prediction)``. When several rows
    match the name, the prediction for the first matching row is returned.

    Raises ValueError when no row of ``test_df`` matches ``name_of_player``.
    """
    # get_params has to be called; printing the bare attribute only shows the
    # bound-method repr instead of the model's parameters.
    print(model.get_params())
    player_only = test_df[test_df['name'] == name_of_player]
    if player_only.empty:
        # Fail with an explicit message instead of an opaque error from
        # predicting on (or indexing into) an empty selection.
        raise ValueError('no rows found for player {!r}'.format(name_of_player))
    ready_for_test = player_only[features]
    prediction = model.predict(ready_for_test)[0][0]
    return [name_of_player, prediction], prediction

def predict_all(test_df, model, features):
    """Predict the response for every complete row of ``test_df``.

    Rows containing any missing value are dropped first. ``model.predict``
    must return a 2-D result; its first column is used as the prediction.

    Prints the mean of all predictions and returns a pair:
    a list of ``[player_name, prediction]`` pairs and a list of the
    predictions alone, both in row order.
    """
    complete_rows = test_df.dropna()
    players = complete_rows['name'].values
    predictions = model.predict(complete_rows[features])
    print('The mean of cap usage predictions for all players is {}'.format(np.mean(predictions)))
    row_numbers = range(len(players))
    predicted_values = [predictions[row, 0] for row in row_numbers]
    named_predictions = [[players[row], predicted_values[row]] for row in row_numbers]
    return named_predictions, predicted_values
def do_all(train_df, test_df, features, response = 'cap_usage', predict_all_bool = False, player_names = ['Steven Adams']):
    """Fit a model on ``train_df`` and predict on ``test_df``.

    With ``predict_all_bool`` set, every complete row of ``test_df`` is
    predicted and the result of ``predict_all`` is returned unchanged.
    Otherwise each name in ``player_names`` is predicted individually and the
    function returns ``(predictions, values)``: the ``[name, prediction]``
    pairs and the bare predictions, in the order of ``player_names``.
    """
    model = model_builder(train_df, features, response)
    if predict_all_bool:
        return predict_all(test_df, model, features)
    outcomes = [predict_player(test_df, model, player, features) for player in player_names]
    predictions = [named_pair for named_pair, _ in outcomes]
    values = [value for _, value in outcomes]
    return predictions, values

In [None]:
# Put the selected features and the response side by side (column-wise concat)
# for the training and the test split.
train_all = pd.concat([X_train_new, y_train], axis=1)
test_all = pd.concat([X_test_new, y_test], axis=1)

In [None]:
#train model (1)
# Fit on the cleaned training frame and predict the same rows back, so the
# per-player errors computed next are in-sample errors.
names, values = do_all(p_stats, p_stats, features=X_train_new.columns.values, predict_all_bool = True)

def make_dataframe_of_results(names_values_df, ref_df):
    """Tabulate predicted vs. actual cap usage and print the mean squared error.

    names_values_df: sequence of ``[player_name, predicted_cap_usage]`` pairs.
    ref_df: dataframe whose 'cap_usage' column holds the actual values, in the
        same row order as the pairs (its index is discarded; rows are matched
        by position).

    Returns a dataframe with the columns 'player_name', 'predicted_cap_usage',
    'actual_cap_usage', 'error' (predicted minus actual) and 'error_squared'.
    """
    table = pd.DataFrame(names_values_df, columns=['player_name', 'predicted_cap_usage'])
    # Renumber the reference rows 0..n-1 so they line up with the table's rows.
    renumbered = ref_df['cap_usage'].reset_index()
    table['actual_cap_usage'] = renumbered['cap_usage']
    residual = table['predicted_cap_usage'] - table['actual_cap_usage']
    table['error'] = residual
    table['error_squared'] = table['error'] * table['error']
    print('mean squared error is', np.mean(table['error_squared']))
    return table
# Per-player errors of model (1), scored against the frame it was trained and predicted on.
results = make_dataframe_of_results(names, p_stats)

In [None]:
#train model (2)

#filter out outliers
# Keep only the rows whose squared error under model (1) is at most 0.01.
filter_out = results['error_squared']<=.01
# drop=True renumbers the rows 0..n-1 so they line up with `filter_out`
# without inserting the old index as an extra 'index' column.
p_stats_2 = p_stats.reset_index(drop=True)[filter_out]

names, values = do_all(p_stats_2, p_stats_2, features=X_train_new.columns.values, predict_all_bool = True)

# Score against p_stats_2, the frame the predictions were made on. Scoring
# against the unfiltered p_stats pairs each prediction with the wrong row.
# make_dataframe_of_results is already defined in the "train model (1)" cell,
# so it is not redefined here.
results = make_dataframe_of_results(names, p_stats_2)

In [None]:
#train model (3)

#filter out outliers
# Keep only the rows whose squared error under model (2) is at most 0.1.
filter_out = results['error_squared']<=.1
# drop=True renumbers the rows 0..n-1 so they line up with `filter_out`
# without adding another leftover index column to the training frame.
p_stats_3 = p_stats_2.reset_index(drop=True)[filter_out]

names, values = do_all(p_stats_3, p_stats_3, features=X_train_new.columns.values, predict_all_bool = True)

# make_dataframe_of_results is already defined in the "train model (1)" cell,
# so it is not redefined here.
results = make_dataframe_of_results(names, p_stats_3)

In [None]:
# Rebuild the splits with the outlier-filtered p_stats_3 as the training frame.
# This rebinds X_train / X_test / y_train / y_test; X_train_new and X_test_new
# are NOT recomputed by this cell.
X_train, X_test, y_train, y_test, p_stats_3, stats_2020 = clean_datasets(p_stats_3, stats_2020, 2015)

In [None]:
#see model performance on 2019-20 data
# Select the model features from the splits rebuilt in the previous cell.
# X_train_new / X_test_new still hold the rows from before the outlier
# filtering, so they no longer match the rebuilt y_train row for row.
model = LinearRegression()
model_performance(model, X_train[features], y_train, X_test[features], y_test)