In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.feature_selection import SelectKBest, f_regression
import smogn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error

In [2]:
# One random seed per experiment repetition: the integers 0 through 47.
seeds = np.arange(48)

In [14]:
def preparation(seed):
    """ Builds the train / validation / test sets for one seed
        steps:
            1. read the free-agent stats sheet and keep rows with poss >= 500
            2. remove every row that also matches a row of the season-ending-injury
               sheet on (clean_season, Player)
            3. keep only non-object columns and drop the listed bookkeeping columns
            4. hold out 15% of the rows as the test set
            5. oversample the remaining rows with three smogn.smoter passes,
               pool the results, drop duplicate rows and shuffle
            6. keep the 10 features with the highest f_regression score
            7. split the resampled rows into training (85%) and validation (15%)

        parameters:
            seed
                int, random state used for both splits and for the shuffle

        returns:
            X_train, y_train, X_val, y_val, X_test, y_test
                X_* are DataFrames restricted to the 10 selected features.
                y_train and y_val are single-column DataFrames, y_test is a Series.
    """
    stats = pd.read_excel("../spring21_data/fa_stats_v3.xlsx")
    stats = stats[stats['poss'] >= 500]

    # Rows present in both sheets are players with a season-ending injury; exclude them.
    injury = pd.read_excel("../spring21_data/season_ending_injury.xlsx")
    injury_fa = stats.merge(right=injury, how='inner', left_on=['clean_season', 'Player'], right_on=['clean_season', 'Player']).rename(columns={'Unnamed: 0': 'index'})
    stats = stats[~stats['Unnamed: 0'].isin(injury_fa['index'])]

    stats = stats.select_dtypes(exclude=['object'])
    stats = stats.drop(columns=["Unnamed: 0", "Minutes", "salary", "clean_season", "age", "end season_y", "season_end"])

    X = stats.drop(columns=["cap_space_usage"])
    y = stats["cap_space_usage"]

    # The test set is split off BEFORE resampling, so it only contains original rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=seed)
    both = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)

    # Three SMOGN passes (high / mid / low), run in this order on the same input frame.
    smogn_passes = [
        dict(rel_xtrm_type='high', rel_thres=0.8, samp_method='balance'),
        dict(rel_xtrm_type='both', rel_thres=0.2, samp_method='extreme'),
        dict(rel_xtrm_type='both', rel_thres=0.05, samp_method='balance'),
    ]
    # NOTE(review): smogn.smoter is presumably stochastic as well and is not seeded
    # here -- confirm whether it needs explicit seeding for full reproducibility.
    resampled = [
        smogn.smoter(data=both,
                     y='cap_space_usage',
                     k=9,
                     drop_na_col=True,
                     drop_na_row=True,
                     **smogn_pass)
        for smogn_pass in smogn_passes
    ]
    smote_data = pd.concat(resampled, ignore_index=True).drop_duplicates()

    # Shuffle the rows so the later cross validation folds are not ordered by SMOGN pass.
    # The shuffle is seeded so that the same seed always yields the same split.
    sm = smote_data.sample(frac=1, random_state=seed)
    y = sm[['cap_space_usage']]
    X = sm.drop(columns=['cap_space_usage'])

    # Only the univariate scores are needed, so k="all" avoids an error on narrow inputs.
    # f_regression expects a 1-D target, hence the ravel.
    select = SelectKBest(f_regression, k="all")
    select.fit(X, y.values.ravel())
    feature_scores = pd.DataFrame({"features": X.columns, "score": select.scores_})
    features = list(feature_scores.sort_values(by=["score"], ascending=False).head(10)['features'])

    X = X[features]
    X_test = X_test[features]

    # NOTE(review): the validation split is drawn from the resampled data, so it
    # presumably contains synthetic rows; only the test set is untouched.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=seed)

    return X_train, y_train, X_val, y_val, X_test, y_test

helper functions
run(model, model_name, x_train, x_val) fits the model on the training set, then evaluates the fit on both the training set and the validation set.

evaluate(y_pred, y_test) prints the evaluation metrics (r^2 and explained variance) of a prediction and returns the explained variance score.

In [5]:
def run(model, model_name, x_train, x_val, y_train=None, y_val=None):
    """ Fits the model and prints out results from calling evaluate on the training and validation set
        parameters:
            model
                the unfitted model, from sklearn (has to have a fit and predict method)
            model_name
                String, the name of the model
            x_train
                DataFrame, the training set
            x_val
                DataFrame, the validation set
            y_train
                optional, the training targets; when omitted, the notebook-level
                variable named y_train is used (the original behaviour)
            y_val
                optional, the validation targets; when omitted, the notebook-level
                variable named y_val is used (the original behaviour)

        returns:
            nothing

        raises:
            NameError
                if the targets are neither passed in nor defined at notebook level
    """
    # Prefer explicit targets; fall back to notebook globals so older calls keep working.
    if y_train is None:
        y_train = globals().get("y_train")
    if y_val is None:
        y_val = globals().get("y_val")
    if y_train is None or y_val is None:
        raise NameError("run() needs y_train and y_val: pass them explicitly "
                        "or define them at notebook level")

    model = model.fit(x_train, y_train)
    y_pred = np.clip(model.predict(x_train), 0, 1000) # since negative results are meaningless

    print("[{0}] These are the results for the training set.".format(model_name))

    evaluate(y_pred, y_train)
    print("--------------------------------------")

    print("[{0}] These are the results for the validation set.".format(model_name))
    y_pred = np.clip(model.predict(x_val), 0, 1000) # since negative results are meaningless

    evaluate(y_pred, y_val)

    print("_____________________________________________")

In [6]:
def evaluate(y_pred, y_test):
    """ Reports how well a set of predictions matches the actual values
        parameters:
            y_pred
                DataFrame or array-like, the predictions
            y_test
                DataFrame or array-like, the actual y-values

            these two parameters must have the same dimensions

        returns:
            the explained variance score of the predictions
    """
    print("r^2: ", r2_score(y_test, y_pred))
    variance_score = explained_variance_score(y_test, y_pred)
    print("variance_score: ", variance_score)
    return variance_score

In [None]:
# NOTE(review): these two imports repeat the ones in the first cell of the notebook.
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_squared_log_error
# Fit and report each model on the training / validation sets.
# NOTE(review): `lasso`, `elasticNet` and `lassoLars` are only assigned in a later cell,
# and no notebook-level X_train / X_val assignment is visible above this point, so this
# cell presumably fails on a fresh top-to-bottom run -- TODO confirm the intended cell order.
run(lasso, "Lasso", X_train,  X_val)
run(elasticNet, "ElasticNetCV", X_train,  X_val)
run(lassoLars, "Lasso LARS", X_train,  X_val)

In [15]:
from joblib import Parallel, delayed
def store(seed):
    """ Runs one full experiment for a seed and scores it on the held-out test set
        parameters:
            seed
                int, passed to preparation to build the data splits

        returns:
            the value returned by evaluate for the test-set predictions
    """
    # The validation split is not needed here: LassoLarsCV cross-validates internally.
    train_X, train_y, _, _, test_X, test_y = preparation(seed)
    fitted = LassoLarsCV(cv=7).fit(train_X, train_y)
    test_predictions = fitted.predict(test_X)
    return evaluate(test_predictions, test_y)

# Run store(seed) for every seed through joblib with n_jobs=12; `results` holds one
# returned score per seed.
results = Parallel(n_jobs=12)(delayed(store)(seed) for seed in seeds)

In [16]:
# Show the per-seed scores returned by store (the explained variance on each test set).
results

[0.5607701133590086,
 0.6807372893519879,
 0.7081317709668267,
 0.803911085457428,
 0.6186652849835786,
 0.5536890484472761,
 0.7848786444837904,
 0.5380353218575276,
 0.8211114215036499,
 0.7565450887437738,
 0.5927203211546082,
 0.6014361081723946,
 0.7316900575016609,
 0.7352776267077323,
 0.7589797015928377,
 0.5129405583929796,
 0.7693517375968494,
 0.6800108284730655,
 0.6675079306850489,
 0.7130305659185092,
 0.522698093419063,
 0.7202236966216,
 0.6807109394932493,
 0.5689795129620209,
 0.7895913303657802,
 0.709576283037358,
 0.6156674485058642,
 0.6283996302553395,
 0.6267116143401924,
 0.7254839744767227,
 0.7040583801750749,
 0.7930557440523409,
 0.6352790455782862,
 0.6826987718622513,
 0.470253844190071,
 0.6952940240274981,
 0.6057003365857316,
 0.6226144872130414,
 0.6020772845495697,
 0.8165002938997583,
 0.5713763752225922,
 0.40852732114448453,
 0.3679072582290931,
 0.7201682566737162,
 0.6549800457653094,
 0.6273740757043038,
 0.7591871824269717,
 0.7150787500682314]

In [17]:
# Mean of the per-seed test-set scores.
np.mean(results)

0.6589498855457511

In [None]:
# Put the model's test-set predictions next to the identifying columns of each test row.
# NOTE(review): `X_test` and `lassoLars` are read from the notebook namespace; no
# notebook-level assignment of `X_test` is visible above this cell -- TODO confirm.
# NOTE(review): this rebinds `results`, which earlier held the list of per-seed scores.
data = pd.read_excel("../spring21_data/fa_stats_v3.xlsx")
test_index = X_test.index
# test_index holds row labels, so select by label (.loc) rather than by position (.iloc);
# the two only coincide while the sheet keeps its default 0..n-1 index.
test_data = data.loc[test_index, ['clean_season','Player','MPG', 'attempted_field_goals', 'minutes_played', 'WS', 'Wins Added', 'poss','cap_space_usage']]
# Name the prediction column so it does not show up as an anonymous `0`.
predictions = pd.DataFrame(index=test_index, data=lassoLars.predict(X_test), columns=["predicted_cap_space_usage"])
results = pd.concat([test_data, predictions], axis=1)


In [None]:
# Show the table built in the previous cell (test rows alongside their predictions).
results

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the features with all degree-2 polynomial terms, then compare five linear models.
# NOTE(review): X_train / X_val are read from the notebook namespace; no notebook-level
# assignment is visible above this cell -- TODO confirm where they are defined.
poly = PolynomialFeatures(degree=2)

# Fit the expansion on the training set only and reuse it for the validation set.
X_trainPF = poly.fit_transform(X_train)
X_valPF = poly.transform(X_val)

lstsq = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV(max_iter=50000)
elasticNet = ElasticNetCV(max_iter=50000, n_alphas = 1000)
lassoLars = LassoLarsCV()

# Same models, same order and same labels as before, without the copy-pasted calls.
for _model, _label in (
    (ridge, "Ridge"),
    (lstsq, "Least Squares"),
    (lasso, "Lasso"),
    (elasticNet, "ElasticNetCV"),
    (lassoLars, "Lasso LARS"),
):
    run(_model, _label, X_trainPF, X_valPF)

In [None]:
# Score the Lasso LARS model on the test set, using the same polynomial expansion.
# NOTE(review): unlike run(), the predictions are not clipped to [0, 1000] here -- confirm
# whether that difference is intended.
evaluate(lassoLars.predict(poly.transform(X_test)), y_test)