In [102]:
import os
import pandas as pd
import numpy as np

In [103]:
# Read the raw game log and keep an untouched snapshot for later lookups.
df = pd.read_csv('butler.csv')
original = df.copy()

# Discard every column whose name matches the 'RANK' regex.
df = df.drop(columns=df.filter(regex='RANK').columns)


In [104]:
# Build the prediction targets: shift(-1) places the following row's stat on
# the current row, so the last row ends up with NaN targets.
df = df.assign(
    next_pts=df['PTS'].shift(-1),
    next_asts=df['AST'].shift(-1),
    next_rbs=df['REB'].shift(-1),
    next_fg3=df['FG3M'].shift(-1),
)

In [105]:
# Look up the object-dtype columns (not rendered: a cell only displays its
# last expression).
df.dtypes.loc[df.dtypes == "object"]
# Preview the frame without "WL" and "SEASON_YEAR". The result is not assigned,
# so df itself still contains both columns afterwards.
df.drop(columns=["WL", "SEASON_YEAR"])

Unnamed: 0,TEAM_ABBREVIATION,GAME_DATE,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM,OPPONENT,HOME,next_pts,next_asts,next_rbs,next_fg3
0,BKN,2021-10-19T00:00:00,30.633333,6,16,0.375,4,8,0.500,4,...,0.000,1.000,0.000,1.000,MIL,0,20.0,8.0,7.0,3.0
1,BKN,2021-10-22T00:00:00,38.416667,7,17,0.412,3,7,0.429,3,...,0.333,0.667,0.286,0.714,PHI,0,15.0,8.0,7.0,2.0
2,BKN,2021-10-24T00:00:00,33.146667,6,16,0.375,2,8,0.250,1,...,0.000,1.000,0.000,1.000,CHA,1,14.0,9.0,6.0,1.0
3,BKN,2021-10-25T00:00:00,30.406667,5,17,0.294,1,8,0.125,3,...,0.000,1.000,0.000,1.000,WAS,1,14.0,7.0,7.0,3.0
4,BKN,2021-10-27T00:00:00,35.600000,4,12,0.333,3,8,0.375,3,...,0.333,0.667,0.500,0.500,MIA,1,29.0,8.0,8.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,PHI,2023-03-29T00:00:00,38.350000,4,14,0.286,2,6,0.333,5,...,0.500,0.500,0.250,0.750,DAL,1,23.0,11.0,3.0,3.0
119,PHI,2023-03-31T00:00:00,37.100000,9,14,0.643,3,6,0.500,2,...,0.000,1.000,0.111,0.889,TOR,1,11.0,6.0,2.0,1.0
120,PHI,2023-04-02T00:00:00,34.300000,4,9,0.444,1,3,0.333,2,...,0.000,1.000,0.250,0.750,MIL,0,20.0,10.0,5.0,4.0
121,PHI,2023-04-04T00:00:00,40.016667,7,17,0.412,4,9,0.444,2,...,0.000,1.000,0.000,1.000,BOS,1,14.0,4.0,4.0,2.0


In [106]:
# Integer-encode the team and opponent abbreviations via categorical codes.
df = df.assign(
    team_code=df['TEAM_ABBREVIATION'].astype("category").cat.codes,
    opponent_code=df['OPPONENT'].astype("category").cat.codes,
)

# fulldf keeps every row; df continues with only the rows free of missing values.
fulldf = df.copy()
df = fulldf.dropna()

In [107]:
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler


In [108]:
# Base estimators: a ridge regressor and a linear-kernel SVR.
rr = Ridge(alpha=0.8)
svrmod = SVR(kernel="linear")

# Multi-target wrapper around the SVR (fitted later on four next-game targets).
mor = MultiOutputRegressor(svrmod)

# Five-split time-series splitter, passed as `cv` to the feature selectors.
split = TimeSeriesSplit(n_splits=5)

# Forward sequential feature selector built on the ridge model.
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=10,
    direction="forward",
    cv=split,
    n_jobs=4,
)

In [109]:
# Snapshot the cleaned (not yet scaled) frame to disk.
df.to_csv('kcsv.csv', index=False)

# Columns that must never be used as predictors: the four targets plus
# several label / identifier columns.
removed_cols = [
    "next_pts",
    "next_asts",
    "next_rbs",
    "next_fg3",
    "SEASON_YEAR",
    "TEAM_ABBREVIATION",
    "OPPONENT",
    "GAME_DATE",
    "WL",
    "team_code",
]

# Every remaining column is a candidate feature (kept as a pandas Index).
selected_cols = df.columns[~df.columns.isin(removed_cols)]

In [110]:

# Min-max scale the candidate feature columns in place.
# Note: the scaler is fitted on every row of df, not just a training window.
scaler = MinMaxScaler()
df.loc[:, selected_cols] = scaler.fit(df[selected_cols]).transform(df[selected_cols])

In [111]:
# Summary statistics of df's numeric columns (run after the scaling cell above).
df.describe()

Unnamed: 0,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,...,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM,HOME,next_pts,next_asts,next_rbs,next_fg3,team_code,opponent_code
count,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,...,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0
mean,0.546239,0.381148,0.446721,0.397943,0.419399,0.459016,0.476225,0.336928,0.366803,0.873574,...,0.603721,0.274713,0.725287,0.516393,21.540984,10.467213,6.934426,2.5,0.639344,0.482193
std,0.168107,0.17655,0.203861,0.1795,0.252838,0.197952,0.249375,0.187723,0.204859,0.146568,...,0.370552,0.224154,0.224154,0.501792,7.433615,3.173195,2.77753,1.511663,0.482171,0.296499
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,4.0,1.0,0.0,0.0,0.0
25%,0.462104,0.285714,0.3,0.295518,0.166667,0.363636,0.35014,0.210526,0.2,0.8,...,0.333,0.030012,0.60024,0.0,16.0,8.0,5.0,1.0,0.0,0.206897
50%,0.562163,0.357143,0.4,0.40056,0.5,0.454545,0.488095,0.315789,0.35,0.889,...,0.667,0.271909,0.728091,1.0,21.0,10.5,7.0,3.0,1.0,0.448276
75%,0.656534,0.5,0.55,0.5,0.5,0.613636,0.637255,0.473684,0.5,1.0,...,1.0,0.39976,0.969988,1.0,26.75,12.0,9.0,3.0,1.0,0.724138
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,39.0,21.0,15.0,6.0,1.0,1.0


In [112]:
# Forward feature selector wrapped around the linear SVR: picks 15 features,
# scored with the time-series splitter.
sfs3 = SequentialFeatureSelector(
    svrmod,
    n_features_to_select=15,
    direction="forward",
    cv=split,
    n_jobs=4,
)

In [113]:
# Run the feature selection against the next-game points target only.
# NOTE(review): this fits on every row of df, while the backtest further down
# only uses df[0:30] — the selected features have therefore seen later games;
# confirm this is acceptable.
sfs3.fit(df[selected_cols], df["next_pts"])


In [114]:
# Names of the columns the selector kept (boolean support mask over selected_cols).
predictors = selected_cols[sfs3.get_support()].tolist()
predictors

['OREB',
 'AST',
 'TOV',
 'TD3',
 'PCT_TOV',
 'PCT_PFD',
 'PTS_PAINT',
 'OPP_PTS_2ND_CHANCE',
 'E_NET_RATING',
 'AST_PCT',
 'OREB_PCT',
 'POSS',
 'PCT_FGA_2PT',
 'PCT_PTS_2PT_MR',
 'PCT_PTS_3PT']

In [115]:
def backtest(data, model, predictors, start=4, step=1,
             targets=("next_pts", "next_asts", "next_rbs", "next_fg3")):
    """Walk-forward backtest of a multi-output model over game dates.

    For every distinct ``GAME_DATE`` from position ``start`` onwards (taking
    every ``step``-th date), the model is refit on all rows with a strictly
    earlier date and then asked to predict the rows of that date.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``GAME_DATE``, every column in ``predictors`` and every
        column in ``targets``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; ``predict`` is
        expected to return one value per target for each input row.
    predictors : list of str
        Feature column names.
    start : int, default 4
        Index (into the sorted unique dates) of the first date to predict.
    step : int, default 1
        Stride between predicted dates.
    targets : sequence of str, optional
        Target column names; defaults to the four next-game stat columns.

    Returns
    -------
    pandas.DataFrame
        The actual target columns followed by one prediction column per
        target, labelled ``0, 1, 2, ...`` in target order and indexed like
        the predicted rows. Empty (but with those columns) when no date
        qualifies for prediction.
    """
    targets = list(targets)
    pred_columns = list(range(len(targets)))
    allpreds = []
    games = sorted(data["GAME_DATE"].unique())

    for i in range(start, len(games), step):
        current_game = games[i]
        train = data[data["GAME_DATE"] < current_game]
        test = data[data["GAME_DATE"] == current_game]

        model.fit(train[predictors], train[targets])

        # Keep one prediction row per test row. Indexing preds[0][k] would
        # broadcast the first row's forecast to every row sharing this date.
        raw_preds = np.asarray(model.predict(test[predictors]))
        preds = pd.DataFrame(
            raw_preds.reshape(len(test), -1),
            index=test.index,
            columns=pred_columns,
        )
        allpreds.append(pd.concat([test[targets], preds], axis=1))

    if not allpreds:
        # pd.concat([]) raises; hand back an empty frame with the usual columns.
        return pd.DataFrame(columns=targets + pred_columns)

    return pd.concat(allpreds)

In [116]:
# Backtest the multi-output SVR on the first 30 rows of df.
predictions = backtest(df.iloc[:30], mor, predictors)



In [117]:
# Forecast from the row at position 30, using the model as last fitted by the
# backtest, laid out as a one-row table with a labelled column per stat.
results = pd.DataFrame(
    [mor.predict(df.iloc[[30]][predictors])[0]],
    columns=["PPTS", "PAST", "PREB", "PFG3M"],
)
results

Unnamed: 0,PPTS,PAST,PREB,PFG3M
0,20.31061,9.145419,6.711265,2.609242


In [118]:
# Unscaled box-score stats of the row at position 30 in the raw data.
# NOTE(review): the model is trained on the next_* targets (shift(-1)), so the
# forecast made from row 30 refers to the following game, whereas this shows
# row 30's own stats — confirm which row is the intended comparison.
original.iloc[[30]][["PTS", "AST", "REB", "FG3M"]]

Unnamed: 0,PTS,AST,REB,FG3M
30,19,8,4,2


In [119]:
# Backtest output: actual next_* columns followed by prediction columns 0-3.
predictions

Unnamed: 0,next_pts,next_asts,next_rbs,next_fg3,0,1,2,3
4,29.0,8.0,8.0,3.0,14.682115,7.859246,6.941992,2.480074
5,18.0,12.0,10.0,4.0,15.225925,7.754183,7.282236,2.835732
6,16.0,11.0,4.0,5.0,16.452186,8.149068,7.397497,2.814714
7,13.0,10.0,10.0,3.0,16.468611,8.642572,6.935167,3.665297
8,28.0,8.0,10.0,3.0,15.711196,10.575323,5.826295,3.885862
9,14.0,5.0,8.0,3.0,15.033118,8.353197,6.676791,3.12933
10,17.0,11.0,11.0,1.0,15.507268,8.612269,7.860853,2.694987
11,39.0,12.0,5.0,6.0,15.787785,7.728209,7.14433,3.456029
12,16.0,13.0,6.0,1.0,16.371038,9.550721,6.979943,2.909883
13,24.0,4.0,4.0,2.0,15.449968,9.493286,6.401256,2.496984


In [120]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the points forecast (prediction column 0) against the
# actual next-game points.
mean_squared_error(y_true=predictions["next_pts"], y_pred=predictions[0])

101.8558920337355

In [121]:
# Distribution of the (unscaled) next-game points target, for context when
# reading the error above.
df["next_pts"].describe()

count    122.000000
mean      21.540984
std        7.433615
min        4.000000
25%       16.000000
50%       21.000000
75%       26.750000
max       39.000000
Name: next_pts, dtype: float64

In [122]:
def player_history(df):
    """Add running-form features to one block of games.

    The rows are sorted by ``GAME_DATE`` and three columns are added:

    * ``game_number`` - 0-based position of the game after sorting.
    * ``pts_corr`` - expanding correlation between ``game_number`` and
      ``PTS`` up to and including each game; undefined values (e.g. the
      first game) are set to 0.
    * ``pts_diff`` - ratio of ``PTS`` to the previous game's ``PTS``.
      Infinite ratios (previous game had 0) are set to 1, and missing ratios
      (first game, or 0/0) are filled with the mean of the remaining ratios.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``GAME_DATE`` and ``PTS``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` with the three columns added. With a single
        row there is no ratio to average, so ``pts_diff`` stays NaN.
    """
    # sort_values returns a new frame, so the caller's data is left untouched.
    df = df.sort_values("GAME_DATE")
    df["game_number"] = range(0, df.shape[0])

    expanding_corr = df[['game_number', 'PTS']].expanding().corr()
    df['pts_corr'] = list(expanding_corr.loc[(slice(None), "game_number"), "PTS"])
    # Assign the filled column back instead of a chained inplace fillna, which
    # silently does nothing under pandas Copy-on-Write.
    df['pts_corr'] = df['pts_corr'].fillna(0)

    pts_diff = df['PTS'] / df['PTS'].shift(1)
    # Neutralise infinities BEFORE taking the mean; otherwise one division by
    # zero makes the mean infinite and corrupts the value used to fill NaN.
    pts_diff[pts_diff == np.inf] = 1
    df['pts_diff'] = pts_diff.fillna(pts_diff.mean())

    return df


In [123]:
# Run player_history on each SEASON_YEAR group separately; group_keys=False
# keeps the group label out of the resulting index.
# NOTE(review): PTS is not listed in removed_cols, so it was min-max scaled in
# an earlier cell; pts_corr / pts_diff are therefore computed on scaled values
# — confirm this is intended.
df = df.groupby("SEASON_YEAR", group_keys=False).apply(player_history)

In [124]:
# pd.Series(svrmod.coef_, index=predictors).sort_values()

In [125]:
# Residuals of the next-game points forecast. backtest() labels the actual
# values "next_pts" and the matching points prediction 0; the frame has no
# "actual" / "prediction" columns, which is why this cell raised KeyError.
diff = predictions['next_pts'] - predictions[0]
diff