In [163]:
import os
import pandas as pd
import numpy as np

In [164]:
# Read the raw game log once and keep the untouched version under `original`.
original = pd.read_csv('butler.csv')
# Working frame: the same data minus every column whose name matches the regex 'RANK'.
df = original.drop(columns=list(original.filter(regex='RANK')))


In [165]:
# Attach the prediction targets: each stat shifted up by one row, so row i holds the
# value found in row i + 1 (the final row has no successor and becomes NaN).
for target_col, stat_col in (
    ("next_pts", "PTS"),
    ("next_asts", "AST"),
    ("next_rbs", "REB"),
    ("next_fg3", "FG3M"),
):
    df[target_col] = df[stat_col].shift(-1)

In [166]:
# Object-typed columns; evaluated but not rendered, since only the cell's last expression is shown.
df.dtypes.loc[df.dtypes == "object"]
# Preview of the frame without the two label columns. The result is not assigned,
# so `df` itself still contains "WL" and "SEASON_YEAR".
df.drop(columns=["WL", "SEASON_YEAR"])

Unnamed: 0,TEAM_ABBREVIATION,GAME_DATE,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM,OPPONENT,HOME,next_pts,next_asts,next_rbs,next_fg3
0,PHI,2022-10-18T00:00:00,37.271667,9,14,0.643,5,9,0.556,12,...,0.2,0.8,0.111,0.889,BOS,0,31.0,9.0,8.0,1.0
1,PHI,2022-10-20T00:00:00,40.331667,13,24,0.542,1,7,0.143,4,...,1.0,0.0,0.308,0.692,MIL,1,12.0,12.0,9.0,1.0
2,PHI,2022-10-22T00:00:00,40.298333,4,18,0.222,1,6,0.167,3,...,1.0,0.0,0.25,0.75,SAS,1,29.0,11.0,9.0,5.0
3,PHI,2022-10-24T00:00:00,34.65,10,18,0.556,5,10,0.5,4,...,0.2,0.8,0.2,0.8,IND,1,18.0,9.0,7.0,3.0
4,PHI,2022-10-26T00:00:00,35.65,5,12,0.417,3,6,0.5,5,...,0.0,1.0,0.0,1.0,TOR,0,11.0,4.0,4.0,0.0
5,PHI,2022-10-28T00:00:00,33.716667,4,9,0.444,0,3,0.0,3,...,0.0,0.0,0.0,1.0,TOR,0,15.0,11.0,6.0,0.0
6,PHI,2022-10-29T00:00:00,35.993333,2,13,0.154,0,5,0.0,11,...,0.0,0.0,0.0,1.0,CHI,0,23.0,17.0,7.0,2.0
7,PHI,2022-10-31T00:00:00,37.616667,8,17,0.471,2,6,0.333,5,...,0.0,1.0,0.0,1.0,WAS,0,24.0,10.0,5.0,3.0
8,PHI,2022-11-02T00:00:00,35.066667,8,18,0.444,3,8,0.375,5,...,0.667,0.333,0.25,0.75,WAS,1,21.0,7.0,4.0,4.0
9,PHI,2022-12-05T00:00:00,38.41,4,19,0.211,4,11,0.364,9,...,0.0,1.0,0.0,1.0,HOU,0,28.0,12.0,4.0,3.0


In [167]:
# Integer-encode the two team-label columns via pandas' categorical codes.
for code_col, label_col in (("team_code", "TEAM_ABBREVIATION"), ("opponent_code", "OPPONENT")):
    df[code_col] = df[label_col].astype("category").cat.codes
# Snapshot every row (including rows with missing values) before filtering.
fulldf = df.copy()
# Keep only the fully populated rows for modelling.
df = fulldf.dropna()

In [168]:
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler


In [169]:
# Time-ordered cross-validation splitter shared by the feature selectors.
split = TimeSeriesSplit(n_splits=5)

# Base estimators.
rr = Ridge(alpha=0.8)
svrmod = SVR(kernel="linear")
# Wraps the linear SVR so it can be fit against several target columns at once.
mor = MultiOutputRegressor(svrmod)

# Forward selection of 15 features, scored with the ridge model on the time-series splits.
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=15,
    direction="forward",
    cv=split,
    n_jobs=4,
)

In [170]:
# Columns excluded from the feature set: the four targets, the raw label/date columns, and team_code.
removed_cols = [
    "next_pts", "next_asts", "next_rbs", "next_fg3",
    "SEASON_YEAR", "TEAM_ABBREVIATION", "OPPONENT", "GAME_DATE", "WL", "team_code",
]
# Every remaining column is a candidate feature; the order of df.columns is preserved.
selected_cols = df.columns[[col not in removed_cols for col in df.columns]]
# Persist the current frame (without its index) for reuse outside the notebook.
df.to_csv('kcsv.csv', index=False)

In [171]:

# Rescale every candidate feature with a min-max scaler.
# NOTE(review): the scaler is fit on all rows of df, including rows that backtest()
# later treats as unseen test games — confirm this look-ahead is acceptable.
scaler = MinMaxScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(df[selected_cols]),
    index=df.index,
    columns=selected_cols,
)
# Replace the columns wholesale on a fresh copy instead of writing through
# `df.loc[:, selected_cols] = ...`: the `.loc` form sets values into the existing
# columns, and several of them hold integers (e.g. FGM, HOME), so storing the float
# results there is an incompatible-dtype assignment that pandas deprecates and
# newer versions reject.
df = df.copy()
df[selected_cols] = scaled_features

In [172]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column of df.
df.describe()

Unnamed: 0,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,...,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM,HOME,next_pts,next_asts,next_rbs,next_fg3,team_code,opponent_code
count,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,...,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0,57.0
mean,0.540252,0.451754,0.42807,0.422871,0.464912,0.528509,0.514546,0.405104,0.481659,0.733228,...,0.520175,0.297024,0.702976,0.561404,20.719298,10.719298,6.070175,2.736842,0.0,0.498246
std,0.184074,0.200413,0.192497,0.192741,0.264906,0.248901,0.254934,0.241179,0.255959,0.266509,...,0.369681,0.238833,0.238833,0.500626,6.723505,3.473134,2.671674,1.564407,0.0,0.319738
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,4.0,1.0,0.0,0.0,0.0
25%,0.455788,0.333333,0.3,0.309524,0.333333,0.375,0.35014,0.181818,0.272727,0.5,...,0.25,0.133253,0.60024,0.0,16.0,8.0,4.0,2.0,0.0,0.2
50%,0.55352,0.416667,0.4,0.421569,0.5,0.5,0.52521,0.363636,0.454545,0.75,...,0.5,0.30012,0.69988,1.0,20.0,11.0,6.0,3.0,0.0,0.48
75%,0.641944,0.583333,0.55,0.553221,0.666667,0.75,0.70028,0.545455,0.636364,1.0,...,0.8,0.39976,0.866747,1.0,26.0,12.0,8.0,4.0,0.0,0.76
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,38.0,21.0,12.0,6.0,0.0,1.0


In [173]:
# Forward selection of 10 features, scored with the linear SVR on the time-series splits.
sfs3 = SequentialFeatureSelector(
    svrmod,
    n_features_to_select=10,
    direction="forward",
    cv=split,
    n_jobs=4,
)

In [174]:
# Run the selection against the points target only ("next_pts").
sfs3.fit(X=df[selected_cols], y=df["next_pts"])


In [175]:
# Names of the features kept by the fitted selector, as a plain Python list.
predictors = selected_cols[sfs3.get_support()].tolist()
predictors

['FG3A',
 'FTM',
 'FTA',
 'AST',
 'PCT_TOV',
 'AST_PCT',
 'POSS',
 'PCT_PTS_PAINT',
 'PCT_AST_2PM',
 'PCT_UAST_2PM']

In [176]:
def backtest(data, model, predictors, start=4, step=1,
             targets=("next_pts", "next_asts", "next_rbs", "next_fg3")):
    """Walk-forward backtest over the distinct values of ``data["GAME_DATE"]``.

    For each date at position ``start``, ``start + step``, ... in the sorted list of
    unique dates, ``model`` is refit on every row dated strictly before that date and
    then asked to predict the rows dated exactly on it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``"GAME_DATE"``, every column in ``predictors`` and every column
        in ``targets``.
    model : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``; it is refit in place on
        every step.
    predictors : list of str
        Feature columns passed to the model.
    start : int, default 4
        Index (into the sorted unique dates) of the first date to predict.
    step : int, default 1
        Stride between predicted dates.
    targets : sequence of str, default the four ``next_*`` columns
        Target columns the model is trained on.

    Returns
    -------
    pandas.DataFrame
        One row per predicted row of ``data`` (original index preserved). The first
        columns are the actual ``targets``; they are followed by integer-labelled
        columns ``0 .. len(targets) - 1`` holding the predictions in target order.

    Raises
    ------
    ValueError
        If no date is reached by ``range(start, n_dates, step)``, so nothing was
        predicted.
    """
    targets = list(targets)
    games = sorted(data["GAME_DATE"].unique())
    allpreds = []

    for i in range(start, len(games), step):
        current_game = games[i]
        train = data[data["GAME_DATE"] < current_game]
        test = data[data["GAME_DATE"] == current_game]

        model.fit(train[predictors], train[targets])

        # Keep one prediction row per test row. Indexing preds[0][k] would copy the
        # first test row's prediction onto every row sharing this date.
        preds = np.asarray(model.predict(test[predictors])).reshape(len(test), -1)
        pred_frame = pd.DataFrame(preds, index=test.index)

        allpreds.append(pd.concat([test[targets], pred_frame], axis=1))

    if not allpreds:
        raise ValueError(
            f"backtest produced no predictions: start={start} leaves no dates "
            f"among {len(games)} unique GAME_DATE values"
        )

    return pd.concat(allpreds)

In [186]:
# Fit inside this cell so it also works on a fresh kernel: in notebook order nothing has
# fitted `mor` yet (the backtest that refits it sits in a later cell).
# The training window mirrors the last backtest step — rows dated before the game being
# predicted — assuming the last row of df holds the latest game (TODO confirm ordering).
latest_game = df.iloc[[-1]]
earlier_games = df[df["GAME_DATE"] < latest_game["GAME_DATE"].iloc[0]]
mor.fit(
    earlier_games[predictors],
    earlier_games[["next_pts", "next_asts", "next_rbs", "next_fg3"]],
)
mor.predict(latest_game[predictors])

array([[18.04318222, 11.16414301,  7.4575747 ,  2.1698053 ]])

In [177]:
# Walk-forward backtest of the multi-output SVR on the selected predictors,
# using backtest()'s defaults (start=4, step=1).
predictions = backtest(df, mor, predictors)



In [178]:
# Actual target columns followed by the predictions, which backtest() leaves under
# the integer column labels 0-3 (same order as the targets).
predictions

Unnamed: 0,next_pts,next_asts,next_rbs,next_fg3,0,1,2,3
4,11.0,4.0,4.0,0.0,24.040584,10.286929,8.60546,2.332827
5,15.0,11.0,6.0,0.0,17.962843,9.916615,8.905287,0.895354
6,23.0,17.0,7.0,2.0,17.817703,9.278068,7.710262,0.919713
7,24.0,10.0,5.0,3.0,18.125444,10.817596,7.00669,1.495855
8,21.0,7.0,4.0,4.0,19.773231,9.65454,6.580749,1.688926
9,28.0,12.0,4.0,3.0,21.561729,8.48855,7.195276,1.95068
10,19.0,16.0,9.0,3.0,22.079511,10.56624,7.023126,2.14959
11,21.0,15.0,7.0,4.0,20.485624,11.235865,6.963821,3.065915
12,27.0,9.0,5.0,4.0,21.060052,11.332331,6.26043,3.63125
13,14.0,8.0,7.0,1.0,21.672283,10.897406,6.627122,2.243705


In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error for the points target. backtest() names its output columns after
# the targets (actuals) and 0-3 (predictions); it produces no "actual"/"prediction"
# columns, so the pair is selected explicitly: "next_pts" vs. column 0.
mean_squared_error(predictions["next_pts"], predictions[0])

43.403120260669894

In [None]:
# Distribution of the points target, for putting the error figures in context.
df["next_pts"].describe()

count    57.000000
mean     20.719298
std       6.723505
min       5.000000
25%      16.000000
50%      20.000000
75%      26.000000
max      38.000000
Name: next_pts, dtype: float64

In [None]:
def player_history(df):
    """Add running-history features to a frame of games.

    The frame is sorted by ``"GAME_DATE"`` (the caller's frame is not modified) and
    three columns are added:

    * ``game_number`` – 0-based position of the row after sorting.
    * ``pts_corr``    – expanding correlation between ``game_number`` and ``PTS`` up
      to and including each row; undefined values (e.g. the first row) become 0.
    * ``pts_diff``    – ratio ``PTS / previous PTS``. A ``+inf`` ratio (previous value
      was 0) becomes 1; remaining missing values (first row, 0/0) are filled with the
      mean of the finite ratios.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"GAME_DATE"`` and ``"PTS"``.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the three extra columns.
    """
    df = df.sort_values("GAME_DATE")
    df["game_number"] = range(df.shape[0])

    # Series-level expanding correlation stays aligned with df's index.
    df["pts_corr"] = df["game_number"].expanding().corr(df["PTS"]).fillna(0)

    ratio = df["PTS"] / df["PTS"].shift(1)
    # Neutralise +inf BEFORE taking the mean, otherwise a single inf makes the mean inf.
    # Plain reassignment replaces the chained `df[col][mask] = ...` and
    # `df[col].fillna(..., inplace=True)` forms, which act on a temporary and are
    # not guaranteed to reach df.
    ratio = ratio.replace(np.inf, 1)
    df["pts_diff"] = ratio.fillna(ratio.mean())

    return df


In [None]:
# Compute the history features separately for each SEASON_YEAR group;
# group_keys=False keeps the group label out of the resulting index.
df = df.groupby("SEASON_YEAR", group_keys=False).apply(player_history)

In [None]:
# pd.Series(svrmod.coef_, index=predictors).sort_values()

In [None]:
# Signed error for the points target: actual "next_pts" minus the prediction.
# backtest() stores predictions under integer column labels (0 is the points
# prediction); it produces no "actual"/"prediction" columns.
diff = predictions["next_pts"] - predictions[0]
diff

4    -13.040584
5     -2.962843
6      5.182297
7      5.874556
8      1.226769
9      6.438271
10    -3.079511
11     0.514376
12     5.939948
13    -7.672283
14    -4.817293
15     1.016507
16     7.496994
17     3.835590
18    -2.544202
19     5.866015
20     4.459769
21    -4.813318
22    -2.074645
23    -4.443277
24     3.888330
25     8.621597
26     2.941438
27   -14.954960
28    -4.591891
29     2.552398
30    -4.036982
31    -3.345244
32     6.916526
33    -5.428503
34   -10.062143
35     6.146851
36    -0.700177
37     8.793128
38     7.703242
39    -2.632472
40    10.472123
41     0.055520
42    -0.768592
43     1.977996
44     7.517355
45    14.550048
46    -8.491760
47    -3.245148
48    -2.350879
49     7.427594
50   -10.105471
51   -13.849010
52    -3.350515
53     3.102199
54    -9.432846
55    -0.199954
dtype: float64