In [2]:
import pandas as pd

In [3]:
# Load the per-game box-score table; the CSV's first column is used as the row index.
df = pd.read_csv("nba_games.csv", index_col=0)

In [4]:
# Order rows by the "date" column so that later per-team shift(-1) calls refer to the next game.
# NOTE(review): read_csv was called without parse_dates, so this is presumably a string sort;
# it is chronological only if dates are stored in a sortable format such as ISO — TODO confirm.
df = df.sort_values("date")

In [5]:
# Discard the old index and renumber rows 0..n-1 in the sorted order.
df = df.reset_index(drop=True)

In [6]:
# Remove three columns that are not used further: "mp.1", "mp_opp.1" and "index_opp".
df = df.drop(columns=["mp.1", "mp_opp.1", "index_opp"])

In [7]:
def add_target(group):
    """Return a copy of ``group`` with a ``target`` column added.

    ``target`` is the ``won`` value of the following row (``shift(-1)``), so
    the last row of the group gets NaN because there is no following row.

    The input frame is left untouched: functions passed to ``groupby.apply``
    should not mutate the object they receive, so a new frame is built with
    ``assign`` instead of writing a column into ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a ``won`` column.

    Returns
    -------
    pandas.DataFrame
        ``group`` plus the ``target`` column.
    """
    return group.assign(target=group["won"].shift(-1))

# Build the target separately for every team so the shift never crosses from one team to another.
# group_keys=False keeps the team label out of the resulting index.
df = df.groupby("team", group_keys=False).apply(add_target)

In [8]:
# The last game of each team has no following game, so shift(-1) left NaN in "target".
# Mark those rows with the sentinel class 2.
# Assign through a single .loc call: the chained form df["target"][mask] = 2 writes to an
# intermediate object, raises SettingWithCopyWarning, and has no effect under copy-on-write.
df.loc[pd.isnull(df["target"]), "target"] = 2
# Convert the bool/sentinel mix to integers (True -> 1, False -> 0, sentinel stays 2).
df["target"] = df["target"].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [9]:
# Sanity check: count of wins versus losses across all rows.
df["won"].value_counts()

False    8886
True     8886
Name: won, dtype: int64

In [10]:
# Distribution of the target classes, including the sentinel value 2 used for rows with no next game.
df["target"].value_counts()

1    8872
0    8870
2      30
Name: target, dtype: int64

In [11]:
# Number of missing values in each column.
nulls = pd.isnull(df).sum()

In [12]:
# Keep only the columns that contain at least one missing value.
nulls = nulls[nulls > 0]

In [13]:
# Columns that have no missing values at all (everything not listed in `nulls`).
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [14]:
# Display the labels of the retained columns.
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=142)

In [15]:
# Restrict the frame to null-free columns; .copy() makes it an independent frame
# so later column assignments do not write into a slice of the old one.
df = df[valid_columns].copy()

In [16]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Ridge-regularised linear classifier used both for feature selection and for the backtest.
rr = RidgeClassifier(alpha=1)

# Time-ordered cross-validation with 3 splits, used to score candidate feature sets.
split = TimeSeriesSplit(n_splits=3)

# Forward sequential selection: starts empty and adds features one at a time until 30 are chosen.
# n_jobs=1 runs the search in a single process.
sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

In [17]:
# Columns that must not be used as predictors: identifiers, the date/season keys,
# the current outcome ("won") and the label itself ("target").
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
# Every remaining column is a candidate feature.
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every candidate feature with MinMaxScaler (default settings) and write the result back in place.
# NOTE(review): the scaler is fit on all rows, including seasons that backtest() later
# holds out as test data, so min/max statistics from the test seasons leak into training.
# Consider fitting the scaler on the training portion only — confirm whether this matters here.
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [19]:
# Run forward feature selection on the scaled per-game features.
# NOTE(review): rows carrying the sentinel target 2 (no next game) are still in df,
# so the classifier is fit on three classes — confirm this is intended.
sfs.fit(df[selected_columns], df["target"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [20]:
# get_support() is a boolean mask over selected_columns; keep the names of the chosen features.
predictors = list(selected_columns[sfs.get_support()])

In [21]:
def backtest(data, model, predictors, start=1, step=1):
    """Evaluate ``model`` season by season, always training on earlier seasons.

    The distinct values of ``data["season"]`` are sorted; for each season at
    positions ``start, start + step, ...`` the model is refit on every row
    from a strictly earlier season and used to predict the rows of that season.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``season``, ``target`` and all ``predictors`` columns.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit on
        every iteration.
    predictors : list
        Column names used as model inputs.
    start : int, default 1
        Position (in the sorted season list) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns ``actual`` and ``prediction``, indexed like the predicted rows,
        with the per-season results stacked in season order.
    """
    ordered_seasons = sorted(data["season"].unique())
    folds = []

    for position in range(start, len(ordered_seasons), step):
        current = ordered_seasons[position]
        history = data[data["season"] < current]
        holdout = data[data["season"] == current]

        model.fit(history[predictors], history["target"])

        # Re-index the raw predictions so they line up with the held-out rows.
        predicted = pd.Series(model.predict(holdout[predictors]), index=holdout.index)
        folds.append(
            pd.DataFrame({"actual": holdout["target"], "prediction": predicted})
        )

    return pd.concat(folds)

In [22]:
# Season-by-season evaluation of the ridge classifier on the selected per-game features.
predictions = backtest(df, rr, predictors)

In [25]:
from sklearn.metrics import accuracy_score

# Share of backtest rows where the prediction equals the actual target.
accuracy_score(predictions["actual"],predictions["prediction"])

0.5396961690885073

In [28]:
# Reference baseline: within each value of "home", the fraction of rows with won == 1.
df.groupby("home").apply(lambda x: x[x["won"] ==1].shape[0] / x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [29]:
# Input for the rolling averages: all feature columns plus the outcome ("won")
# and the two grouping keys ("team", "season").
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

In [30]:
def find_team_averages(team):
    """Return the 10-row rolling mean of the numeric and boolean columns of ``team``.

    Non-numeric columns (for example a string team label) cannot be averaged:
    older pandas versions dropped them with a warning and newer versions raise,
    so they are excluded explicitly before the rolling window is applied.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows of one group, in the order the window should run over.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team``; only numeric/bool columns, each replaced by its
        mean over the current and previous 9 rows. The first 9 rows are NaN
        because the window is not yet full.
    """
    numeric = team.select_dtypes(include=["number", "bool"])
    return numeric.rolling(10).mean()

# Compute the rolling averages separately for each (team, season) pair so a window
# never mixes rows from different teams or different seasons.
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

  rolling = team.rolling(10).mean()


In [31]:
# Give every rolling-average column a "_10" suffix so it cannot collide with the per-game column.
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

# Place the rolling averages next to the per-game columns (concat along columns aligns on the index).
df = pd.concat([df, df_rolling], axis=1)

In [32]:
# Remove every row with a missing value; this includes rows whose 10-game rolling window was not yet full.
df = df.dropna()

In [33]:
def shift_col(team, col_name):
    """Return column ``col_name`` of ``team`` moved up by one row.

    Each row receives the value of the row that follows it; the last row
    becomes NaN because nothing follows it.
    """
    return team[col_name].shift(-1)

def add_col(df, col_name):
    """Return, for every row, the value ``col_name`` takes in the same team's next row.

    The shift is performed inside each ``team`` group, so the last row of a
    team gets NaN instead of a value from another team.

    The grouped ``shift`` is used directly rather than ``groupby.apply`` with a
    function returning a Series: that form returns a one-row DataFrame when the
    frame contains only a single team, whereas this always yields a Series
    aligned with ``df``'s index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``team`` and ``col_name`` columns.
    col_name : str
        Column to look ahead in.

    Returns
    -------
    pandas.Series
        Indexed like ``df`` and named ``col_name``.
    """
    return df.groupby("team", group_keys=False)[col_name].shift(-1)

# Information about each team's next game: whether it is at home, who the opponent is, and its date.
# The last row of each team gets NaN in these columns because there is no following row.
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [34]:
# Rebind df to a fresh copy of itself.
# NOTE(review): presumably to consolidate the frame after adding several columns — confirm the intent.
df = df.copy()

In [36]:
# Join each row with the opponent's rolling averages going into the same upcoming game:
# a left row (team, date_next) matches a right row whose next opponent is that team on that date.
# merge() defaults to an inner join, so rows without a matching opponent row are dropped.
# Column names present on both sides but not used as a shared key (the rolling columns, "team",
# "team_opp_next") receive the default "_x" (left) / "_y" (right) suffixes.
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]],
                left_on=["team", "date_next"], 
                right_on=["team_opp_next", "date_next"]
            )

In [37]:
# Also exclude every object-dtype column of the merged frame from the feature candidates.
# NOTE: this prepends to the existing list, so re-running the cell keeps growing removed_columns
# (harmless for the isin() check below, but the cell is not idempotent).
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns

In [38]:
# Candidate features of the merged frame: everything not listed in removed_columns.
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [39]:
# Refit the same selector on the merged frame's candidate features.
sfs.fit(full[selected_columns], full["target"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [40]:
# Names of the features chosen from the merged frame (boolean mask from get_support()).
predictors = list(selected_columns[sfs.get_support()])

In [41]:
# Repeat the season-by-season evaluation, now on the merged frame and its selected features.
predictions = backtest(full, rr, predictors)

In [42]:
# Accuracy of the second backtest, for comparison with the first run.
accuracy_score(predictions["actual"], predictions["prediction"])

0.6264979531075549