# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# Load the game log, order the rows by date with a fresh 0..n-1 index, and
# drop three columns that the rest of the script never reads
# (presumably duplicated / leftover columns in the CSV -- verify against the file).
df = (
    pd.read_csv("nba_games.csv", index_col=0)
    .sort_values("date")
    .reset_index(drop=True)
    .drop(columns=["mp.1", "mp_opp.1", "index_opp"])
)

# Function to add target column
def add_target(team):
    """Return a copy of ``team`` with a ``target`` column taken from the next row.

    ``target`` on row *i* is the value of ``won`` on row *i + 1``.

    Args:
        team: DataFrame with a ``won`` column. Rows are assumed to be a single
            team's games in chronological order (the visible caller sorts by
            date and groups by team -- confirm before reusing elsewhere).

    Returns:
        A new DataFrame with the same rows and index as ``team`` plus the
        ``target`` column. The last row's target is missing because no row
        follows it.
    """
    # assign() builds a new frame rather than writing into the object that was
    # passed in: pandas documents that functions handed to groupby.apply must
    # not mutate the group they receive.
    return team.assign(target=team["won"].shift(-1))

# Build the prediction target: for every row, the result ("won") of that team's
# next game. A per-team column shift yields the same values as
# groupby("team").apply(add_target) while keeping the original row order, and it
# does not rely on apply() handing the grouping column to the helper, which
# pandas has deprecated.
df["target"] = df.groupby("team")["won"].shift(-1)

# A team's final game has no following game; mark those rows with the sentinel 2.
# .loc writes into df itself. The chained form df["target"][mask] = 2 assigns to
# a temporary under pandas Copy-on-Write and would leave df unchanged.
df.loc[df["target"].isna(), "target"] = 2

# Every remaining value is True/False/2, so the cast must succeed; it is done
# without errors="ignore" so that a failure is raised instead of silently
# leaving a non-integer target column behind.
df["target"] = df["target"].astype(int)

# Keep only the columns that contain no missing values at all
nulls = df.isna().sum()
valid_columns = df.columns[~df.columns.isin(nulls[nulls > 0].index)]
df = df[valid_columns].copy()

# Feature selection and preparation
# Identifier columns, the game outcome and the target itself are kept out of
# the predictor pool.
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

# Step 1: restrict the pool to float/int columns holding at least one value
numeric_candidates = df[selected_columns].select_dtypes(include=[float, int]).columns
has_values = df[numeric_candidates].notna().any()
valid_numeric_columns = numeric_candidates[has_values]

# Step 2: replace any missing entries with the column mean
# NOTE(review): the imputer and the scaler below are fitted on every row of df,
# i.e. on all seasons at once, including those later used as backtest test sets.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df[valid_numeric_columns])
df[valid_numeric_columns] = pd.DataFrame(
    imputed_values,
    columns=valid_numeric_columns,
    index=df.index,
)

# Step 3: rescale every numeric column with min-max scaling
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[valid_numeric_columns])
df[valid_numeric_columns] = scaled_values

# The two models that are compared in the backtests
rr = RidgeClassifier(alpha=1)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Forward sequential selection of 30 features, scored with the ridge model on
# 3 time-ordered cross-validation splits
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(
    rr,
    n_features_to_select=30,
    direction="forward",
    cv=split,
)

# Run the selection and keep the names of the chosen columns
feature_matrix = df[valid_numeric_columns]
sfs.fit(feature_matrix, df["target"])
predictors = list(valid_numeric_columns[sfs.get_support()])

# Backtest function
def backtest(data, model, predictors, start=2, step=1):
    """Walk forward through the seasons, predicting each one from the seasons before it.

    For every selected season the model is re-fitted on all rows whose
    ``season`` is strictly smaller and then used to predict that season's rows.

    Args:
        data: DataFrame with a ``season`` column, a ``target`` column and all
            columns named in ``predictors``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            re-fitted in place on every iteration.
        predictors: Column names used as model input.
        start: Position (in the sorted list of unique seasons) of the first
            season to predict; earlier seasons are only ever used for training.
        step: Stride between predicted seasons.

    Returns:
        DataFrame indexed like the predicted rows of ``data`` with the columns
        ``actual`` (the ``target`` value) and ``prediction``. When no season
        reaches position ``start`` the frame is empty but still has both
        columns, instead of ``pd.concat`` failing on an empty list.
    """
    seasons = sorted(data["season"].unique())
    all_predictions = []

    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["target"])
        # Re-attach the test index so predictions line up with the actual values.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)

        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        all_predictions.append(combined)

    if not all_predictions:
        # Too few seasons to form a single train/test pair.
        return pd.DataFrame(columns=["actual", "prediction"])
    return pd.concat(all_predictions)

# Backtest using RidgeClassifier; rows whose actual value is the sentinel 2
# are left out of the score
predictions_rr = backtest(df, rr, predictors)
scored_rr = predictions_rr["actual"] != 2
predictions_rr = predictions_rr[scored_rr]
accuracy_rr = accuracy_score(predictions_rr["actual"], predictions_rr["prediction"])
print(f"RidgeClassifier accuracy: {accuracy_rr:.2f}")

# Backtest using RandomForestClassifier, scored the same way
predictions_rf = backtest(df, rf, predictors)
scored_rf = predictions_rf["actual"] != 2
predictions_rf = predictions_rf[scored_rf]
accuracy_rf = accuracy_score(predictions_rf["actual"], predictions_rf["prediction"])
print(f"RandomForestClassifier accuracy: {accuracy_rf:.2f}")

# Input for the rolling averages: the numeric features plus the result and the
# two grouping keys
rolling_input_columns = list(valid_numeric_columns) + ["won", "team", "season"]
df_rolling = df[rolling_input_columns]

# Function to find team averages
def find_team_averages(team, window=10):
    """Return rolling means of the float/int columns of ``team``.

    Args:
        team: DataFrame of rows to average over, in the order they should be
            rolled. Columns that are not float or int are ignored.
        window: Number of consecutive rows in each average. Defaults to 10,
            the value this function previously had hard-coded.

    Returns:
        DataFrame with the same index as ``team`` and one column per float/int
        input column. Each window ends at (and includes) the current row; the
        first ``window - 1`` rows are NaN because their window is incomplete.
    """
    numeric = team.select_dtypes(include=[float, int])
    return numeric.rolling(window).mean()

# Calculate rolling averages separately for every (team, season) pair and tag
# the resulting columns with a "_10" suffix
season_groups = df_rolling.groupby(["team", "season"], group_keys=False)
df_rolling = season_groups.apply(find_team_averages).add_suffix("_10")
rolling_cols = list(df_rolling.columns)

# Put the rolling columns next to the original data and discard every row that
# has a missing value (this includes rows whose rolling window is incomplete)
df = pd.concat([df, df_rolling], axis=1).dropna()

# Add shifting columns and predictions as before...
def shift_col(team, col_name):
    """Return ``team[col_name]`` moved up by one row.

    Each position holds the value of the following row; the last position is
    missing because nothing follows it. The index is unchanged.
    """
    return team[col_name].shift(-1)

def add_col(df, col_name):
    """Return ``df[col_name]`` shifted up by one row within each ``team`` group.

    Every row receives the value from the next row that belongs to the same
    team; a team's last row is missing. The result is indexed like ``df``.
    """
    per_team = df.groupby("team", group_keys=False)
    next_values = per_team[col_name].shift(-1)
    return next_values

# For each row, record the home flag, opponent and date of that team's next game
for next_col, source_col in (
    ("home_next", "home"),
    ("team_opp_next", "team_opp"),
    ("date_next", "date"),
):
    df[next_col] = add_col(df, source_col)

# Join every row with the rolling columns of the row whose upcoming opponent
# ("team_opp_next") is this row's team on the same "date_next". Column names
# present on both sides receive pandas' default _x / _y suffixes.
opponent_view = df[rolling_cols + ["team_opp_next", "date_next", "team"]]
full = df.merge(
    opponent_view,
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)

# Object-dtype columns cannot be fed to the models; exclude them together with
# the columns that were already excluded before
object_columns = list(full.columns[full.dtypes == "object"])
removed_columns = object_columns + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

# Repeat feature selection for full dataset
sfs.fit(full[selected_columns], full["target"])
predictors = list(selected_columns[sfs.get_support()])

# Backtest again with the full dataset: ridge first, then random forest
predictions_full_rr = backtest(full, rr, predictors)
accuracy_full_rr = accuracy_score(
    predictions_full_rr["actual"],
    predictions_full_rr["prediction"],
)
print(f"Full dataset RidgeClassifier accuracy: {accuracy_full_rr:.2f}")

predictions_full_rf = backtest(full, rf, predictors)
accuracy_full_rf = accuracy_score(
    predictions_full_rf["actual"],
    predictions_full_rf["prediction"],
)
print(f"Full dataset RandomForestClassifier accuracy: {accuracy_full_rf:.2f}")