# Compile all predictions for deployment

In this notebook we generate predicted classes and probabilities for the test seasons 2017-2023 and save them as a `csv` file for deployment on the `streamlit` site.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

from HelperFunctions import *

## Loading the full dataset

In [2]:
# Read the raw table and build the processed copy that the model cells work on.
# ImputeAndScale arrives through the HelperFunctions star import; judging by its
# name it imputes missing values and rescales columns -- TODO confirm in HelperFunctions.
# NOTE(review): the helper is handed the whole table, test seasons included;
# confirm this is intended.
df = pd.read_csv("full_data.csv")
df_scaled = ImputeAndScale(df.copy())

# Model inputs: every numeric column except the player id, the season used for
# the train/test split, and the label being predicted.
features = (
    df_scaled
    .select_dtypes(include='number')
    .columns
    .drop(['PLAYER_ID', 'SEASON_START', 'IN_LEAGUE_NEXT'])
)

## Training the model

For each test season, train the model on all earlier seasons and grab its predictions; save them in `df_scaled` and write the result out as a `csv`.

In [3]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [4]:
#train model on seasons < year and grab predictions on season = year
# Walk-forward scheme: every test season is scored by a model fitted only on
# strictly earlier seasons. Results are written back into df_scaled as the
# columns "PRED" (class label) and "PROB" (calibrated probability).
for curr_season in range(2017, 2024):
    df_train = df_scaled.loc[df_scaled.SEASON_START < curr_season]

    #split training data into a train set and a calibration set
    # (stratified on the label so both parts keep the same class balance)
    df_tt, df_cal = train_test_split(df_train, test_size=0.2, shuffle=True,
                                     random_state=815, stratify=df_train.IN_LEAGUE_NEXT)

    # SMOTE sits inside the imblearn Pipeline so the oversampling is part of
    # fitting the model rather than a separate preprocessing step.
    model = Pipeline([('smote', SMOTE(random_state=23)),
                      ('xgb', XGBClassifier(n_estimators=350, learning_rate=0.005, random_state=206))])

    model.fit(df_tt[features], df_tt.IN_LEAGUE_NEXT)

    # cv="prefit": the pipeline fitted above is reused as-is and only the
    # calibrator is fitted, on the held-out calibration split.
    model_cal = CalibratedClassifierCV(model, cv="prefit")
    model_cal.fit(df_cal[features], df_cal.IN_LEAGUE_NEXT)

    # Rows of the season being predicted. Build the mask and the feature frame
    # once and reuse them for both outputs instead of recomputing per call.
    season_mask = df_scaled.SEASON_START == curr_season
    X_season = df_scaled.loc[season_mask, features]

    #save categorical prediction to dataframe
    # NOTE(review): the label comes from the uncalibrated pipeline while the
    # probability below comes from the calibrated wrapper, so PRED is not
    # necessarily PROB thresholded at 0.5 -- confirm this is intended.
    df_scaled.loc[season_mask, "PRED"] = model.predict(X_season)
    #save predicted probability to dataframe
    # column 1 of predict_proba; presumably the IN_LEAGUE_NEXT == 1 class -- verify label coding
    df_scaled.loc[season_mask, "PROB"] = model_cal.predict_proba(X_season)[:, 1]

In [5]:
# Export only the seasons from 2017 onward, without the index column.
df_scaled[df_scaled["SEASON_START"] >= 2017].to_csv("model_predictions.csv", index=False)