# Prediction: Various Methods

> **Warning!** Please run `01_cleaning.ipynb` first if you haven't already.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from functions.constants import BM_NAME, STARTDATE, ENDDATE, N_THRESHOLD_BPS, DATA_DIR, EVAL_START_DATE, TEST_START_DATE  # noqa: F401

from functions.helper_fns import featurize_time_series,load_active_returns,evaluate_model_performance,get_X_and_y_values  # noqa: F401
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier


In [None]:
# Load the active-returns data through the project helper; the result is
# passed to get_X_and_y_values when the data splits are built.
active_returns = load_active_returns()
# Trailing expression so the notebook shows the result of .head()
# (presumably the first rows of a DataFrame -- confirm in helper_fns).
active_returns.head()

In [None]:
# Settings forwarded to get_X_and_y_values and, later, to the evaluation
# calls as the prediction period and the number of features.
chosen_period = "1w"
chosen_feature_count = 12

# The helper returns twelve objects in this fixed order: X/y pairs for the
# train, eval, train+eval and test splits, followed by the matching df_*
# objects.
(
    X_train, y_train,
    X_eval, y_eval,
    X_train_and_eval, y_train_and_eval,
    X_test, y_test,
    df_train, df_eval, df_train_and_eval, df_test,
) = get_X_and_y_values(active_returns, chosen_period, chosen_feature_count)

# Print the shape of every split, one line each, in the order listed.
_named_splits = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_eval", X_eval),
    ("y_eval", y_eval),
    ("X_train_and_eval", X_train_and_eval),
    ("y_train_and_eval", y_train_and_eval),
    ("X_test", X_test),
    ("y_test", y_test),
    ("df_train", df_train),
    ("df_eval", df_eval),
    ("df_train_and_eval", df_train_and_eval),
    ("df_test", df_test),
)
for _split_name, _split in _named_splits:
    print(f"{_split_name} shape: ", _split.shape)

## Training a classifier

In [None]:
def model_generator(model_type, model_types_to_stack_if_stacking=None):
    """Build a new, unfitted classifier for the given model type name.

    Args:
        model_type: One of "LogisticRegression", "KNN", "RandomForest",
            "SVC", "XGBoost" or "Stacking".
        model_types_to_stack_if_stacking: Names of the base models to combine
            when ``model_type`` is "Stacking"; ignored for every other type.
            ``None`` (the default) selects LogisticRegression, RandomForest
            and XGBoost.

    Returns:
        The freshly constructed estimator (not yet fitted).

    Raises:
        ValueError: If ``model_type`` (or any name in the stacking list) is
            not one of the supported names.
    """
    # A list literal as a default would be created once and shared by every
    # call, so the default is resolved here instead.
    if model_types_to_stack_if_stacking is None:
        model_types_to_stack_if_stacking = ["LogisticRegression", "RandomForest", "XGBoost"]

    if model_type == "LogisticRegression":
        model = LogisticRegression(class_weight="balanced")
    elif model_type == "KNN":
        model = KNeighborsClassifier(n_neighbors=5)
    elif model_type == "RandomForest":
        model = RandomForestClassifier(class_weight="balanced", n_estimators=100, max_depth=5)
    elif model_type == "SVC":
        # probability=True is required so that predict_proba is available.
        model = SVC(probability=True, max_iter=5000, class_weight="balanced", kernel="rbf")
    elif model_type == "XGBoost":
        # NOTE(review): use_label_encoder is deprecated in recent xgboost
        # releases -- confirm against the pinned version before removing it.
        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    elif model_type == "Stacking":
        estimators_models = []
        print("!!!Stacking the following models!!!")
        for base_name in model_types_to_stack_if_stacking:
            # Each base model is built by this same factory with its defaults.
            estimators_models.append((base_name, model_generator(base_name)))
            print(base_name)
        # The base models' predict_proba outputs feed a balanced logistic
        # regression that produces the final prediction.
        final_estimator = LogisticRegression(class_weight="balanced")
        model = StackingClassifier(
            estimators=estimators_models,
            stack_method='predict_proba',
            n_jobs=-1,
            final_estimator=final_estimator,
        )
    else:
        raise ValueError(f"Model Type {model_type} is not defined!")
    return model

In [None]:
def train_and_evaluate_model(MODEL_TYPE, NUM_FEATURES, PREDICTION_PERIOD):
    """Fit a fresh model on the training split and report its performance.

    The model is built with ``model_generator`` and fitted on the
    module-level ``X_train`` / ``y_train``. Performance is reported through
    ``evaluate_model_performance`` for the training split (confusion-matrix
    plot disabled) and then for the evaluation split.

    Args:
        MODEL_TYPE: Model name understood by ``model_generator``.
        NUM_FEATURES: Forwarded to ``evaluate_model_performance``.
        PREDICTION_PERIOD: Forwarded to ``evaluate_model_performance``.

    Returns:
        The fitted model.
    """
    classifier = model_generator(MODEL_TYPE)
    classifier.fit(X_train, y_train)

    # Hard predictions and class probabilities for both splits.
    eval_labels = classifier.predict(X_eval)
    eval_probabilities = classifier.predict_proba(X_eval)
    train_labels = classifier.predict(X_train)
    train_probabilities = classifier.predict_proba(X_train)

    print("===TRAINING SET===")
    evaluate_model_performance(
        y_train,
        train_labels,
        train_probabilities,
        PREDICTION_PERIOD,
        NUM_FEATURES,
        plot_confusion_matrix=False,
    )

    print("===EVALUATION SET===")
    evaluate_model_performance(
        y_eval,
        eval_labels,
        eval_probabilities,
        PREDICTION_PERIOD,
        NUM_FEATURES,
    )

    return classifier

def write_eval_predictions_to_csv(model,model_name):
    """Write the evaluation-set predictions of a fitted model to a CSV file.

    A copy of the module-level ``df_eval`` gets two extra columns: the
    model's predictions for ``X_eval`` and column 1 of its ``predict_proba``
    output. The result is saved as
    ``<DATA_DIR>/<BM_NAME>_eval_predictions_<model_name>.csv``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        model_name: Label used in the output file name.
    """
    import os  # local import keeps this cell self-contained

    predicted_col = f"outperform_{chosen_period}_predicted"
    probability_col = f"outperform_{chosen_period}_probability"

    df_eval_with_predictions = df_eval.copy()
    df_eval_with_predictions[predicted_col] = model.predict(X_eval)
    df_eval_with_predictions[probability_col] = model.predict_proba(X_eval)[:, 1]

    # os.path.join inserts the separator only when DATA_DIR lacks one; plain
    # string concatenation produced a mangled file name in that case.
    output_path = os.path.join(DATA_DIR, f"{BM_NAME}_eval_predictions_{model_name}.csv")
    # The index is written as well, as before.
    df_eval_with_predictions.to_csv(output_path)

In [None]:
# Logistic regression: fit, report train/eval performance, export the
# evaluation-set predictions.
logreg_model = train_and_evaluate_model(
    MODEL_TYPE="LogisticRegression",
    NUM_FEATURES=chosen_feature_count,
    PREDICTION_PERIOD=chosen_period,
)
write_eval_predictions_to_csv(model=logreg_model, model_name="LogisticRegression")
logreg_model  # trailing expression: shown as the cell output

In [None]:
# K-nearest neighbours: fit, report train/eval performance, export the
# evaluation-set predictions.
knn_model = train_and_evaluate_model(
    MODEL_TYPE="KNN",
    NUM_FEATURES=chosen_feature_count,
    PREDICTION_PERIOD=chosen_period,
)
write_eval_predictions_to_csv(model=knn_model, model_name="KNN")
knn_model  # trailing expression: shown as the cell output

In [None]:
# Random forest: fit, report train/eval performance, export the
# evaluation-set predictions.
randomforest_model = train_and_evaluate_model(
    MODEL_TYPE="RandomForest",
    NUM_FEATURES=chosen_feature_count,
    PREDICTION_PERIOD=chosen_period,
)
write_eval_predictions_to_csv(model=randomforest_model, model_name="RandomForest")
randomforest_model  # trailing expression: shown as the cell output

In [None]:
# XGBoost: fit, report train/eval performance, export the evaluation-set
# predictions.
xgboost_model = train_and_evaluate_model(
    MODEL_TYPE="XGBoost",
    NUM_FEATURES=chosen_feature_count,
    PREDICTION_PERIOD=chosen_period,
)
write_eval_predictions_to_csv(model=xgboost_model, model_name="XGBoost")
xgboost_model  # trailing expression: shown as the cell output

In [None]:
# Support vector classifier: fit, report train/eval performance, export the
# evaluation-set predictions.
svc_model = train_and_evaluate_model(
    MODEL_TYPE="SVC",
    NUM_FEATURES=chosen_feature_count,
    PREDICTION_PERIOD=chosen_period,
)
write_eval_predictions_to_csv(model=svc_model, model_name="SVC")
svc_model  # trailing expression: shown as the cell output

In [None]:
# Stacked ensemble: fit, report train/eval performance, export the
# evaluation-set predictions.
# Uses the notebook-wide settings instead of hard-coded 12 / "1w" so the
# reported feature count and period always match the data the model was
# fitted on, as in the other model cells.
stacking_model = train_and_evaluate_model("Stacking", chosen_feature_count, chosen_period)
write_eval_predictions_to_csv(stacking_model,"Stacking")
stacking_model  # trailing expression: shown as the cell output

## Choosing the best model based on evaluation dataset performance
### As well as exporting its predictions and probabilities

In [None]:
# Name of the model selected from the evaluation-set results; it is also
# used in the name of the exported predictions file.
optimum_model_name = "Stacking"
# Fresh, unfitted instance of the selected model.
optimum_model = model_generator(model_type=optimum_model_name)

In [None]:
# Refit the selected model on the combined train + eval data.
optimum_model.fit(X_train_and_eval, y_train_and_eval)

# Performance on the data the model was fitted on (no confusion-matrix plot).
print("===TRAINING AND EVAL SET===")
y_train_and_eval_pred = optimum_model.predict(X_train_and_eval)
y_train_and_eval_pred_proba = optimum_model.predict_proba(X_train_and_eval)
evaluate_model_performance(
    y_train_and_eval,
    y_train_and_eval_pred,
    y_train_and_eval_pred_proba,
    chosen_period,
    chosen_feature_count,
    plot_confusion_matrix=False,
)

# Performance on the held-out test set.
y_pred_test = optimum_model.predict(X_test)
y_pred_proba_test = optimum_model.predict_proba(X_test)
print("===TEST SET===")
evaluate_model_performance(
    y_test,
    y_pred_test,
    y_pred_proba_test,
    chosen_period,
    chosen_feature_count,
)

In [None]:
def _with_prediction_columns(frame, predicted, probabilities, period):
    """Return a copy of ``frame`` reduced to identifiers, actuals and predictions.

    Args:
        frame: Frame holding ``Date``, ``Ticker`` and ``ar_<period>_t`` columns.
        predicted: Model predictions, one per row of ``frame``.
        probabilities: ``predict_proba`` output; column 1 is kept.
        period: Prediction period used in the column names (e.g. "1w").

    Returns:
        A new frame with the columns ``Date``, ``Ticker``,
        ``outperform_<period>_actual``, ``outperform_<period>_predicted`` and
        ``outperform_<period>_probability``.
    """
    actual_col = f"outperform_{period}_actual"
    predicted_col = f"outperform_{period}_predicted"
    probability_col = f"outperform_{period}_probability"

    result = frame.copy()
    result[predicted_col] = predicted
    result[probability_col] = probabilities[:, 1]
    # NOTE(review): ar_<period>_t is relabelled as the "actual" outcome --
    # confirm it holds the outperform label rather than a raw return.
    result = result.rename(columns={f"ar_{period}_t": actual_col})
    return result[["Date", "Ticker", actual_col, predicted_col, probability_col]]


# Join each data set with its predictions and probability scores.
df_test_with_predictions = _with_prediction_columns(
    df_test, y_pred_test, y_pred_proba_test, chosen_period
)
df_train_and_eval_with_predictions = _with_prediction_columns(
    df_train_and_eval, y_train_and_eval_pred, y_train_and_eval_pred_proba, chosen_period
)

# Train+eval rows first, test rows after.
df_with_predictions = pd.concat([df_train_and_eval_with_predictions, df_test_with_predictions])
df_with_predictions

### Some Brief Data Exploration

In [None]:
# Histogram with a KDE overlay of the predicted probabilities.
_probabilities = df_with_predictions[f'outperform_{chosen_period}_probability']
sns.histplot(_probabilities, kde=True)
plt.title(f'Distribution of Probabilities for Outperforming {chosen_period}')

# Annotate the plot with the mean and standard deviation, placed at
# (0.75, 0.75) in axes coordinates.
mean_val = _probabilities.mean()
stdev_val = _probabilities.std()
plt.text(
    .75,
    .75,
    f"Mean: {mean_val:.2f}\nStdev: {stdev_val:.2f}",
    horizontalalignment='center',
    verticalalignment='center',
    transform=plt.gca().transAxes,
)

In [None]:
# Save the combined predictions under DATA_DIR, without the index column.
_output_file_name = f"{BM_NAME}_{chosen_period}_outperformance_predictions_{optimum_model_name}.csv"
save_to_path = f"{DATA_DIR}/{_output_file_name}"
df_with_predictions.to_csv(save_to_path, index=False)
print(f"Saved to {save_to_path}")