In [None]:
import pandas as pd
import numpy as np
import warnings
# Silences every warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# Identifier columns kept in the *_full frames but excluded from the model inputs.
id_columns = ['Ticker', 'Earnings Date']

# Baseline feature set (the later cells label this one "alpha2").
X_train_full = pd.read_csv("../data/train_with_ED_Ticker.csv")
X_test_full = pd.read_csv("../data/test_with_ED_Ticker.csv")
X_train = X_train_full.drop(columns=id_columns)
X_test = X_test_full.drop(columns=id_columns)

# Labels are shared by every feature-set variant below.
y_train = pd.read_csv("../data/train_labels.csv")
y_test = pd.read_csv("../data/test_labels.csv")

# Alternative feature set built with alpha = 0.6.
X_train6, X_test6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/alternative_alpha_datasets/Alpha_0.6/train_alpha0.6.csv",
        "../data/alternative_alpha_datasets/Alpha_0.6/test_alpha0.6.csv",
    )
)

# Alternative feature set built with equal weights (labelled "alpha0" later on).
X_train0, X_test0 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/alternative_alpha_datasets/Equal Weights/train_ew.csv",
        "../data/alternative_alpha_datasets/Equal Weights/test_ew.csv",
    )
)

In [None]:
#Stacking Classifier on SMOTEENN Dataset
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline 
def _smote_enn_steps():
    """Return a fresh pair of resampling steps: SMOTE followed by ENN.

    New sampler instances are created on every call so that no two
    pipelines share the same sampler objects.
    """
    return [
        ('smt', SMOTE(random_state=1, sampling_strategy=1.0)),
        ('enn', EditedNearestNeighbours(n_neighbors=5,
                                        sampling_strategy='majority')),
    ]


# Base learners, each wrapped in its own SMOTE-ENN pipeline.
svc_model_smote = Pipeline(
    steps=_smote_enn_steps() + [('SVC', SVC(C=632, max_iter=1000))]
)
lr_model_smote = Pipeline(
    steps=_smote_enn_steps() + [
        ('LR', LogisticRegression(C=0.1, max_iter=300, penalty='l1',
                                  solver='liblinear')),
    ]
)
xgbc_model_smote = Pipeline(
    steps=_smote_enn_steps() + [
        ('xgbc', XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            verbosity=0,
            tree_method='gpu_hist',  # Note: the model is trained on the GPU
            use_label_encoder=False,
            colsample_bylevel=1,
            colsample_bytree=0.7313746962198359,
            gamma=0.001,
            learning_rate=0.01,
            min_child_weight=0,
            reg_alpha=1,
            reg_lambda=1,
            subsample=0.530755738635484,
            max_depth=8,
            # Was misspelled `n_estimator`, which is not the tree-count
            # parameter, so the intended 200 trees were never applied.
            n_estimators=200,
        )),
    ]
)

# All three base learners here are the SMOTE-ENN pipelines defined above.
estimators = [
    ('lr', lr_model_smote),
    ('svc', svc_model_smote),
    ('xgb', xgbc_model_smote),
]

# NOTE(review): the outer pipeline resamples the training data and each base
# pipeline above contains its own SMOTE-ENN steps as well -- confirm that
# resampling at both levels is intended.
sclf_smote = Pipeline(
    steps=_smote_enn_steps() + [
        ('sclf', StackingClassifier(estimators=estimators,
                                    final_estimator=LogisticRegression())),
    ]
)
sclf_smote.fit(X_train, y_train.values.ravel())

In [None]:
 
# Baseline stacking classifier trained on the original (non-resampled) data.
svc_model = Pipeline(steps=[('SVC', SVC(C=20, max_iter=1000))])
lr_model = Pipeline(steps=[
    ('LR', LogisticRegression(C=10, max_iter=300, penalty='l1', solver='liblinear')),
])

xgbc = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    verbosity=0,
    tree_method='gpu_hist',  # Note: the model is trained on the GPU
    use_label_encoder=False,
    colsample_bylevel=1,
    colsample_bytree=0.7313746962198359,
    gamma=0.001,
    learning_rate=0.01,
    min_child_weight=0,
    reg_alpha=1,
    reg_lambda=1,
    subsample=0.530755738635484,
    max_depth=8,
    # Was misspelled `n_estimator`, which is not the tree-count parameter,
    # so the intended 200 trees were never applied.
    n_estimators=200,
)

# `estimators` is rebound here; the alpha-variant cell further down reuses
# this (non-resampled) list.
estimators = [
    ('lr', lr_model),
    ('svc', svc_model),
    ('xgb', xgbc),  # no SMOTE-ENN here, as xgbc performed better without it
]
sclf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
sclf.fit(X_train, y_train.values.ravel())

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
# Compare the SMOTE-ENN stacking model with the baseline on train and test.
# Predictions are computed once per (model, split) and reused by every metric.
eval_models = {"smote": sclf_smote, "no smote": sclf}
eval_splits = {"train": (X_train, y_train), "test": (X_test, y_test)}

label_preds = {}   # (model label, split) -> predicted class labels
proba_preds = {}   # (model label, split) -> predicted probability of class 1
for model_label, model in eval_models.items():
    for split_name, (features, _) in eval_splits.items():
        label_preds[model_label, split_name] = model.predict(features)
        proba_preds[model_label, split_name] = model.predict_proba(features)[:, 1]

# Each metric is paired with the kind of prediction it consumes.
metric_specs = [
    ("accuracy", accuracy_score, label_preds),
    ("roc auc score", roc_auc_score, proba_preds),
    ("f1 score", f1_score, label_preds),
]

is_first_section = True
for metric_name, metric_fn, preds in metric_specs:
    for split_name, (_, true_labels) in eval_splits.items():
        # Blank line between sections, but not before the first one.
        if not is_first_section:
            print()
        is_first_section = False
        for model_label in eval_models:
            score = metric_fn(true_labels, preds[model_label, split_name])
            # Uniform label; the last line used to omit "test".
            print(f"{model_label} {split_name} {metric_name}: {score}")

# Keep the names the cell previously left behind (test-set predictions);
# the export cell at the end of the notebook reads `y_pred`.
y_pred_smote = label_preds["smote", "test"]
y_pred = label_preds["no smote", "test"]
y_pred_smote_proba = proba_preds["smote", "test"]
y_pred_proba = proba_preds["no smote", "test"]

smote train accuracy: 0.7982120051085568
no smote train accuracy: 0.8403575989782887

smote test accuracy: 0.7295918367346939
no smote test accuracy: 0.7653061224489796

smote train roc auc score: 0.7281342840166369
no smote train roc auc score: 0.9253639334521686

smote test roc auc score: 0.6798058712121212
no smote test roc auc score: 0.7987689393939394

smote train f1 score: 0.8698517298187809
no smote train f1 score: 0.891961970613656

smote test f1 score: 0.8262295081967214
no smote f1 score: 0.8456375838926175


In [None]:
# Stacking classifier on the alpha-variant datasets.
# `estimators` is the list bound by the most recent cell that assigned it
# (the non-resampled base learners when the notebook is run top to bottom).
sclf_alpha0 = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
sclf_alpha2 = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
sclf_alpha6 = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Pass labels as a 1-D array, consistent with the other fits in this notebook,
# instead of a one-column DataFrame.
y_train_1d = y_train.values.ravel()
sclf_alpha0.fit(X_train0, y_train_1d)   # equal-weights features
sclf_alpha2.fit(X_train, y_train_1d)    # baseline features (same inputs as `sclf`)
sclf_alpha6.fit(X_train6, y_train_1d)   # alpha = 0.6 features

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
# Evaluate the three alpha-variant stacking models on their own train/test
# features. Predictions are computed once per (variant, split) and reused.
alpha_variants = {
    "alpha0": (sclf_alpha0, X_train0, X_test0),
    "alpha2": (sclf_alpha2, X_train, X_test),
    "alpha6": (sclf_alpha6, X_train6, X_test6),
}
alpha_true_labels = {"train": y_train, "test": y_test}

alpha_label_preds = {}   # (variant, split) -> predicted class labels
alpha_proba_preds = {}   # (variant, split) -> predicted probability of class 1
for variant, (model, train_features, test_features) in alpha_variants.items():
    for split_name, features in (("train", train_features), ("test", test_features)):
        alpha_label_preds[variant, split_name] = model.predict(features)
        alpha_proba_preds[variant, split_name] = model.predict_proba(features)[:, 1]

# Each metric is paired with the kind of prediction it consumes.
alpha_metric_specs = [
    ("accuracy score", accuracy_score, alpha_label_preds),
    ("roc auc score", roc_auc_score, alpha_proba_preds),
    ("f1 score", f1_score, alpha_label_preds),
]

is_first_section = True
for metric_name, metric_fn, preds in alpha_metric_specs:
    for split_name, true_labels in alpha_true_labels.items():
        # Blank line between sections, but not before the first one.
        if not is_first_section:
            print()
        is_first_section = False
        for variant in alpha_variants:
            score = metric_fn(true_labels, preds[variant, split_name])
            print(f"{variant} {split_name} {metric_name}: {score}")

# Keep the prediction names the cell previously left behind (test-set values).
y_pred_alpha0 = alpha_label_preds["alpha0", "test"]
y_pred_alpha2 = alpha_label_preds["alpha2", "test"]
y_pred_alpha6 = alpha_label_preds["alpha6", "test"]
y_pred_alpha0_proba = alpha_proba_preds["alpha0", "test"]
y_pred_alpha2_proba = alpha_proba_preds["alpha2", "test"]
y_pred_alpha6_proba = alpha_proba_preds["alpha6", "test"]

alpha0 train accuracy score: 0.8199233716475096
alpha2 train accuracy score: 0.8403575989782887
alpha6 train accuracy score: 0.8378033205619413

alpha0 test accuracy score: 0.75
alpha2 test accuracy score: 0.7653061224489796
alpha6 test accuracy score: 0.7551020408163265

alpha0 train roc auc score: 0.9184640522875815
alpha2 train roc auc score: 0.9253787878787878
alpha6 train roc auc score: 0.9233288770053476

alpha0 test roc auc score: 0.7923768939393939
alpha2 test roc auc score: 0.7988873106060606
alpha6 test roc auc score: 0.7993607954545454

alpha0 train f1 score: 0.8797953964194374
alpha2 train f1 score: 0.891961970613656
alpha6 train f1 score: 0.8902333621434745

alpha0 test f1 score: 0.8372093023255814
alpha2 test f1 score: 0.8456375838926175
alpha6 test f1 score: 0.836734693877551


## Best-performing dataset for the stacking classifier: no SMOTE-ENN with alpha = 0.2 (highest test accuracy and F1; alpha = 0.6 is marginally higher on test ROC AUC)


In [None]:
# Export the best model's test-set predictions for backtesting.
# Predict explicitly from the baseline (no SMOTE-ENN) stacking model rather
# than relying on whatever `y_pred` was last left holding by an earlier cell.
best_test_predictions = sclf.predict(X_test)
prediction_df = pd.DataFrame(best_test_predictions, columns=["Pred"])
# Default to_csv settings are kept, so the row index is written as the first column.
prediction_df.to_csv("stacking_predictions.csv")