In [1]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, recall_score, classification_report,confusion_matrix
import glob
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
import time

In [5]:
train_path = max(glob.glob('../data/train/*.csv'), key=os.path.getctime) 
test_path = max(glob.glob('../data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(24756, 579)
(4369, 579)


In [6]:
#features that are dependent on time and the final outcome
X_train, y_train = train_df.drop('state', axis=1), train_df['state']
X_test, y_test = test_df.drop('state', axis=1), test_df['state']

In [7]:
# Combine train and test data set tgt
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

# Voting Classifier

In [3]:
rf_model = RandomForestClassifier(n_estimators= 150, min_samples_split=5, min_samples_leaf= 6, max_features= 'sqrt', max_depth= 10, bootstrap= False)
xgb_model = xgb.XGBClassifier(n_estimators= 150, max_depth = 5, learning_rate = 0.1, subsample = 0.7, colsample_bytree = 0.7)
logreg_model = LogisticRegression(max_iter=25000, C= 1.601, penalty= 'l2', solver= 'lbfgs')

In [9]:
start_time = time.time()
voting_clf = VotingClassifier(estimators=[('rf', rf_model), ('xgb', xgb_model), ('logreg', logreg_model)], voting = 'soft') # soft voting to generate roc curve
voting_clf.fit(X_train, y_train)
y_pred_test = voting_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds
print(classification_report(y_test, y_pred_test, digits=5))
print(roc_auc_score(y_test, y_pred_test))

Total time taken for the program execution 42.055028676986694
              precision    recall  f1-score   support

           0    0.83017   0.68253   0.74914      1282
           1    0.87722   0.94201   0.90847      3087

    accuracy                        0.86587      4369
   macro avg    0.85370   0.81227   0.82880      4369
weighted avg    0.86342   0.86587   0.86172      4369

0.8122711011453091


# Stacking Model

In [10]:
start_time = time.time()
estimators = [('rf', rf_model), ('xgb', xgb_model), ('logreg', logreg_model)]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking_clf.fit(X_train, y_train)
y_pred_test = stacking_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds
print(classification_report(y_test, y_pred_test, digits=5))
print(roc_auc_score(y_test, y_pred_test))

Total time taken for the program execution 202.1307713985443
              precision    recall  f1-score   support

           0    0.80509   0.71529   0.75754      1282
           1    0.88700   0.92809   0.90708      3087

    accuracy                        0.86564      4369
   macro avg    0.84604   0.82169   0.83231      4369
weighted avg    0.86296   0.86564   0.86320      4369

0.8216870657333581
