In [5]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, recall_score, classification_report,confusion_matrix
import glob
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
import time

In [6]:
train_path = max(glob.glob('./data/train/*.csv'), key=os.path.getctime) 
test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(train_df.shape)
print(test_df.shape)

(24756, 579)
(4369, 579)


In [7]:
#features that are dependent on time and the final outcome

X_train, y_train = train_df.drop('state', axis=1), train_df['state']
X_test, y_test = test_df.drop('state', axis=1), test_df['state']

In [8]:
# Combine train and test data set tgt

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [9]:
# Check if dataset is balanced
print(train_df[train_df.state == 1].shape)
print(train_df[train_df.state == 0].shape)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(17482, 579)
(7274, 579)
(24756, 578)
(24756,)
(4369, 578)
(4369,)


# Voting Classifier

In [12]:
rf_model = RandomForestClassifier(n_estimators= 150, min_samples_split=2, min_samples_leaf= 4, max_features= 'auto', max_depth= 8, bootstrap= False)
xgb_model = xgb.XGBClassifier(n_estimators= 150, max_depth = 5, learning_rate = 0.1)
logreg_model = LogisticRegression(max_iter=25000, C= 4.207230251572931, penalty= 'l2', solver= 'lbfgs')

In [13]:
start_time = time.time()
voting_clf = VotingClassifier(estimators=[('rf', rf_model), ('xgb', xgb_model), ('logreg', logreg_model)])
voting_clf.fit(X_train, y_train)
y_pred_test = voting_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds
print(classification_report(y_test, y_pred_test, digits=5))
print(roc_auc_score(y_test, y_pred_test))

  warn(


Total time taken for the program execution 87.574782371521
              precision    recall  f1-score   support

           0    0.92857   0.82137   0.87169      1282
           1    0.92921   0.97376   0.95096      3087

    accuracy                        0.92905      4369
   macro avg    0.92889   0.89757   0.91133      4369
weighted avg    0.92902   0.92905   0.92770      4369

0.8975668939294015


# Stacking Model

In [14]:
start_time = time.time()
estimators = [('rf', rf_model), ('xgb', xgb_model), ('logreg', logreg_model)]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking_clf.fit(X_train, y_train)
y_pred_test = stacking_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds
print(classification_report(y_test, y_pred_test, digits=5))
print(roc_auc_score(y_test, y_pred_test))

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Total time taken for the program execution 451.02486062049866
              precision    recall  f1-score   support

           0    0.92456   0.86037   0.89131      1282
           1    0.94364   0.97085   0.95705      3087

    accuracy                        0.93843      4369
   macro avg    0.93410   0.91561   0.92418      4369
weighted avg    0.93804   0.93843   0.93776      4369

0.9156099480130808
