In [8]:
from pycaret.classification import *
import pandas as pd
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
import os
import shutil

In [9]:
# Use Arial as the font family for matplotlib figures drawn after this cell.
plt.rc('font', family='Arial')

# Train model

auto-extracted data

In [10]:
# Train and evaluate on the features stored in data/train_extract.csv.
# Everything this cell writes is keyed on `experiment_name`:
#   models/<experiment_name>_model    -> saved via save_model
#   evaluate/<experiment_name>/*.png  -> diagnostic plots
experiment_name = 'extract'
data_path = 'data/train_extract.csv'
data = pd.read_csv(data_path, index_col=0)  # first CSV column becomes the index

# Configure the PyCaret experiment ('label' is the target column), then rank
# the candidate models. Five are kept; only best_ex[0] is used below.
clf = setup(
    data=data,
    target='label',
    session_id=123,
    use_gpu=True,
    experiment_name=experiment_name,
)
best_ex = compare_models(n_select=5)

# Persist the first-ranked model under models/.
os.makedirs('models', exist_ok=True)
save_path = os.path.join('models', experiment_name + '_model')
save_model(best_ex[0], save_path)

# Render the diagnostic plots for the same model. With save=True the PNGs are
# picked up from the current working directory by the moves below.
eval_dir = os.path.join('evaluate', experiment_name)
os.makedirs(eval_dir, exist_ok=True)
cf = plot_model(best_ex[0], plot='confusion_matrix', save=True)
roc = plot_model(best_ex[0], plot='auc', save=True)
feature_importance = plot_model(best_ex[0], plot='feature', save=True)

# Relocate the PNGs into this experiment's evaluate folder under stable,
# lowercase names. The last call is left as a bare expression, so the cell
# echoes the final destination path.
shutil.move(src='Confusion Matrix.png', dst=os.path.join(eval_dir, 'confusion_matrix.png'))
shutil.move(src='AUC.png', dst=os.path.join(eval_dir, 'roc.png'))
shutil.move(src='Feature Importance.png', dst=os.path.join(eval_dir, 'feature_importance.png'))

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Multiclass
3,Original data shape,"(2810, 17)"
4,Transformed data shape,"(2810, 17)"
5,Transformed train set shape,"(1966, 17)"
6,Transformed test set shape,"(844, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9542,0.9889,0.9542,0.9552,0.9539,0.9375,0.9377,3.229
lightgbm,Light Gradient Boosting Machine,0.9507,0.9905,0.9507,0.952,0.9501,0.9326,0.9329,2.845
rf,Random Forest Classifier,0.9502,0.9903,0.9502,0.9508,0.9496,0.9319,0.9322,0.207
dt,Decision Tree Classifier,0.9222,0.9499,0.9222,0.9247,0.9218,0.894,0.8947,0.031
et,Extra Trees Classifier,0.9079,0.9893,0.9079,0.9102,0.9067,0.8738,0.8744,0.175
lda,Linear Discriminant Analysis,0.8662,0.9799,0.8662,0.8669,0.8632,0.8153,0.8168,0.018
ada,Ada Boost Classifier,0.8245,0.9435,0.8245,0.8403,0.8196,0.7604,0.7652,0.214
qda,Quadratic Discriminant Analysis,0.8026,0.9475,0.8026,0.8164,0.8042,0.7335,0.7361,0.018
nb,Naive Bayes,0.7477,0.9346,0.7477,0.7679,0.7518,0.6604,0.6632,0.016
lr,Logistic Regression,0.6837,0.9195,0.6837,0.6658,0.6689,0.5594,0.5628,0.357


Transformation Pipeline and Model Successfully Saved


'evaluate\\extract\\feature_importance.png'

manually extracted data

In [11]:
# Train and evaluate on the features stored in data/train_true.csv.
# Everything this cell writes is keyed on `experiment_name`:
#   models/<experiment_name>_model    -> saved via save_model
#   evaluate/<experiment_name>/*.png  -> diagnostic plots
experiment_name = 'true'
data_path = 'data/train_true.csv'
data = pd.read_csv(data_path, index_col=0)  # first CSV column becomes the index

# Configure the PyCaret experiment ('label' is the target column), then rank
# the candidate models. Five are kept; only best_true[0] is used below.
clf = setup(
    data=data,
    target='label',
    session_id=123,
    use_gpu=True,
    experiment_name=experiment_name,
)
best_true = compare_models(n_select=5)

# Persist the first-ranked model under models/.
os.makedirs('models', exist_ok=True)
save_path = os.path.join('models', experiment_name + '_model')
save_model(best_true[0], save_path)

# Render the diagnostic plots for the same model. With save=True the PNGs are
# picked up from the current working directory by the moves below.
eval_dir = os.path.join('evaluate', experiment_name)
os.makedirs(eval_dir, exist_ok=True)
cf = plot_model(best_true[0], plot='confusion_matrix', save=True)
roc = plot_model(best_true[0], plot='auc', save=True)
feature_importance = plot_model(best_true[0], plot='feature', save=True)

# Relocate the PNGs into this experiment's evaluate folder under stable,
# lowercase names. The last call is left as a bare expression, so the cell
# echoes the final destination path.
shutil.move(src='Confusion Matrix.png', dst=os.path.join(eval_dir, 'confusion_matrix.png'))
shutil.move(src='AUC.png', dst=os.path.join(eval_dir, 'roc.png'))
shutil.move(src='Feature Importance.png', dst=os.path.join(eval_dir, 'feature_importance.png'))

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Multiclass
3,Original data shape,"(2810, 17)"
4,Transformed data shape,"(2810, 17)"
5,Transformed train set shape,"(1966, 17)"
6,Transformed test set shape,"(844, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9649,0.9871,0.9649,0.9662,0.9649,0.9521,0.9523,0.201
gbc,Gradient Boosting Classifier,0.9634,0.9873,0.9634,0.9648,0.9634,0.95,0.9503,1.428
lightgbm,Light Gradient Boosting Machine,0.9624,0.9879,0.9624,0.964,0.9624,0.9486,0.9489,2.823
et,Extra Trees Classifier,0.9496,0.9873,0.9496,0.9516,0.9493,0.9311,0.9315,0.173
dt,Decision Tree Classifier,0.9257,0.9522,0.9257,0.9279,0.9257,0.8988,0.8992,0.02
lda,Linear Discriminant Analysis,0.8876,0.9802,0.8876,0.889,0.8855,0.8453,0.8464,0.019
knn,K Neighbors Classifier,0.8871,0.9754,0.8871,0.8894,0.8855,0.8451,0.8458,0.077
nb,Naive Bayes,0.8809,0.9784,0.8809,0.8906,0.8827,0.8394,0.8412,0.016
lr,Logistic Regression,0.8398,0.968,0.8398,0.8401,0.8371,0.7801,0.781,0.375
ada,Ada Boost Classifier,0.8011,0.884,0.8011,0.8123,0.789,0.7337,0.7443,0.134


Transformation Pipeline and Model Successfully Saved


'evaluate\\true\\feature_importance.png'

# Test model

In [13]:
# Score both saved models on their held-out CSV files.
#
# In every label / prediction column extracted below, class ids 4 and 5 are
# exchanged with each other; all other values pass through unchanged.
# NOTE(review): the reason for this swap is not visible in this part of the
# notebook -- confirm it matches the class order expected by later cells.
swap_4_5 = {4: 5, 5: 4}

# --- model saved as models/extract_model, scored on data/test_extract.csv ---
test_path = 'data/test_extract.csv'
test_data = pd.read_csv(test_path, index_col=0)
y_true = test_data['label']
x = test_data.drop(columns='label')

# The full frame (including 'label') is passed to predict_model; the returned
# frame carries both 'label' and 'prediction_label'.
gbc_auto = load_model('models/extract_model')
df_auto = predict_model(gbc_auto, data=test_data)
label_auto = df_auto['label'].replace(swap_4_5)
y_pred_auto = df_auto['prediction_label'].replace(swap_4_5)

# --- model saved as models/true_model, scored on data/test_true.csv ---
# test_path, test_data, y_true and x are rebound here, so once the cell has
# finished they refer to the test_true.csv data. y_true and x are not read
# again inside this cell.
test_path = 'data/test_true.csv'
test_data = pd.read_csv(test_path, index_col=0)
y_true = test_data['label']
x = test_data.drop(columns='label')

rf_mn = load_model('models/true_model')
df_mn = predict_model(rf_mn, data=test_data)
label_mn = df_mn['label'].replace(swap_4_5)
y_pred_mn = df_mn['prediction_label'].replace(swap_4_5)

Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.9472,0.9873,0.9472,0.9473,0.9471,0.9268,0.9268


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9643,0.9839,0.9643,0.9645,0.9643,0.9505,0.9506
