In [2]:
from pycaret.classification import *
import pandas as pd
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
import os
import shutil

# Set matplotlib's default font family to Arial for figures drawn after this point.
plt.rcParams['font.family'] = 'Arial'

# Train model

Automatically extracted data

In [3]:
# ---- Inputs: the auto-extracted training set --------------------------------
experiment_name = 'extract'
data_path = 'data/train_extract.csv'
data = pd.read_csv(data_path, index_col=0)

# ---- PyCaret experiment: initialise, then keep the five top-ranked models ----
clf = setup(
    data=data,
    target='label',
    use_gpu=True,
    session_id=123,
    experiment_name=experiment_name,
)
best_auto = compare_models(n_select=5)

# ---- Persist the first (top-ranked) model under models/<experiment>_model ----
os.makedirs('models', exist_ok=True)
save_path = os.path.join('models', experiment_name + '_model')
save_model(best_auto[0], save_path)

# ---- Diagnostic plots for that same model ------------------------------------
eval_dir = os.path.join('evaluate', experiment_name)
os.makedirs(eval_dir, exist_ok=True)
cf = plot_model(best_auto[0], plot='confusion_matrix', save=True)
# 'auc' is plot_model's default plot; spelled out so it is clear which
# figure produces 'AUC.png'.
roc = plot_model(best_auto[0], plot='auc', save=True)
feature_importance = plot_model(best_auto[0], plot='feature', save=True)

# ---- Relocate the PNGs from the working directory into eval_dir --------------
# (file name written by plot_model, file name used inside eval_dir)
renamed_plots = (
    ('Confusion Matrix.png', 'confusion_matrix.png'),
    ('AUC.png', 'roc.png'),
    ('Feature Importance.png', 'feature_importance.png'),
)
moved_paths = [
    shutil.move(saved_as, os.path.join(eval_dir, final_name))
    for saved_as, final_name in renamed_plots
]
moved_paths[-1]  # last expression: the final destination path is the cell's result

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Multiclass
3,Original data shape,"(2810, 17)"
4,Transformed data shape,"(2810, 17)"
5,Transformed train set shape,"(1966, 17)"
6,Transformed test set shape,"(844, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9552,0.9909,0.9552,0.9561,0.9548,0.9388,0.9391,3.5
gbc,Gradient Boosting Classifier,0.9527,0.9901,0.9527,0.9539,0.9524,0.9354,0.9356,3.247
rf,Random Forest Classifier,0.9461,0.9907,0.9461,0.9472,0.9452,0.9263,0.9266,0.203
dt,Decision Tree Classifier,0.9217,0.9496,0.9217,0.9243,0.9217,0.8931,0.8935,0.031
et,Extra Trees Classifier,0.9125,0.9891,0.9125,0.9134,0.9108,0.8799,0.8805,0.171
lda,Linear Discriminant Analysis,0.8718,0.9797,0.8718,0.8721,0.8689,0.8227,0.8242,0.018
ada,Ada Boost Classifier,0.8453,0.9412,0.8453,0.8401,0.8324,0.7881,0.7938,0.205
qda,Quadratic Discriminant Analysis,0.8036,0.9515,0.8036,0.8174,0.8048,0.735,0.7379,0.017
nb,Naive Bayes,0.7492,0.9364,0.7492,0.7707,0.7491,0.6626,0.6666,0.016
knn,K Neighbors Classifier,0.6851,0.9017,0.6851,0.6814,0.6759,0.5617,0.5637,0.077


Transformation Pipeline and Model Successfully Saved


'evaluate\\extract\\feature_importance.png'

Manually extracted data

In [4]:
# ---- Inputs: the manually extracted training set -----------------------------
experiment_name = 'true'
data_path = 'data/train_true.csv'
data = pd.read_csv(data_path, index_col=0)

# ---- PyCaret experiment: initialise, then keep the five top-ranked models ----
clf = setup(
    data=data,
    target='label',
    use_gpu=True,
    session_id=123,
    experiment_name=experiment_name,
)
# NOTE: the name is spelled `best_maunal` (sic); it is left unchanged because
# other cells may refer to it.
best_maunal = compare_models(n_select=5)

# ---- Persist the first (top-ranked) model under models/<experiment>_model ----
os.makedirs('models', exist_ok=True)
save_path = os.path.join('models', experiment_name + '_model')
save_model(best_maunal[0], save_path)

# ---- Diagnostic plots for that same model ------------------------------------
eval_dir = os.path.join('evaluate', experiment_name)
os.makedirs(eval_dir, exist_ok=True)
cf = plot_model(best_maunal[0], plot='confusion_matrix', save=True)
# 'auc' is plot_model's default plot; spelled out so it is clear which
# figure produces 'AUC.png'.
roc = plot_model(best_maunal[0], plot='auc', save=True)
feature_importance = plot_model(best_maunal[0], plot='feature', save=True)

# ---- Relocate the PNGs from the working directory into eval_dir --------------
# (file name written by plot_model, file name used inside eval_dir)
renamed_plots = (
    ('Confusion Matrix.png', 'confusion_matrix.png'),
    ('AUC.png', 'roc.png'),
    ('Feature Importance.png', 'feature_importance.png'),
)
moved_paths = [
    shutil.move(saved_as, os.path.join(eval_dir, final_name))
    for saved_as, final_name in renamed_plots
]
moved_paths[-1]  # last expression: the final destination path is the cell's result

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Multiclass
3,Original data shape,"(2810, 17)"
4,Transformed data shape,"(2810, 17)"
5,Transformed train set shape,"(1966, 17)"
6,Transformed test set shape,"(844, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9674,0.9886,0.9674,0.9685,0.9673,0.9556,0.9558,0.196
gbc,Gradient Boosting Classifier,0.9654,0.9879,0.9654,0.9664,0.9653,0.9528,0.953,1.407
lightgbm,Light Gradient Boosting Machine,0.9654,0.989,0.9654,0.9663,0.9653,0.9527,0.9529,3.24
et,Extra Trees Classifier,0.9527,0.9884,0.9527,0.9549,0.9523,0.9353,0.9357,0.174
dt,Decision Tree Classifier,0.9298,0.9538,0.9298,0.9319,0.9298,0.9043,0.9048,0.02
knn,K Neighbors Classifier,0.8921,0.9772,0.8921,0.8938,0.8908,0.852,0.8526,0.078
nb,Naive Bayes,0.883,0.9786,0.883,0.8919,0.8843,0.8421,0.8439,0.016
lda,Linear Discriminant Analysis,0.883,0.9803,0.883,0.8872,0.8801,0.8391,0.8403,0.019
ada,Ada Boost Classifier,0.8591,0.8937,0.8591,0.8209,0.8314,0.8067,0.8143,0.135
lr,Logistic Regression,0.8474,0.9689,0.8474,0.8488,0.8451,0.7908,0.7917,0.356


Transformation Pipeline and Model Successfully Saved


'evaluate\\true\\feature_importance.png'

# Test model

In [5]:
# test model

# Class ids 4 and 5 are exchanged on both the ground-truth and the predicted
# labels before they are stored.
# NOTE(review): the reason for the swap is not visible in this notebook —
# presumably it aligns class ids with another labelling convention; confirm.
LABEL_SWAP = {4: 5, 5: 4}


def predict_on_test(csv_path, model_path):
    """Score one test CSV with a saved PyCaret pipeline.

    Parameters
    ----------
    csv_path : str
        CSV file to read; its first column is used as the index and it must
        contain a 'label' column.
    model_path : str
        Path passed to ``load_model`` (written without the file extension).

    Returns
    -------
    tuple
        ``(frame, model, scored, labels, predicted)`` where ``frame`` is the
        raw test DataFrame, ``model`` the loaded pipeline, ``scored`` the
        DataFrame returned by ``predict_model``, and ``labels`` /
        ``predicted`` are its 'label' and 'prediction_label' columns with
        ``LABEL_SWAP`` applied.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    model = load_model(model_path)
    scored = predict_model(model, data=frame)
    labels = scored['label'].replace(LABEL_SWAP)
    predicted = scored['prediction_label'].replace(LABEL_SWAP)
    return frame, model, scored, labels, predicted


# Model trained on the auto-extracted data. The variable names are kept for
# compatibility with other cells; the `gbc_`/`rf_` prefixes do not necessarily
# name the estimator that was actually saved.
test_path = 'data/test_extract.csv'
test_data, gbc_auto, df_auto, label_auto, y_pred_auto = predict_on_test(
    test_path, 'models/extract_model'
)

# Model trained on the manually extracted data.
test_path = 'data/test_true.csv'
test_data, rf_mn, df_mn, label_mn, y_pred_mn = predict_on_test(
    test_path, 'models/true_model'
)

# `y_true` and `x` describe the last test set loaded (test_true.csv). The
# earlier per-dataset copies were overwritten before being used, so they are
# only computed once here.
y_true = test_data['label']
x = test_data.drop('label', axis=1)

Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9458,0.9877,0.9458,0.9461,0.9457,0.9247,0.9248


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9643,0.9865,0.9643,0.9645,0.9643,0.9506,0.9506
