In [None]:
# IMPORT DATASET
# Loads the coffee dataset and splits it into a feature matrix X and a target
# vector y (both NumPy arrays).

import pandas as pd
# --- Import Dataset 1
dataset = pd.read_csv('11-df_coffee/dataset.csv')
# --- Changing pandas dataframe to numpy array to determine X and y variables
# The first N_FEATURE_COLUMNS columns are used as features and the last column
# as the target.
# NOTE(review): presumably the file has exactly 86 columns (85 features + label) — confirm.
N_FEATURE_COLUMNS = 85
X = dataset.iloc[:, :N_FEATURE_COLUMNS].values
y = dataset.iloc[:, -1].values
# Read the column count straight from the shape; unpacking into `_` would
# clobber IPython's "last output" variable.
num_features = X.shape[1]
Xcol = num_features   # kept under its original name for any later cell that uses it
print(num_features)
# Preview goes last so the notebook actually renders it (a bare expression in
# the middle of a cell is evaluated and thrown away).
dataset.head(10)

In [None]:
# DATA PREPROCESSING
# Splits X / y into train and test partitions, then standardises the features.

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder   # not used in this cell; import kept in case a later cell relies on it
from sklearn.model_selection import train_test_split

# --- Separating the dataset into training and validation set
# A fixed seed makes the 70/30 split (and every metric computed from it)
# reproducible across "Restart & Run All".
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
# --- Data Normalization
# The scaler is fitted on the training partition only and then applied to the
# test partition, so no test-set statistics leak into training.
# X itself is left unscaled, which also keeps this cell idempotent on re-run;
# use sc.transform(X) if a scaled copy of the full matrix is needed.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# --- Determine classes (y variable) in training set
# The labels are passed to the classifiers as the 1-D array y; no one-hot
# encoding is applied.

In [None]:
# ML Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score, classification_report
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# instantiate the model
# Only the hyperparameters that are tuned (or worth tuning) are spelled out;
# everything else is left at the scikit-learn default, which is what the
# previous, fully explicit argument lists resolved to anyway. This also avoids
# passing arguments such as LogisticRegression's `multi_class`, which is
# deprecated in recent scikit-learn releases.

# Seed for the estimators that use randomness, so results are reproducible.
MODEL_SEED = 42

clf_gnb = GaussianNB()
clf_lr = LogisticRegression(C=1.0,
                            solver='lbfgs',
                            max_iter=100)
clf_dt = DecisionTreeClassifier(criterion='entropy',     # default 'gini'
                                max_depth=5,             # default None
                                min_samples_leaf=10,     # default 1
                                random_state=MODEL_SEED)
clf_rf = RandomForestClassifier(n_estimators=100,
                                min_samples_leaf=16,     # default 1
                                random_state=MODEL_SEED)
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_svm = svm.SVC(C=1.0,
                  kernel='linear',        # default 'rbf'
                  probability=True,       # default False; needed for predict_proba
                  random_state=MODEL_SEED)
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
# GAUSSIAN NAIVE-BAYES
# Fits the classifier, plots its confusion matrix and prints the derived metrics.

# --- fit the model
model_gnb = clf_gnb.fit(X_train, y_train)
pred_gnb = model_gnb.predict(X_test)
# --- Model Accuracy
gnb_acc = accuracy_score(y_test, pred_gnb)
print('Model Accuracy: {:.5f}'.format(gnb_acc))
# --- Confusion Matrix
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones, which is what ConfusionMatrixDisplay's axis labels assume.
# `labels=` pins the row/column order to the classes seen during fitting.
gnb_conmat = confusion_matrix(y_test, pred_gnb, labels=model_gnb.classes_)
print(gnb_conmat)
# Not named `display`, so IPython's built-in display() is not shadowed.
# NOTE(review): assumes the sorted class labels map to [ARABICA, ROBUSTA] — confirm.
gnb_cm_display = ConfusionMatrixDisplay(gnb_conmat, display_labels=["ARABICA", "ROBUSTA"])
gnb_cm_display.plot(cmap="YlGnBu")
gnb_cm_display.figure_.suptitle('Confusion Matrix')
plt.gca().invert_yaxis()
plt.show()

# The first class (row/column 0) is the positive class for the manual metrics
# below, so their values are the same as before the orientation fix.
gnb_TP = gnb_conmat[0,0]   # true 0, predicted 0
gnb_TN = gnb_conmat[1,1]   # true 1, predicted 1
gnb_FP = gnb_conmat[1,0]   # true 1, predicted 0
gnb_FN = gnb_conmat[0,1]   # true 0, predicted 1

# --- Performance Metrics
gnb_total = float(gnb_TP+gnb_TN+gnb_FP+gnb_FN)
gnb_acc = (gnb_TP+gnb_TN)/gnb_total
print('GNB Accuracy : {0:0.5f}'.format(gnb_acc))
gnb_error = (gnb_FP+gnb_FN)/gnb_total
print('GNB Error : {0:0.5f}'.format(gnb_error))
gnb_spec = gnb_TN/float(gnb_TN+gnb_FP)
print('GNB Specificity : {0:0.5f}'.format(gnb_spec))
gnb_prec = gnb_TP/float(gnb_TP+gnb_FP)
print('GNB Precision : {0:0.5f}'.format(gnb_prec))
gnb_recall = gnb_TP/float(gnb_TP+gnb_FN)
print('GNB Recall/Sensitivity : {0:0.5f}'.format(gnb_recall))
gnb_f1s = 2*gnb_recall*gnb_prec/float(gnb_recall+gnb_prec)
print('GNB F1-Score : {0:0.5f}'.format(gnb_f1s))
gnb_tpr = gnb_TP/float(gnb_TP+gnb_FN)    # same quantity as recall
print('GNB True Positive Rate : {0:0.5f}'.format(gnb_tpr))
gnb_fpr = gnb_FP/float(gnb_FP+gnb_TN)    # equals 1 - specificity
print('GNB False Positive Rate : {0:0.5f}'.format(gnb_fpr))
# (y_true, y_pred) order, otherwise precision and recall are swapped in the report.
gnb_res = classification_report(y_test, pred_gnb)
print(gnb_res)
# NOTE(review): the scikit-learn scores below use the default pos_label=1, i.e.
# the *second* class is positive, whereas the manual metrics above treat the
# first class as positive. Decide on one convention before comparing them.
gnb_accuracy = metrics.accuracy_score(y_test, pred_gnb)
gnb_precision = metrics.precision_score(y_test, pred_gnb)
gnb_sensitivity_recall = metrics.recall_score(y_test, pred_gnb)
gnb_specificity = metrics.recall_score(y_test, pred_gnb, pos_label=0)
gnb_F1_score = metrics.f1_score(y_test, pred_gnb)

In [None]:
# LOGISTIC REGRESSION
# Fits the classifier, plots its confusion matrix and prints the derived metrics.

# fit the model
model_lr = clf_lr.fit(X_train, y_train)
pred_lr = model_lr.predict(X_test)
# --- Model Accuracy
lr_acc = accuracy_score(y_test, pred_lr)
print('Model Accuracy: {:.5f}'.format(lr_acc))
# --- Confusion Matrix
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones, which is what ConfusionMatrixDisplay's axis labels assume.
# `labels=` pins the row/column order to the classes seen during fitting.
lr_conmat = confusion_matrix(y_test, pred_lr, labels=model_lr.classes_)
print(lr_conmat)
# Not named `display`, so IPython's built-in display() is not shadowed.
# NOTE(review): assumes the sorted class labels map to [ARABICA, ROBUSTA] — confirm.
lr_cm_display = ConfusionMatrixDisplay(lr_conmat, display_labels=["ARABICA", "ROBUSTA"])
lr_cm_display.plot(cmap="YlGnBu")
lr_cm_display.figure_.suptitle('Confusion Matrix')
plt.gca().invert_yaxis()
plt.show()

# The first class (row/column 0) is the positive class for the metrics below,
# so their values are the same as before the orientation fix.
lr_TP = lr_conmat[0,0]   # true 0, predicted 0
lr_TN = lr_conmat[1,1]   # true 1, predicted 1
lr_FP = lr_conmat[1,0]   # true 1, predicted 0
lr_FN = lr_conmat[0,1]   # true 0, predicted 1

# --- Performance Metrics
lr_total = float(lr_TP+lr_TN+lr_FP+lr_FN)
lr_acc = (lr_TP+lr_TN)/lr_total
print('LR Accuracy : {0:0.5f}'.format(lr_acc))
lr_error = (lr_FP+lr_FN)/lr_total
print('LR Error : {0:0.5f}'.format(lr_error))
lr_spec = lr_TN/float(lr_TN+lr_FP)
print('LR Specificity : {0:0.5f}'.format(lr_spec))
lr_prec = lr_TP/float(lr_TP+lr_FP)
print('LR Precision : {0:0.5f}'.format(lr_prec))
lr_recall = lr_TP/float(lr_TP+lr_FN)
print('LR Recall/Sensitivity : {0:0.5f}'.format(lr_recall))
lr_f1s = 2*lr_recall*lr_prec/float(lr_recall+lr_prec)
print('LR F1-Score : {0:0.5f}'.format(lr_f1s))
lr_tpr = lr_TP/float(lr_TP+lr_FN)    # same quantity as recall
print('LR True Positive Rate : {0:0.5f}'.format(lr_tpr))
lr_fpr = lr_FP/float(lr_FP+lr_TN)    # equals 1 - specificity
print('LR False Positive Rate : {0:0.5f}'.format(lr_fpr))
# (y_true, y_pred) order, otherwise precision and recall are swapped in the report.
lr_res = classification_report(y_test, pred_lr)
print(lr_res)

In [None]:
# DECISION TREE
# Fits the classifier, plots its confusion matrix and prints the derived metrics.

# fit the model
model_dt = clf_dt.fit(X_train, y_train)
pred_dt = model_dt.predict(X_test)
# --- Model Accuracy
dt_acc = accuracy_score(y_test, pred_dt)
print('Model Accuracy: {:.5f}'.format(dt_acc))
# --- Confusion Matrix
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones, which is what ConfusionMatrixDisplay's axis labels assume.
# `labels=` pins the row/column order to the classes seen during fitting.
dt_conmat = confusion_matrix(y_test, pred_dt, labels=model_dt.classes_)
print(dt_conmat)
# Not named `display`, so IPython's built-in display() is not shadowed.
# NOTE(review): assumes the sorted class labels map to [ARABICA, ROBUSTA] — confirm.
dt_cm_display = ConfusionMatrixDisplay(dt_conmat, display_labels=["ARABICA", "ROBUSTA"])
dt_cm_display.plot(cmap="YlGnBu")
dt_cm_display.figure_.suptitle('Confusion Matrix')
plt.gca().invert_yaxis()
plt.show()

# The first class (row/column 0) is the positive class for the metrics below,
# so their values are the same as before the orientation fix.
dt_TP = dt_conmat[0,0]   # true 0, predicted 0
dt_TN = dt_conmat[1,1]   # true 1, predicted 1
dt_FP = dt_conmat[1,0]   # true 1, predicted 0
dt_FN = dt_conmat[0,1]   # true 0, predicted 1

# --- Performance Metrics
dt_total = float(dt_TP+dt_TN+dt_FP+dt_FN)
dt_acc = (dt_TP+dt_TN)/dt_total
print('DT Accuracy : {0:0.5f}'.format(dt_acc))
dt_error = (dt_FP+dt_FN)/dt_total
print('DT Error : {0:0.5f}'.format(dt_error))
dt_spec = dt_TN/float(dt_TN+dt_FP)
print('DT Specificity : {0:0.5f}'.format(dt_spec))
dt_prec = dt_TP/float(dt_TP+dt_FP)
print('DT Precision : {0:0.5f}'.format(dt_prec))
dt_recall = dt_TP/float(dt_TP+dt_FN)
print('DT Recall/Sensitivity : {0:0.5f}'.format(dt_recall))
dt_f1s = 2*dt_recall*dt_prec/float(dt_recall+dt_prec)
print('DT F1-Score : {0:0.5f}'.format(dt_f1s))
dt_tpr = dt_TP/float(dt_TP+dt_FN)    # same quantity as recall
print('DT True Positive Rate : {0:0.5f}'.format(dt_tpr))
dt_fpr = dt_FP/float(dt_FP+dt_TN)    # equals 1 - specificity
print('DT False Positive Rate : {0:0.5f}'.format(dt_fpr))
# (y_true, y_pred) order, otherwise precision and recall are swapped in the report.
dt_res = classification_report(y_test, pred_dt)
print(dt_res)

In [None]:
# RANDOM FOREST
# Fits the classifier, plots its confusion matrix and prints the derived metrics.

# fit the model
model_rf = clf_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)
# --- Model Accuracy
rf_acc = accuracy_score(y_test, pred_rf)
print('Model Accuracy: {:.5f}'.format(rf_acc))
# --- Confusion Matrix
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones, which is what ConfusionMatrixDisplay's axis labels assume.
# `labels=` pins the row/column order to the classes seen during fitting.
rf_conmat = confusion_matrix(y_test, pred_rf, labels=model_rf.classes_)
print(rf_conmat)
# Not named `display`, so IPython's built-in display() is not shadowed.
# NOTE(review): assumes the sorted class labels map to [ARABICA, ROBUSTA] — confirm.
rf_cm_display = ConfusionMatrixDisplay(rf_conmat, display_labels=["ARABICA", "ROBUSTA"])
rf_cm_display.plot(cmap="YlGnBu")
rf_cm_display.figure_.suptitle('Confusion Matrix')
plt.gca().invert_yaxis()
plt.show()

# The first class (row/column 0) is the positive class for the metrics below,
# so their values are the same as before the orientation fix.
rf_TP = rf_conmat[0,0]   # true 0, predicted 0
rf_TN = rf_conmat[1,1]   # true 1, predicted 1
rf_FP = rf_conmat[1,0]   # true 1, predicted 0
rf_FN = rf_conmat[0,1]   # true 0, predicted 1

# --- Performance Metrics
rf_total = float(rf_TP+rf_TN+rf_FP+rf_FN)
rf_acc = (rf_TP+rf_TN)/rf_total
print('RF Accuracy : {0:0.5f}'.format(rf_acc))
rf_error = (rf_FP+rf_FN)/rf_total
print('RF Error : {0:0.5f}'.format(rf_error))
rf_spec = rf_TN/float(rf_TN+rf_FP)
print('RF Specificity : {0:0.5f}'.format(rf_spec))
rf_prec = rf_TP/float(rf_TP+rf_FP)
print('RF Precision : {0:0.5f}'.format(rf_prec))
rf_recall = rf_TP/float(rf_TP+rf_FN)
print('RF Recall/Sensitivity : {0:0.5f}'.format(rf_recall))
rf_f1s = 2*rf_recall*rf_prec/float(rf_recall+rf_prec)
print('RF F1-Score : {0:0.5f}'.format(rf_f1s))
rf_tpr = rf_TP/float(rf_TP+rf_FN)    # same quantity as recall
print('RF True Positive Rate : {0:0.5f}'.format(rf_tpr))
rf_fpr = rf_FP/float(rf_FP+rf_TN)    # equals 1 - specificity
print('RF False Positive Rate : {0:0.5f}'.format(rf_fpr))
# (y_true, y_pred) order, otherwise precision and recall are swapped in the report.
rf_res = classification_report(y_test, pred_rf)
print(rf_res)

In [None]:
# K-NEAREST NEIGHBORS
# Fits the classifier, plots its confusion matrix and prints the derived metrics.

# fit the model
model_knn = clf_knn.fit(X_train, y_train)
pred_knn = model_knn.predict(X_test)
# --- Model Accuracy
knn_acc = accuracy_score(y_test, pred_knn)
print('Model Accuracy: {:.5f}'.format(knn_acc))
# --- Confusion Matrix
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones, which is what ConfusionMatrixDisplay's axis labels assume.
# `labels=` pins the row/column order to the classes seen during fitting.
knn_conmat = confusion_matrix(y_test, pred_knn, labels=model_knn.classes_)
print(knn_conmat)
# Not named `display`, so IPython's built-in display() is not shadowed.
# NOTE(review): assumes the sorted class labels map to [ARABICA, ROBUSTA] — confirm.
knn_cm_display = ConfusionMatrixDisplay(knn_conmat, display_labels=["ARABICA", "ROBUSTA"])
knn_cm_display.plot(cmap="YlGnBu")
knn_cm_display.figure_.suptitle('Confusion Matrix')
plt.gca().invert_yaxis()
plt.show()

# The first class (row/column 0) is the positive class for the metrics below,
# so their values are the same as before the orientation fix.
knn_TP = knn_conmat[0,0]   # true 0, predicted 0
knn_TN = knn_conmat[1,1]   # true 1, predicted 1
knn_FP = knn_conmat[1,0]   # true 1, predicted 0
knn_FN = knn_conmat[0,1]   # true 0, predicted 1

# --- Performance Metrics
knn_total = float(knn_TP+knn_TN+knn_FP+knn_FN)
knn_acc = (knn_TP+knn_TN)/knn_total
print('KNN Accuracy : {0:0.5f}'.format(knn_acc))
knn_error = (knn_FP+knn_FN)/knn_total
print('KNN Error : {0:0.5f}'.format(knn_error))
knn_spec = knn_TN/float(knn_TN+knn_FP)
print('KNN Specificity : {0:0.5f}'.format(knn_spec))
knn_prec = knn_TP/float(knn_TP+knn_FP)
print('KNN Precision : {0:0.5f}'.format(knn_prec))
knn_recall = knn_TP/float(knn_TP+knn_FN)
print('KNN Recall/Sensitivity : {0:0.5f}'.format(knn_recall))
knn_f1s = 2*knn_recall*knn_prec/float(knn_recall+knn_prec)
print('KNN F1-Score : {0:0.5f}'.format(knn_f1s))
knn_tpr = knn_TP/float(knn_TP+knn_FN)    # same quantity as recall
print('KNN True Positive Rate : {0:0.5f}'.format(knn_tpr))
knn_fpr = knn_FP/float(knn_FP+knn_TN)    # equals 1 - specificity
print('KNN False Positive Rate : {0:0.5f}'.format(knn_fpr))
# (y_true, y_pred) order, otherwise precision and recall are swapped in the report.
knn_res = classification_report(y_test, pred_knn)
print(knn_res)

In [None]:
# SUPPORT VECTOR MACHINE
# Fits the classifier, plots its confusion matrix and prints the derived metrics.

# fit the model
model_svm = clf_svm.fit(X_train, y_train)
pred_svm = model_svm.predict(X_test)
# --- Model Accuracy
svm_acc = accuracy_score(y_test, pred_svm)
print('Model Accuracy: {:.5f}'.format(svm_acc))
# --- Confusion Matrix
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones, which is what ConfusionMatrixDisplay's axis labels assume.
# `labels=` pins the row/column order to the classes seen during fitting.
svm_conmat = confusion_matrix(y_test, pred_svm, labels=model_svm.classes_)
print(svm_conmat)
# Not named `display`, so IPython's built-in display() is not shadowed.
# NOTE(review): assumes the sorted class labels map to [ARABICA, ROBUSTA] — confirm.
svm_cm_display = ConfusionMatrixDisplay(svm_conmat, display_labels=["ARABICA", "ROBUSTA"])
svm_cm_display.plot(cmap="YlGnBu")
svm_cm_display.figure_.suptitle('Confusion Matrix')
plt.gca().invert_yaxis()
plt.show()

# The first class (row/column 0) is the positive class for the metrics below,
# so their values are the same as before the orientation fix.
svm_TP = svm_conmat[0,0]   # true 0, predicted 0
svm_TN = svm_conmat[1,1]   # true 1, predicted 1
svm_FP = svm_conmat[1,0]   # true 1, predicted 0
svm_FN = svm_conmat[0,1]   # true 0, predicted 1

# --- Performance Metrics
svm_total = float(svm_TP+svm_TN+svm_FP+svm_FN)
svm_acc = (svm_TP+svm_TN)/svm_total
print('SVM Accuracy : {0:0.5f}'.format(svm_acc))
svm_error = (svm_FP+svm_FN)/svm_total
print('SVM Error : {0:0.5f}'.format(svm_error))
svm_spec = svm_TN/float(svm_TN+svm_FP)
print('SVM Specificity : {0:0.5f}'.format(svm_spec))
svm_prec = svm_TP/float(svm_TP+svm_FP)
print('SVM Precision : {0:0.5f}'.format(svm_prec))
svm_recall = svm_TP/float(svm_TP+svm_FN)
print('SVM Recall/Sensitivity : {0:0.5f}'.format(svm_recall))
svm_f1s = 2*svm_recall*svm_prec/float(svm_recall+svm_prec)
print('SVM F1-Score : {0:0.5f}'.format(svm_f1s))
svm_tpr = svm_TP/float(svm_TP+svm_FN)    # same quantity as recall
print('SVM True Positive Rate : {0:0.5f}'.format(svm_tpr))
svm_fpr = svm_FP/float(svm_FP+svm_TN)    # equals 1 - specificity
print('SVM False Positive Rate : {0:0.5f}'.format(svm_fpr))
# (y_true, y_pred) order, otherwise precision and recall are swapped in the report.
svm_res = classification_report(y_test, pred_svm)
print(svm_res)

In [None]:
# ROC-AUC Analysis
# Draws the ROC curve of every fitted model on one set of axes, with each
# model's AUC in the legend. Scores are column 1 of predict_proba.
# NOTE(review): assumes y holds binary 0/1 labels, so column 1 is the class
# roc_curve treats as positive — confirm.

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
# --- Naive Bayes ---
gnb_pred_proba = model_gnb.predict_proba(X_test)[:, 1]
fpr_gnb, tpr_gnb, _ = roc_curve(y_test, gnb_pred_proba)
# Stored under its own name; it used to be written to lr_auc and then
# overwritten by the logistic-regression AUC.
gnb_auc = roc_auc_score(y_test, gnb_pred_proba)
plt.plot(fpr_gnb, tpr_gnb, label="Gaussian Naive-Bayes, AUC={:.5f}".format(gnb_auc))
# --- Logistic Regression ---
lr_pred_proba = model_lr.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(y_test, lr_pred_proba)
lr_auc = roc_auc_score(y_test, lr_pred_proba)
plt.plot(fpr_lr, tpr_lr, label="Logistic Regression, AUC={:.5f}".format(lr_auc))
# --- Decision Tree ---
dt_pred_proba = model_dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, dt_pred_proba)
dt_auc = roc_auc_score(y_test, dt_pred_proba)
plt.plot(fpr_dt, tpr_dt, label="Decision Tree, AUC={:.5f}".format(dt_auc))
# --- Random Forest ---
rf_pred_proba = model_rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_pred_proba)
rf_auc = roc_auc_score(y_test, rf_pred_proba)
plt.plot(fpr_rf, tpr_rf, label="Random Forest, AUC={:.5f}".format(rf_auc))
# --- K-Nearest Neighbors ---
knn_pred_proba = model_knn.predict_proba(X_test)[:, 1]
fpr_knn, tpr_knn, _ = roc_curve(y_test, knn_pred_proba)
knn_auc = roc_auc_score(y_test, knn_pred_proba)
plt.plot(fpr_knn, tpr_knn, label="K-Nearest Neighbors, AUC={:.5f}".format(knn_auc))
# --- Support Vector Machine ---
svm_pred_proba = model_svm.predict_proba(X_test)[:, 1]
fpr_svm, tpr_svm, _ = roc_curve(y_test, svm_pred_proba)
svm_auc = roc_auc_score(y_test, svm_pred_proba)
plt.plot(fpr_svm, tpr_svm, label="Support Vector Machine, AUC={:.5f}".format(svm_auc))

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=12)
plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=12)
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=12)
plt.legend(prop={'size': 9}, loc=4)  # loc=4 is 'lower right'
plt.show()

#https://www.imranabdullah.com/2019-06-01/Drawing-multiple-ROC-Curves-in-a-single-plot
#https://www.geeksforgeeks.org/multiclass-receiver-operating-characteristic-roc-in-scikit-learn/
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#load-and-prepare-data
