In [1]:
import os
import sys
import time
import datetime
import socket
import yaml
import shutil

import pandas as pd
import numpy as np

from sklearn.externals import joblib 

from sklearn.metrics import accuracy_score, log_loss, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
folderFeature = 'feature'
feaPaths = ['../xprmt/'+folderFeature+'/extract-haralick-DataPH2_lesion_hairremove_augmented-mlbox-20181117-145539/DataPH2_lesion_hairremove_augmented',
           '../xprmt/'+folderFeature+'/extract-slbp-DataPH2_lesion_hairremove_augmented-mlbox-20181117-144735/DataPH2_lesion_hairremove_augmented',
           '../xprmt/'+folderFeature+'/extract-wtio-DataPH2_lesion_hairremove_augmented-mlbox-20181119-082251/DataPH2_lesion_hairremove_augmented',
          ]

featureList = []
labelList = []
for feaPath in feaPaths:
    print(feaPath)
    xListL = []
    yListL = []
    yLi = os.listdir(feaPath)
    sumdat = 0
    for i in yLi:
        file = os.listdir(feaPath+ "/" + i)
        cnt = 0
        for j in file:
            if j.endswith('.pkl'):
                #print(j)
                x = joblib.load(feaPath + "/" + i + "/" +j)
                xListL.append(x)
                yListL.append(i)
                cnt += 1
        print(str(i) + " " + str(cnt))
        sumdat += cnt
    
    print("semuanya "+ str(sumdat))
    featureList.append(xListL)
    labelList.append(yListL)

xList = np.concatenate((featureList[0], featureList[1]), axis=1)
for idx in range(len(featureList)-2):
    xList = np.concatenate((xList, featureList[idx+2]), axis=1)

xList1 = xList
yList1 = labelList[0]

../xprmt/feature/extract-haralick-DataPH2_lesion_hairremove_augmented-mlbox-20181117-145539/DataPH2_lesion_hairremove_augmented
Melanoma 80
Atypical Nevus 80
Common Nevus 80
semuanya 240
../xprmt/feature/extract-slbp-DataPH2_lesion_hairremove_augmented-mlbox-20181117-144735/DataPH2_lesion_hairremove_augmented
Melanoma 80
Atypical Nevus 80
Common Nevus 80
semuanya 240
../xprmt/feature/extract-wtio-DataPH2_lesion_hairremove_augmented-mlbox-20181119-082251/DataPH2_lesion_hairremove_augmented
Melanoma 80
Atypical Nevus 80
Common Nevus 80
semuanya 240


In [3]:
xList = np.array(xList1)
yList = np.array(yList1)
print(xList.shape)
print(yList.shape)

(240, 2079)
(240,)


In [4]:
from sklearn.preprocessing import normalize
xList = normalize(xList, norm='l2', axis=0)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(xList, yList, random_state = 42, test_size = 0.2)

In [6]:
print(X_train.shape)

(192, 2079)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.metrics import sensitivity_specificity_support
clf = RandomForestClassifier(random_state=6)
clf.fit(X_train, y_train)
prediksi = clf.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, prediksi))
print(classification_report(y_test, prediksi))
print(sensitivity_specificity_support(y_test, prediksi, average='macro'))

0.7083333333333334
                precision    recall  f1-score   support

Atypical Nevus       0.52      0.80      0.63        15
  Common Nevus       0.73      0.47      0.57        17
      Melanoma       1.00      0.88      0.93        16

     micro avg       0.71      0.71      0.71        48
     macro avg       0.75      0.72      0.71        48
  weighted avg       0.75      0.71      0.71        48

(0.7151960784313726, 0.8566308243727598, None)


In [8]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
from sklearn.model_selection import cross_val_score
score = cross_val_score(clf, xList, yList, cv=10)
print(score)
print(score.mean())

[0.79166667 0.75       0.66666667 0.66666667 0.70833333 0.70833333
 0.75       0.625      0.58333333 0.66666667]
0.6916666666666667


In [9]:
clf1 = RandomForestClassifier(random_state=1)
clf2 = GradientBoostingClassifier()
clf3 = LinearDiscriminantAnalysis()
eclf1 = VotingClassifier(estimators=[('rf', clf1), ('gb', clf2), ('lda', clf3)], voting='hard')
score = cross_val_score(clf, xList, yList, cv=10)
print(score)
print(score.mean())

[0.75       0.70833333 0.54166667 0.66666667 0.66666667 0.66666667
 0.79166667 0.54166667 0.66666667 0.79166667]
0.6791666666666668


In [10]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", probability=True),
    #NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(random_state=1),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
    ######################### perform 10 fold validation ######################
    kf = StratifiedKFold(n_splits=10)
    scorelist = []
    sensitivity = []
    specificity = []
    f1 = []
    for train_index, test_index in kf.split(xList, yList):
        clf.fit(xList[train_index], yList[train_index])
        p = clf.predict(xList[test_index])
        accval = accuracy_score(yList[test_index], p)
        scorelist.append(accval)
        sss = sensitivity_specificity_support(yList[test_index], p, average='macro')
        sensitivity.append(sss[0])
        specificity.append(sss[1])
        f1.append(f1_score(yList[test_index], p, average='macro') )

        
    print("MeanCVScore: {}".format(sum(scorelist)/len(scorelist)))
    print("10FoldCVScore: {}".format(scorelist))
    print("sensitivity: {}".format(sum(sensitivity)/len(sensitivity)))
    print("specificity: {}".format(sum(specificity)/len(specificity)))
    print("f1-score: {}".format(sum(f1)/len(f1)))

    #############################################################################
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 70.8333%
Log Loss: 7.456073234707138
MeanCVScore: 0.6458333333333333
10FoldCVScore: [0.7083333333333334, 0.6666666666666666, 0.6666666666666666, 0.5416666666666666, 0.75, 0.625, 0.6666666666666666, 0.625, 0.625, 0.5833333333333334]
sensitivity: 0.6458333333333333
specificity: 0.8229166666666666
f1-score: 0.6461202777843644
SVC
****Results****
Accuracy: 72.9167%
Log Loss: 0.5683267614839965
MeanCVScore: 0.7083333333333333
10FoldCVScore: [0.7083333333333334, 0.625, 0.75, 0.75, 0.7916666666666666, 0.75, 0.7083333333333334, 0.625, 0.6666666666666666, 0.7083333333333334]
sensitivity: 0.7083333333333333
specificity: 0.8541666666666666
f1-score: 0.7086016511867905
DecisionTreeClassifier
****Results****
Accuracy: 52.0833%
Log Loss: 16.549830355894702
MeanCVScore: 0.6875000000000001
10FoldCVScore: [0.6666666666666666, 0.5, 0.7083333333333334, 0.75, 0.7083333333333334, 0.625, 0.75, 0.75, 0.625, 0.7916666666666666]
sensitivity: 0.6875000000000001
spe

In [11]:
# clf1 = RandomForestClassifier(random_state=1)
# clf2 = GradientBoostingClassifier()
# clf3 = LinearDiscriminantAnalysis()
# eclf1 = VotingClassifier(estimators=[('rf', clf1), ('gb', clf2), ('lda', clf3)], voting='hard')
# score = cross_val_score(clf, xList, yList, cv=10)
# print(score)
# print(score.mean())

In [12]:
# cm1 = confusion_matrix(yList[test_index],p)
# print('Confusion Matrix : \n', cm1)

# total1=sum(sum(cm1))
# #####from confusion matrix calculate accuracy
# accuracy1=(cm1[0,0]+cm1[1,1])/total1
# print ('Accuracy : ', accuracy1)

# sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
# print('Sensitivity : ', sensitivity1 )

# specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
# print('Specificity : ', specificity1)

# print(classification_report(yList[test_index], p))
    
# score = cross_val_score(clf, xList, yList, cv=10)
# print("MeanCVScore: {}".format(score.mean()))
# print("10FoldCVScore: {}".format(score))

In [13]:
# feaPath = '../xprmt/feature/extract-slbp-melanoma_binary_augmented-mlbox-20181117-151136/melanoma_binary_augmented'

# xListL = []
# yListL = []
# yLi = os.listdir(feaPath)
# sumdat = 0
# for i in yLi:
#     file = os.listdir(feaPath+ "/" + i)
#     cnt = 0
#     for j in file:
#         if j.endswith('.pkl'):
#             #print(j)
#             x = joblib.load(feaPath + "/" + i + "/" +j)
#             xListL.append(x)
#             yListL.append(i)
#             cnt += 1
#     print(str(i) + " " + str(cnt))
#     sumdat += cnt
# print("semuanya "+ str(sumdat))
# feaPath = '../xprmt/feature/extract-haralick-melanoma_binary_augmented-mlbox-20181117-151626/melanoma_binary_augmented'

# xListH = []
# yListH = []
# yLi = os.listdir(feaPath)
# sumdat = 0
# for i in yLi:
#     file = os.listdir(feaPath+ "/" + i)
#     cnt = 0
#     for j in file:
#         if j.endswith('.pkl'):
#             #print(j)
#             x = joblib.load(feaPath + "/" + i + "/" +j)
#             xListH.append(x)
#             yListH.append(i)
#             cnt += 1
#     print(str(i) + " " + str(cnt))
#     sumdat += cnt
# print("semuanya "+ str(sumdat))
# feaPath = '../xprmt/feature/extract-wtio-melanoma_binary_augmented-mlbox-20181117-145744/melanoma_binary_augmented'

# xListW = []
# yListW = []
# yLi = os.listdir(feaPath)
# sumdat = 0
# for i in yLi:
#     file = os.listdir(feaPath+ "/" + i)
#     cnt = 0
#     for j in file:
#         if j.endswith('.pkl'):
#             #print(j)
#             x = joblib.load(feaPath + "/" + i + "/" +j)
#             xListW.append(x)
#             yListW.append(i)
#             cnt += 1
#     print(str(i) + " " + str(cnt))
#     sumdat += cnt
# print("semuanya "+ str(sumdat))
# xListL = np.array(xListL)
# xListH = np.array(xListH)
# xListW = np.array(xListW)
# xList1 = np.concatenate((xListL, xListH, xListW), axis=1)
# #xList = np.concatenate((xListL, xListW), axis=1)
# print(yListL == yListH)
# print(yListH == yListW)
# yList1 = yListL