In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
import pandas as pd
import numpy as np
import time
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, auc
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay, roc_curve
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from itertools import cycle
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Experiment axes for training and testing. Every combination of
# window x model x feature set x label scheme is visited by the main loop;
# each axis is shuffled in place so the visiting order differs between runs.
windowList = ['02', '03', '08', '13', '18', '23']
modelMLList = ['dt', 'knn', 'rf', 'mlp', 'lstm']
featList = ['feat1', 'feat2', 'feat3']
labelList = ['multiclass', 'binary', 'atk_1', 'atk_2', 'atk_4', 'atk_8', 'atk_16']

experimentAxes = (windowList, modelMLList, featList, labelList)
# Shuffle all axes first, then report the resulting orders.
for axis in experimentAxes:
  np.random.shuffle(axis)
for axis in experimentAxes:
  print(axis)
# Experiment driver.
# For every (window, model, feature set, label scheme) combination that is not
# yet present in both results/performance.csv and results/time.csv, this loop:
#   1. loads (or builds and caches) the concatenated message table allMsg.csv,
#   2. selects the feature columns and rewrites/filters the 'attackerType' label,
#   3. trains the model (or reuses a saved one) and records the training time,
#   4. reloads the saved model, times prediction on the 20% test split,
#   5. prints a classification report and saves confusion-matrix, ROC and PR plots,
#   6. appends precision/recall/F1/accuracy to performance.csv.
# The code relies on os.chdir() plus relative file names, so the statement order
# around each chdir matters for where files are read and written.
for window in windowList:
  for modelML in modelMLList:
    # Directory layout on the mounted Drive for this window/model pair
    simPath = "/content/drive/MyDrive/dataset/veremi/veremiWithT2/simulationscsv"
    globecomPath = "/content/drive/MyDrive/globecom"
    dismissPath = globecomPath+"/dismiss"
    windowPath = dismissPath+"/"+window+"bsm"
    preprocPath = windowPath+"/preprocessing"
    allMsgPath = preprocPath+"/allmsg"
    modelsPath = windowPath+"/models"
    modelPath = modelsPath+"/"+modelML
    resultsPath = dismissPath+"/results"
    paths = [simPath, globecomPath, dismissPath, windowPath, preprocPath, allMsgPath, modelsPath, modelPath, resultsPath]
    for p in paths:
      if not os.path.exists(p):
        # makedirs (unlike mkdir) also creates any missing parent directory
        os.makedirs(p)
    for feat in featList:
      for label in labelList:
        # Row key identifying this configuration in performance.csv / time.csv
        name = window + modelML + label + feat
        # Load the performance table, or start an empty one
        if os.path.exists(resultsPath+'/performance.csv'):
          os.chdir(resultsPath)
          performance = pd.read_csv('performance.csv', index_col=0)
        else:
          dataPerformance = {}
          performance = pd.DataFrame.from_dict(dataPerformance, orient='index', columns=['precision', 'recall', 'f1score', 'accuracy'])

        # Load the training/testing time table, or start an empty one
        if os.path.exists(resultsPath+'/time.csv'):
          os.chdir(resultsPath)
          timeRecord = pd.read_csv('time.csv', index_col=0)
        else:
          dataTime = {}
          timeRecord = pd.DataFrame.from_dict(dataTime, orient='index', columns=['training', 'testing'])

        # Only process configurations missing from either results table
        if (name not in performance.index.values) or (name not in timeRecord.index.values):
          display(performance)
          print("-"*70)
          print(f'Processing {name}...')

          # Load the data: reuse the cached concatenation when it exists,
          # otherwise concatenate every file in the preprocessing directory
          # and cache the result as allMsg.csv.
          if os.path.exists(allMsgPath+"/allMsg.csv"):
            os.chdir(allMsgPath)
            sample = pd.read_csv('allMsg.csv', index_col=0)
          else:
            os.chdir(preprocPath)
            simulations = pd.Series([f for f in os.listdir() if os.path.isfile(f)]).sort_values().reset_index(drop=True)[:]
            dflist = []
            for idx, item in enumerate(simulations):
              print('\r', item, end='')
              df = pd.read_csv(item, index_col=0, header=0)
              dflist.append(df)
            sample = pd.concat(dflist).reset_index(drop=True)
            os.chdir(allMsgPath)
            sample.to_csv('allMsg.csv')
            del dflist, df

          # Select feature columns by substring match on the column name.
          # feat1: every RSSI/distance column; feat2: conformity columns whose
          # name has no '0'; feat3: RSSI/distance/conformity columns whose name
          # has no '0'. The label column is appended last.
          columns = []
          for column in sample.columns.values:
            if feat == 'feat1':
              if 'RSSI' in column:
                columns.append(column)
              elif 'distance' in column:
                columns.append(column)
            elif feat == 'feat2':
              if 'conformity' in column and '0' not in column:
                columns.append(column)
            elif feat == 'feat3':
              if 'RSSI' in column and '0' not in column:
                columns.append(column)
              elif 'distance' in column and '0' not in column:
                columns.append(column)
              elif 'conformity' in column and '0' not in column:
                columns.append(column)
          columns.append('attackerType')

          if label == 'multiclass':
            sample = sample[columns]
          elif label == 'binary':
            posLabel = 1
            # Explicit copy + single .loc call: chained assignment
            # (df[col].loc[mask] = v) may write to a temporary and is a
            # no-op under pandas copy-on-write.
            sample = sample[columns].copy()
            sample.loc[sample['attackerType'] != 0, 'attackerType'] = posLabel
          else:
            # 'atk_N': keep only rows labelled 0 or N; N is the positive class
            posLabel = int(label.split("_")[1])
            sample = sample[columns]
            sample = sample.loc[(sample['attackerType'] == 0) | (sample['attackerType'] == posLabel)]
          # Split features (all but last column) from labels (last column)
          data = sample.iloc[:, 0:-1].values
          classes = sample.iloc[:, -1].values
          # Label binarizer; also used later to map predictions back to labels
          lb = preprocessing.LabelBinarizer()
          lb.fit(classes)
          if modelML in ['mlp', 'lstm'] and label == 'multiclass':
            classes = lb.transform(classes)
          elif modelML in ['mlp', 'lstm']:
            # Two-class case: lb.transform yields a single 0/1 column, which
            # MultiLabelBinarizer expands to two one-hot columns for softmax
            classes = lb.transform(classes)
            classes = MultiLabelBinarizer().fit_transform(classes)
          data_train, data_test, classes_train, classes_test = train_test_split(data, classes, train_size=0.8, test_size=0.2, random_state=1)
          if not os.path.exists(modelPath+"/"+label):
            os.makedirs(modelPath+"/"+label)
          clf = None
          if modelML == 'dt':
            clf = DecisionTreeClassifier()
          elif modelML == 'knn':
            clf = KNeighborsClassifier()
          elif modelML == 'rf':
            clf = RandomForestClassifier()
          elif modelML == 'mlp':
            # Neural-network (MLP) model: two hidden layers of 7 ReLU units
            layer1 = keras.layers.Input(shape=(data_train.shape[1],))
            layer2 = keras.layers.Dense(7, activation="relu")(layer1)
            layer3 = keras.layers.Dense(7, activation="relu")(layer2)
            output = keras.layers.Dense(classes_train.shape[1], activation="softmax")(layer3)
          elif modelML == 'lstm':
            # Number of features per time step for each feature set
            if feat == 'feat1':
              serie = 2
            elif feat == 'feat2':
              serie = 1
            elif feat == 'feat3':
              serie = 3
            # Reshape flat rows to (samples, steps, serie).
            # NOTE(review): assumes each consecutive group of `serie` columns
            # belongs to one time step - confirm against the column order.
            data_train=data_train.reshape(data_train.shape[0], int(data_train.shape[1]/serie), serie)
            data_test=data_test.reshape(data_test.shape[0], int(data_test.shape[1]/serie), serie)
            lstmunits = 32
            layer1 = keras.layers.Input(shape=(data_train.shape[1], data_train.shape[2]))
            layer2 = keras.layers.LSTM(lstmunits, return_sequences=True)(layer1)
            layer3 = keras.layers.LSTM(lstmunits)(layer2)
            output = keras.layers.Dense(classes_train.shape[1], activation="softmax")(layer3)

          if modelML in ['mlp', 'lstm']:
            clf = keras.Model(inputs=layer1, outputs=output, name=name)
            clf.compile(
                  loss=keras.losses.CategoricalCrossentropy(),
                  optimizer=keras.optimizers.Adam(),
                  # CategoricalAccuracy compares argmax of the softmax output
                  # with the one-hot target; plain Accuracy would test the raw
                  # probabilities for exact equality with 0/1.
                  metrics=[keras.metrics.CategoricalAccuracy(),
                          keras.metrics.Recall(class_id=1)],
              )
            early_stopping = keras.callbacks.EarlyStopping(
                monitor="loss",
                patience=3,
                min_delta=1e-4,
                restore_best_weights=True
            )
            # Train only when the saved model or its timing row is missing
            # (a single pass; a loop here could retrain forever if the saved
            # file never appears under the expected name)
            if (not os.path.exists(modelPath+"/"+label+"/"+feat+'/saved_model.pb')) or (name not in timeRecord.index.values):
              print('training', name)
              start_time = time.time()
              clf.fit(
                  data_train,
                  classes_train,
                  epochs=200,
                  batch_size=1000,
                  callbacks=[early_stopping]
              )
              trainingTime = time.time() - start_time
              clf.save(modelPath+"/"+label+"/"+feat)

              # Record training time; testing time is filled in after prediction
              dataTime = {}
              dataTime[name] = [trainingTime, np.nan]
              dfTime = pd.DataFrame.from_dict(dataTime, orient='index', columns=['training', 'testing'])
              timeRecord.loc[name] = dfTime.loc[name]
              os.chdir(resultsPath)
              timeRecord.to_csv('time.csv')

          elif modelML in ['dt', 'knn', 'rf', 'svm']:
            # Train only when the pickled model or its timing row is missing
            if (not os.path.exists(modelPath+"/"+label+"/"+name+'fit.pkl')) or (name not in timeRecord.index.values):
              print('training', name)
              start_time = time.time()
              clf.fit(data_train, classes_train)
              trainingTime = time.time() - start_time
              os.chdir(modelPath+"/"+label)
              joblib.dump(clf, name+'fit.pkl')

              # Record training time; testing time is filled in after prediction
              dataTime = {}
              dataTime[name] = [trainingTime, np.nan]
              dfTime = pd.DataFrame.from_dict(dataTime, orient='index', columns=['training', 'testing'])
              timeRecord.loc[name] = dfTime.loc[name]
              os.chdir(resultsPath)
              timeRecord.to_csv('time.csv')

          os.chdir(resultsPath)
          timeRecord = pd.read_csv('time.csv', index_col=0)
          print('Training '+name+' done in', "{:.2f}".format(timeRecord['training'].loc[name])+' sec')

          # Test: reload the persisted model and time prediction on the test split
          os.chdir(modelPath+"/"+label)
          clf = None
          if modelML in ['dt', 'knn', 'rf', 'svm']:
            clf = joblib.load(name+'fit.pkl' , mmap_mode ='r')
            start_time = time.time()
            proba = clf.predict_proba(data_test)
            predictTime = time.time() - start_time
            clTest = classes_test
          elif modelML in ['mlp', 'lstm']:
            clf = keras.models.load_model(modelPath+"/"+label+"/"+feat)
            start_time = time.time()
            proba = clf.predict(data_test)
            predictTime = time.time() - start_time
            # Keras targets are one-hot; map them back to the original labels
            clTest = lb.inverse_transform(classes_test)

          # Log time of testing (single .loc call instead of chained assignment)
          timeRecord.loc[name, 'testing'] = predictTime
          os.chdir(resultsPath)
          timeRecord.to_csv('time.csv')
          print('Predict '+name+' done in ', "{:.2f}".format(predictTime)+' sec')
          print("-"*70)

          if label == 'multiclass':
            # Predicted class = label with the highest score
            pred = lb.inverse_transform(proba)
          else:
            # Best threshold: the one maximizing F1 on the PR curve
            precision, recall, thresholds = precision_recall_curve(clTest, proba[:, 1], pos_label=posLabel)
            # Convert to F-score; 0/0 points become NaN and are then zeroed
            np.seterr(divide='ignore', invalid='ignore')
            fscore = (2 * precision * recall) / (precision + recall)
            np.nan_to_num(fscore, copy=False)
            # Locate the index of the largest F-score
            ix = np.argmax(fscore)
            print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
            print("-"*70)
            # Apply the chosen threshold to the positive-class score
            pred = np.where(np.array(proba[:, 1]) >= thresholds[ix], posLabel, 0)
          classlist = []
          for cl in lb.classes_:
            classlist.append('class '+str(int(cl)))
          print('Classification Report for '+name)
          print(classification_report(clTest,
                                      pred,
                                      target_names=classlist,
                                      digits=3,
                                      zero_division=0))
          print("-"*70)
          # Confusion matrix (saved relative to the current directory,
          # which is resultsPath at this point)
          cm = confusion_matrix(clTest, pred, labels=lb.classes_)
          disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=lb.classes_)
          disp.plot()
          plt.title(name)
          plt.savefig(name+'.pdf')
          plt.show()
          print("-"*70)

          if label == 'multiclass':
            # ROC CURVES
            # One-vs-rest targets: binarize for sklearn models; Keras targets
            # are already one-hot
            if modelML in ['dt', 'knn', 'rf', 'svm']:
              lbclasses_test = lb.transform(classes_test)
            elif modelML in ['mlp', 'lstm']:
              lbclasses_test = classes_test
            n_classes = lbclasses_test.shape[1]
            # Compute ROC curve and ROC area for each class
            fpr = dict()
            tpr = dict()
            roc_auc = dict()
            for i in range(n_classes):
                fpr[i], tpr[i], _ = roc_curve(lbclasses_test[:, i], proba[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])

            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in range(n_classes):
                mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr
            roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

            # Plot all ROC curves
            lw = 2
            plt.figure()
            plt.plot(
                fpr["macro"],
                tpr["macro"],
                label="Macro Avg (area = {0:0.2f})".format(roc_auc["macro"]),
                color="navy",
                linestyle=":",
                linewidth=4,
                alpha=0.5,
            )

            colors = cycle(["b", "g", "r", "c", "m", "y"])
            for i, color in zip(range(n_classes), colors):
                labelClasses = int(lb.classes_[i])

                plt.plot(
                    fpr[i],
                    tpr[i],
                    color=color,
                    lw=lw,
                    label="Class {0} (area = {1:0.2f})".format(labelClasses, roc_auc[i]),
                    alpha=0.5
                )

            # Chance diagonal; linestyle is given by keyword so it does not
            # clash with the explicit grey color
            plt.plot([0, 1], [0, 1], linestyle="--", lw=lw, color="grey", alpha=0.2)
            plt.xlim([-0.02, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.legend(loc="lower right")
            plt.title('ROCcurve '+name)
            os.chdir(modelPath+"/"+label)
            plt.savefig('ROCcurve '+name+'.pdf')
            plt.show()
            print("-"*70)


            # PR CURVES
            # Compute PR curve and PR area for each class
            precision = dict()
            recall = dict()
            pr_auc = dict()
            for i in range(n_classes):
                precision[i], recall[i], _ = precision_recall_curve(lbclasses_test[:, i], proba[:, i])
                pr_auc[i] = auc(recall[i], precision[i])

            # Macro average: use recall as the common grid. Recall is monotonic
            # along each curve whereas precision is not, and np.interp requires
            # increasing x-coordinates.
            all_recall = np.unique(np.concatenate([recall[i] for i in range(n_classes)]))

            # Interpolate each class's precision onto the recall grid; the
            # per-class arrays are reversed so that recall is ascending
            mean_precision = np.zeros_like(all_recall)
            for i in range(n_classes):
                mean_precision += np.interp(all_recall, recall[i][::-1], precision[i][::-1])

            # Finally average it and compute AUC
            mean_precision /= n_classes

            precision["macro"] = mean_precision
            recall["macro"] = all_recall
            pr_auc["macro"] = auc(recall["macro"], precision["macro"])

            # Plot all PR curves (x = precision, y = recall)
            lw = 2
            plt.figure()
            plt.plot(
                precision["macro"],
                recall["macro"],
                label="Macro Avg (area = {0:0.2f})".format(pr_auc["macro"]),
                color="navy",
                linestyle=":",
                linewidth=4,
                alpha=0.5,
            )

            colors = cycle(["b", "g", "r", "c", "m", "y"])
            for i, color in zip(range(n_classes), colors):
                labelClasses = int(lb.classes_[i])

                plt.plot(
                    precision[i],
                    recall[i],
                    color=color,
                    lw=lw,
                    label="Class {0} (area = {1:0.2f})".format(labelClasses, pr_auc[i]),
                    alpha=0.5
                )

            plt.xlim([0.15, 1.02])
            plt.ylim([-0.01, 1.05])
            # Axis labels match what is plotted (precision on x, recall on y)
            plt.xlabel("Precision")
            plt.ylabel("Recall")
            plt.legend(loc="lower left")
            plt.title('PRcurve '+name)
            os.chdir(modelPath+"/"+label)
            plt.savefig('PRcurve '+name+'.pdf')
            plt.show()
            print("-"*70)

          else:
            # Precision-Recall curve
            PrecisionRecallDisplay.from_predictions(clTest, proba[:, 1], pos_label=posLabel)
            plt.title('PR curve '+name)
            # No-skill baseline = share of positive samples; the positive class
            # is posLabel (1 for 'binary', N for 'atk_N')
            no_skill = len(clTest[clTest==posLabel]) / len(clTest)
            plt.plot([0,1], [no_skill,no_skill], linestyle='--', color="grey", label='No Skill')
            plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best threshold')
            plt.legend()
            plt.savefig('PRcurve '+name+'.pdf')
            plt.show()
            print("-"*70)
            # ROC curve
            RocCurveDisplay.from_predictions(clTest, proba[:, 1], pos_label=posLabel)
            plt.title('ROC curve '+name)
            plt.plot([0, 1], [0, 1], color="grey", lw=1, linestyle="--")
            plt.savefig('ROCcurve '+name+'.pdf')
            plt.show()
            print("-"*70)

          # Save the results to a csv: macro averages for multiclass,
          # positive-class scores otherwise
          if label == 'multiclass':
            prScore = precision_score(clTest, pred, average='macro', zero_division=0)
            rcScore = recall_score(clTest, pred, average='macro', zero_division=0)
            f1Score = f1_score(clTest, pred, average='macro', zero_division=0)
            accScore = accuracy_score(clTest, pred)
          else:
            prScore = precision_score(clTest, pred, pos_label=posLabel, zero_division=0)
            rcScore = recall_score(clTest, pred, pos_label=posLabel, zero_division=0)
            f1Score = f1_score(clTest, pred, pos_label=posLabel, zero_division=0)
            accScore = accuracy_score(clTest, pred)
          dataPerformance = {}
          dataPerformance[name] = [prScore, rcScore, f1Score, accScore]

          dfPerformance = pd.DataFrame.from_dict(dataPerformance, orient='index', columns=['precision', 'recall', 'f1score', 'accuracy'])

          # Re-read the table from disk before updating it, then write it back
          os.chdir(resultsPath)
          try:
            performance = pd.read_csv('performance.csv', index_col=0)
          except FileNotFoundError:
            dfPerformance.to_csv('performance.csv')
            performance = dfPerformance
          else:
            try:
              performance.loc[name] = dfPerformance.loc[name]
            except KeyError:
              performance = pd.concat([performance, dfPerformance])
            performance.to_csv('performance.csv')

# Final summary: show the performance table as it stands after the last
# processed configuration, followed by a separator rule.
summaryRule = "-" * 70
print('performance.csv')
display(performance)
print(summaryRule)