# Notebook for the experiment of building **DeCaf** (**De**sign **C**l**a**ssi**f**ier)

## Architectural Overview/Design
In Progress

## Objective
The main objective is to validate the trained models against several test datasets.

In [17]:
# Mount Google Drive at /content/drive; every model, data and result path used
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# All the imports
import pandas as pd
from sklearn import metrics
import numpy as np
import multiprocessing as mp
from joblib import load
from datetime import datetime as dt
from prettytable import PrettyTable
import os

In [0]:
# Control Parameters
# Each flag forces a fresh validation run for its section. While a flag is
# False and that section's results file already exists, the saved results are
# printed instead of re-running the models.
dm_test_validation = False  # gates the test-data validation cell
dm_cross_validation = False  # gates the cross-dataset validation cell

In [0]:
def log(log_file_path, message):
  """Append one 'info' row (timestamp, type, message) to a CSV log file.

  The header row is written only when the file does not exist yet; later calls
  append a single data row without a header.
  """
  row = pd.DataFrame(
      [[str(dt.now()), 'info', message]],
      columns=['timestamp', 'type', 'message'],
  )
  already_there = os.path.exists(log_file_path)
  row.to_csv(
      log_file_path,
      mode='a' if already_there else 'w',
      header=not already_there,
  )

def log_result(log_file_path, message):
  """Append the current timestamp immediately followed by ``message`` to a file.

  No separator or newline is inserted between the timestamp and the message,
  or after the message; callers that want one must include it in ``message``.

  Args:
    log_file_path: Path of the text file to append to (created if missing).
    message: Text written right after the timestamp.
  """
  # The context manager guarantees the handle is flushed and closed; the
  # previous `f.close` only referenced the method without calling it.
  with open(log_file_path, "a") as f:
    f.write(str(dt.now()))
    f.write(message)

In [0]:
def auc_score(X, Y, model, process_number, auc):
  """Compute the ROC AUC of ``model`` on (X, Y) and store it in ``auc``.

  Args:
    X: Input passed to ``model.predict``.
    Y: True labels passed to ``metrics.roc_auc_score``.
    model: Object exposing ``predict``.
    process_number: Key under which the score is stored.
    auc: Mapping that receives the score; nothing is returned.
  """
  # NOTE(review): the score is computed from `predict` output rather than from
  # probabilities / decision values -- confirm this is intended.
  auc[process_number] = metrics.roc_auc_score(Y, model.predict(X))

## Load the data mining models and calculate the **ROC AUC** score

In [0]:
  # Saved classifiers, one joblib file per model. The order matters: entry k of
  # `model_paths` is reported under entry k of `classifier_names` when the
  # validation function enumerates the paths, so keep both lists aligned.
  model_paths = [
      "/content/drive/My Drive/documents/projects/DeCaf/models/knn.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/dt.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/rf.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/lr.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/gnb.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/nn.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/ab.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/qda.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/lsvm.joblib",
      "/content/drive/My Drive/documents/projects/DeCaf/models/rbf_svm.joblib"
  ]
  # Display names used as the result table's column headers (after 'Dataset').
  classifier_names = [
      "Nearest Neighbors",
      "Decision Tree",
      "Random Forest",
      "Logistic Regression",
      "Gaussian Naive Bayes", 
      "Neural Net", 
      "AdaBoost",
      "QDA",    
      "Linear SVM", 
      "RBF SVM",
  ]

In [0]:
def validate_dm_models(test_data_names, test_data, test_labels, logs):
  """Score every saved model on every dataset and tabulate the ROC AUC values.

  For each dataset, every model listed in ``model_paths`` is loaded and scored
  by ``auc_score`` in its own process; the scores, rounded to 4 decimals, form
  one row of the table.

  Args:
    test_data_names: Row labels, one per entry of ``test_data``.
    test_data: Sequence of inputs; each is passed to ``model.predict``.
    test_labels: Sequence of true labels, index-aligned with ``test_data``.
    logs: List mutated in place: a "Validated at" stamp and the rendered table
      are appended to it.

  Returns:
    None. The table is printed and also appended to ``logs``.
  """
  table = PrettyTable()
  table.field_names = ['Dataset'] + classifier_names
  logs.append("Validated at: " + str(dt.now()))

  # One manager serves every dataset and is shut down when the block exits.
  # Creating a manager per dataset without shutting it down left one server
  # process behind for each dataset.
  with mp.Manager() as manager:
    for i, data in enumerate(test_data):
      print(str(dt.now()) + " Creating process for: ", test_data_names[i])
      auc = manager.dict()
      jobs = []
      for index, model_path in enumerate(model_paths):
        model = load(model_path)
        print(str(dt.now()) + " Validating from: ", classifier_names[index])
        p = mp.Process(target=auc_score,
                       args=(data, test_labels[i], model, index, auc))
        jobs.append(p)
        p.start()

      for proc in jobs:
        proc.join()

      # Copy the results out of the proxy once. A child that died before
      # storing its score (e.g. the metric raised) leaves no entry; report NaN
      # for it instead of failing the whole run with a KeyError.
      scores = dict(auc)
      aucs = []
      for index, name in enumerate(classifier_names):
        if index in scores:
          aucs.append(round(scores[index], 4))
        else:
          print(str(dt.now()) + " No AUC returned for: ", name)
          aucs.append(float('nan'))

      table.add_row([test_data_names[i]] + aucs)

  print(table)
  logs.append(table.get_string())

## Validate the models
- We use the Area Under the Receiver Operating Characteristic Curve (**ROC AUC**), computed from each model's predictions, as the validation criterion.

### Load test data

In [0]:
# Test split used by the test-data validation cell below: X_T is passed as the
# model input and Y_T as the matching true labels.
X_T = np.load("/content/drive/My Drive/documents/projects/DeCaf/data/test_data/X_T.npy")
Y_T = np.load("/content/drive/My Drive/documents/projects/DeCaf/data/test_data/Y_T.npy")

### Validate the trained models

In [25]:
# File in which the test-data validation results are kept between runs.
result_path = "/content/drive/My Drive/documents/projects/DeCaf/results/test_data.txt"

# Validate when nothing has been saved yet or when a re-run is forced through
# dm_test_validation; otherwise just show the saved results.
if not os.path.exists(result_path) or dm_test_validation:
  logs = []
  test_data_names = ["SO test data"]
  test_data = [X_T]
  test_labels = [Y_T]
  validate_dm_models(test_data_names, test_data, test_labels, logs)
  # Append, so earlier runs stay in the file.
  with open(result_path, "a+") as f:
    print(*logs, sep="\n", file=f)
else:
  with open(result_path, "r") as f:
    print(f.read())

2020-04-07 03:13:30.517857 Creating process for: SO test data
2020-04-07 03:13:35.993498 Validating from: Nearest Neighbors
2020-04-07 03:13:36.050743 Validating from: Decision Tree
2020-04-07 03:13:38.297640 Validating from: Random Forest
2020-04-07 03:13:38.702103 Validating from: Logistic Regression
2020-04-07 03:13:38.728904 Validating from: Gaussian Naive Bayes
2020-04-07 03:13:39.422491 Validating from: Neural Net
2020-04-07 03:13:40.053497 Validating from: AdaBoost
2020-04-07 03:13:40.826526 Validating from: QDA
2020-04-07 03:13:45.228753 Validating from: Linear SVM
2020-04-07 03:13:48.762703 Validating from: RBF SVM
+--------------+-------------------+---------------+---------------+---------------------+----------------------+------------+----------+--------+------------+---------+
|   Dataset    | Nearest Neighbors | Decision Tree | Random Forest | Logistic Regression | Gaussian Naive Bayes | Neural Net | AdaBoost |  QDA   | Linear SVM | RBF SVM |
+--------------+------------

## Cross dataset validation of the models
We validate the models on the following datasets:
- Brunet 2014 (brunet2014.csv)
- Shakiba 2016 (shakiba2016.csv)
- Viviani 2018 (viviani2018.csv)
- Self Admitted Technical Debt/ SATD (satd.csv)
- Stack Overflow (so.csv) — not loaded yet; its loading cell below is still commented out

In [0]:
# Row labels for the cross-dataset results table. The order must match the
# order of the X_* / Y_* arrays passed to validate_dm_models in the
# cross-dataset validation cell.
cross_dataset_names = ["Brunet 2014", 
                       "Shakiba 2016", "Viviani 2018",
                       "SATD"]

### Load the data vectors

In [27]:
# Brunet 2014: inputs and labels, followed by a shape check of both arrays.
X_brunet2014, Y_brunet2014 = (
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/X_brunet2014.npy"),
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/Y_brunet2014.npy"),
)
print(X_brunet2014.shape, Y_brunet2014.shape, sep="\n")

(159, 200)
(159,)


In [28]:
# Shakiba 2016: inputs and labels, followed by a shape check of both arrays.
X_shakiba2016, Y_shakiba2016 = (
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/X_shakiba2016.npy"),
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/Y_shakiba2016.npy"),
)
print(X_shakiba2016.shape, Y_shakiba2016.shape, sep="\n")

(67, 200)
(67,)


In [29]:
# Viviani 2018: inputs and labels, followed by a shape check of both arrays.
X_viviani2018, Y_viviani2018 = (
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/X_viviani2018.npy"),
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/Y_viviani2018.npy"),
)
print(X_viviani2018.shape, Y_viviani2018.shape, sep="\n")

(1969, 200)
(1969,)


In [30]:
# SATD: inputs and labels, followed by a shape check of both arrays.
X_satd, Y_satd = (
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/X_satd.npy"),
    np.load("/content/drive/My Drive/documents/projects/DeCaf/data/cross_data/so_vocab_injected/Y_satd.npy"),
)
print(X_satd.shape, Y_satd.shape, sep="\n")

(2609, 200)
(2609,)


In [0]:
# X_so = 
# Y_so = 
# print(X_so.shape)
# print(Y_so.shape)

### Validate with the trained models

In [32]:
# File in which the cross-dataset validation results are kept between runs.
result_path = "/content/drive/My Drive/documents/projects/DeCaf/results/cross_data.txt"

# Show the saved results unless a re-run is forced through dm_cross_validation
# or nothing has been saved yet.
if os.path.exists(result_path) and not dm_cross_validation:
  with open(result_path, "r") as f:
    print(f.read())
else:
  logs = []
  # Previously this was a stale copy of the test-data cell's single label
  # ("SO test data") that was never used; bind it to the labels that actually
  # describe the arrays below.
  test_data_names = cross_dataset_names
  test_data = [X_brunet2014, 
             X_shakiba2016, X_viviani2018, X_satd]
  test_labels = [Y_brunet2014, 
                Y_shakiba2016, Y_viviani2018, Y_satd]
  # The three lists are consumed by index, so they must line up.
  assert len(test_data_names) == len(test_data) == len(test_labels)
  validate_dm_models(test_data_names, test_data, test_labels, logs)
  # Append, so earlier runs stay in the file.
  with open(result_path, "a+") as f:
    f.write("\n".join(logs) + "\n")


2020-04-07 17:56:56.396232 Creating process for: Brunet 2014
2020-04-07 17:57:05.542378 Validating from: Nearest Neighbors
2020-04-07 17:57:06.291415 Validating from: Decision Tree
2020-04-07 17:57:10.585382 Validating from: Random Forest
2020-04-07 17:57:11.413486 Validating from: Logistic Regression
2020-04-07 17:57:12.264672 Validating from: Gaussian Naive Bayes
2020-04-07 17:57:13.153127 Validating from: Neural Net
2020-04-07 17:57:13.843486 Validating from: AdaBoost
2020-04-07 17:57:14.550347 Validating from: QDA
2020-04-07 17:57:18.726873 Validating from: Linear SVM
2020-04-07 17:57:22.198352 Validating from: RBF SVM
2020-04-07 17:57:28.306336 Creating process for: Shakiba 2016
2020-04-07 17:57:29.943071 Validating from: Nearest Neighbors
2020-04-07 17:57:29.990966 Validating from: Decision Tree
2020-04-07 17:57:32.028523 Validating from: Random Forest
2020-04-07 17:57:32.505713 Validating from: Logistic Regression
2020-04-07 17:57:32.533431 Validating from: Gaussian Naive Bayes
