# Notebook for the experiment of building **DeCaf** (**De**sign **C**l**a**ssi**f**ier)

## Architectural Overview/Design
In Progress

## Objective
The main objective is to validate the trained models against several test datasets.

In [11]:
# Mount Google Drive inside the Colab VM; every data, model, log and result path
# used by the cells below lives under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
from datetime import datetime as dt

import pandas as pd

def log(log_file_path, message):
  """Append one 'info' row to the CSV log at ``log_file_path``.

  The row has the columns ``timestamp`` (current local time as a string),
  ``type`` (always ``'info'``) and ``message``. The header line is written
  only when the file is created; later calls append a header-less row, so
  the file stays a single well-formed CSV table.

  Args:
    log_file_path: Path of the CSV log file. Missing parent directories are
      created so the first call of a run does not fail.
    message: Text stored in the ``message`` column.
  """
  entry = pd.DataFrame([[str(dt.now()), 'info', message]],
                       columns=['timestamp', 'type', 'message'])

  # to_csv refuses to write into a directory that does not exist yet.
  parent_dir = os.path.dirname(log_file_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  # Header only for a brand-new file; afterwards append rows without it.
  is_new_file = not os.path.exists(log_file_path)
  entry.to_csv(log_file_path, mode='w' if is_new_file else 'a', header=is_new_file)

def log_result(log_file_path, message):
  """Append a timestamped result block to the text file at ``log_file_path``.

  Each call appends the current timestamp on its own line, followed by
  ``message`` and a trailing newline, so multi-line content such as a
  rendered table keeps its layout and successive entries do not run together.

  Args:
    log_file_path: Path of the text file to append to (created if missing).
    message: String to record, e.g. the output of ``table.get_string()``.
  """
  # The context manager guarantees the handle is flushed and closed; the
  # previous `f.close` (without parentheses) never actually closed the file.
  with open(log_file_path, "a") as result_file:
    result_file.write(str(dt.now()) + "\n")
    result_file.write(message + "\n")

### Validate the models
- We use the Area Under the Receiver Operating Characteristic Curve (**ROC AUC**), computed from prediction scores, as the validation criterion.

In [0]:
from sklearn import metrics

def auc_score(X, Y, model, process_number=None, auc=None):
  """Compute the ROC AUC of ``model`` on ``(X, Y)``.

  The function supports both ways it is used in this notebook:

  * direct call, ``score = auc_score(X, Y, model)``: the score is returned;
  * worker process, ``auc_score(X, Y, model, index, shared_dict)``: the score
    is additionally stored as ``shared_dict[index]`` so the parent process
    can collect it.

  Args:
    X: Feature matrix passed to ``model.predict``.
    Y: Ground-truth labels passed to ``metrics.roc_auc_score``.
    model: Fitted estimator exposing ``predict``.
    process_number: Key under which the score is stored in ``auc``.
    auc: Optional mutable mapping (e.g. a ``multiprocessing`` manager dict)
      that receives the score; skipped when ``None``.

  Returns:
    The ROC AUC value computed by ``metrics.roc_auc_score``.
  """
  # NOTE(review): the score is computed from `predict` output; if that is hard
  # class labels rather than probabilities/decision values, the AUC is coarser
  # than one based on `predict_proba` / `decision_function` -- confirm intent.
  pred = model.predict(X)
  score = metrics.roc_auc_score(Y, pred)
  if auc is not None:
    auc[process_number] = score
  return score

### Load test data

In [0]:
import numpy as np

# Load the saved test arrays from Drive. Judging by the names and by how they are
# passed to `auc_score`, X_T holds the features and Y_T the labels -- TODO confirm.
X_T = np.load("/content/drive/My Drive/documents/projects/DeCaf/data/test_data/X_T.npy")
Y_T = np.load("/content/drive/My Drive/documents/projects/DeCaf/data/test_data/Y_T.npy")

#### Load the data mining models and calculate the **ROC AUC** score

In [0]:
  # Directory on the mounted Drive that holds the serialized classifiers.
  MODEL_DIR = "/content/drive/My Drive/documents/projects/DeCaf/models"

  # (display name, joblib file stem) for every trained classifier. Keeping both
  # in one table guarantees that model_paths[i] and classifier_names[i] always
  # refer to the same model.
  CLASSIFIERS = [
      ("Nearest Neighbors", "knn"),
      ("Decision Tree", "dt"),
      ("Random Forest", "rf"),
      ("Logistic Regression", "lr"),
      ("Gaussian Naive Bayes", "gnb"),
      ("Neural Net", "nn"),
      ("AdaBoost", "ab"),
      ("QDA", "qda"),
      ("Linear SVM", "lsvm"),
      ("RBF SVM", "rbf_svm"),
  ]

  # Parallel lists consumed by the validation cells below.
  model_paths = [MODEL_DIR + "/" + stem + ".joblib" for _, stem in CLASSIFIERS]
  classifier_names = [name for name, _ in CLASSIFIERS]

In [0]:
import multiprocessing as mp
from joblib import load
from datetime import datetime as dt

print("Available CPUs: ", mp.cpu_count())

test_data_names = ["SO test data"]
test_data = [X_T]
test_label = [Y_T]

# One manager process is enough for all datasets; each dataset gets its own
# shared result dict below.
manager = mp.Manager()

for i, data in enumerate(test_data):
  print(str(dt.now()) + " Creating process for: ", test_data_names[i])
  # Proxy dict filled by the workers: {model index -> ROC AUC}.
  auc = manager.dict()
  jobs = []
  for index, model_path in enumerate(model_paths):
    model = load(model_path)
    print(str(dt.now()) + " Validating from: ", classifier_names[index])
    p = mp.Process(target=auc_score, args=(data, test_label[i], model, index, auc))
    jobs.append(p)
    p.start()

  for index, proc in enumerate(jobs):
    proc.join()
    # A crashed worker leaves no entry in `auc`; report it instead of
    # silently dropping that classifier from the results.
    if proc.exitcode != 0:
      print("Worker failed for: ", classifier_names[index], " exit code: ", proc.exitcode)

  # Workers finish in arbitrary order, so read the scores back by model index
  # and print them next to the classifier they belong to.
  aucs = [auc.get(index) for index in range(len(model_paths))]
  for name, score in zip(classifier_names, aucs):
    print(name + ": ", score)
  print("------------------------------------------------------------------------------")


In [38]:
# Show the shared {model index -> ROC AUC} dict filled by the worker processes
# for the last dataset of the previous cell.
print(auc)

{1: 0.7265999999999999, 3: 0.8298333333333333, 2: 0.8236333333333334, 4: 0.7815666666666667, 6: 0.7962333333333333, 7: 0.8016333333333332, 5: 0.8437, 9: 0.8468333333333333, 0: 0.7922333333333333, 8: 0.8116666666666666}


In [0]:
import multiprocessing as mp

print("Available CPUs: ", mp.cpu_count())

# Sequential variant of the validation: load each model, time the ROC AUC
# computation on the test data, log the timings and persist one result table.
# The whole run is skipped when a result file already exists.
# NOTE(review): `result_path` and `PrettyTable` are not defined/imported in any
# visible cell -- they must be provided by an earlier cell before this one runs.
if not os.path.exists(result_path):

  # Fresh table for this run: one column per classifier, one row of scores.
  table = PrettyTable()
  table.field_names = classifier_names
  auc_scores = []
  for index, model_path in enumerate(model_paths):
    classifier_name = classifier_names[index]
    # Per-model buffer, so each CSV log row holds only this model's messages
    # instead of repeating everything logged for the previous models.
    logs = []

    start_loading_time = dt.now()
    print(str(start_loading_time) + " Loading model: ", classifier_name)
    logs.append(str(start_loading_time) + " Loading model: " + classifier_name)
    model = load(model_path)
    end_loading_time = dt.now()
    print(str(end_loading_time) + " Finished loading model: ", classifier_name)
    logs.append(str(end_loading_time) + " Finished loading model: " + classifier_name)

    start_time = dt.now()
    print(str(start_time) + " Start calculating AUC ROC using: ", classifier_name)
    logs.append(str(start_time) + " Start calculating AUC ROC using: " + classifier_name)
    # auc_score stores its result under the given key of the mapping it is
    # handed; read the score back from there.
    scores = {}
    auc_score(X_T, Y_T, model, index, scores)
    auc_scores.append(scores[index])
    end_time = dt.now()
    print(str(end_time) + " Finished calculating AUC ROC using: ", classifier_name)
    logs.append(str(end_time) + " Finished calculating AUC ROC using: " + classifier_name)

    print("Calculation time: ", end_time - start_time)
    # The timedelta must be converted explicitly; str + timedelta raises TypeError.
    logs.append("Calculation time: " + str(end_time - start_time))
    print("--------------------------------------------------------------------------------")
    log("/content/drive/My Drive/documents/projects/DeCaf/logs/model_performance.csv",
        "\n".join(logs))

  table.add_row(auc_scores)

  print(table)
  log_result(result_path, table.get_string())
else:
  print("Result persists at ", result_path)

## Cross dataset validation of the models
We use the following datasets to validate the models:
- Brunet 2014 (brunet2014.csv)
- Shakiba 2016 (shakiba2016.csv)
- Viviani 2018 (viviani2018.csv)
- Self-Admitted Technical Debt (SATD) (satd.csv)
- Stack Overflow (so.csv)

In [0]:
# Display names of the external datasets, in the order in which their feature
# and label arrays are listed in the cross-dataset validation cell.
cross_dataset_names = [
    "Brunet 2014",
    "Shakiba 2016",
    "Viviani 2018",
    "SATD",
]

### Vectorize and Save the data

In [0]:
# NOTE(review): both right-hand sides are missing, so this cell is a syntax error
# as written. Restore the code that produces the Brunet 2014 features/labels.
# The saved cell output shows shapes (159, 300) and (159,).
X_brunet2014 = 
Y_brunet2014 = 
print(X_brunet2014.shape)
print(Y_brunet2014.shape)

(159, 300)
(159,)


In [0]:
# NOTE(review): both right-hand sides are missing, so this cell is a syntax error
# as written. Restore the code that produces the Shakiba 2016 features/labels.
X_shakiba2016 = 
Y_shakiba2016 = 
print(X_shakiba2016.shape)
print(Y_shakiba2016.shape)

In [0]:
# NOTE(review): both right-hand sides are missing, so this cell is a syntax error
# as written. Restore the code that produces the Viviani 2018 features/labels.
X_viviani2018 = 
Y_viviani2018 = 
print(X_viviani2018.shape)
print(Y_viviani2018.shape)

In [0]:
# NOTE(review): both right-hand sides are missing, so this cell is a syntax error
# as written. Restore the code that produces the SATD features/labels.
X_satd = 
Y_satd = 
print(X_satd.shape)
print(Y_satd.shape)

In [0]:
# NOTE(review): both right-hand sides are missing, so this cell is a syntax error
# as written. Restore the code that produces the Stack Overflow features/labels.
# X_so / Y_so are not included in the cross-dataset validation lists below.
X_so = 
Y_so = 
print(X_so.shape)
print(Y_so.shape)

### Validate with the Trained models

In [0]:
# Evaluate every trained classifier on every external dataset and collect the
# ROC AUC scores in one table: one row per classifier, one column per dataset.
test_data = [X_brunet2014,
             X_shakiba2016, X_viviani2018, X_satd]
test_label = [Y_brunet2014,
              Y_shakiba2016, Y_viviani2018, Y_satd]

# The three lists are walked in lockstep below; a mismatch would silently
# pair a dataset with the wrong labels or column name.
assert len(test_data) == len(test_label) == len(cross_dataset_names)

table = PrettyTable()
table.field_names = [" "] + cross_dataset_names

for index, model_path in enumerate(model_paths):
  classifier_name = classifier_names[index]
  # Per-model log buffer; written as a single CSV row after all datasets ran.
  logs = []
  print(str(dt.now()) + " Start loading model: ", classifier_name)
  logs.append(str(dt.now()) + " Start loading model: " + classifier_name)
  model = load(model_path)
  row = [classifier_name]
  for dataset_name, data, labels in zip(cross_dataset_names, test_data, test_label):
    start_time = dt.now()
    print(str(start_time) + " Start evaluating ", dataset_name)
    # auc_score stores its result under the given key of the mapping it is
    # handed; read the score back from there.
    scores = {}
    auc_score(data, labels, model, index, scores)
    row.append(scores[index])
    end_time = dt.now()
    print(str(end_time) + " Finished evaluating ", dataset_name)
    total_time = end_time - start_time
    # Name the dataset so the per-dataset timings can be told apart in the log.
    logs.append("Evaluation time (" + dataset_name + "): " + str(total_time))
  log("/content/drive/My Drive/documents/projects/DeCaf/logs/cross_data_evaluation.csv",
      "\n".join(logs))
  print(row)
  table.add_row(row)

print(table)
log_result("/content/drive/My Drive/documents/projects/DeCaf/results/cross_data.txt", table.get_string())

2020-04-05 05:49:26.042347 Start loading model:  Decision Tree
2020-04-05 05:49:26.734873 Start evaluating  Brunet 2014
2020-04-05 05:49:26.737550 Finished evaluating  Brunet 2014
['Decision Tree', 0.4989963198394112]
2020-04-05 05:49:26.743836 Start loading model:  Random Forest
2020-04-05 05:49:29.653865 Start evaluating  Brunet 2014
2020-04-05 05:49:29.758631 Finished evaluating  Brunet 2014
['Random Forest', 0.49489795918367346]
+---------------+---------------------+
|               |     Brunet 2014     |
+---------------+---------------------+
| Decision Tree |  0.4989963198394112 |
| Random Forest | 0.49489795918367346 |
+---------------+---------------------+
