# Notebook for the experiment of building **DeCaf** (**De**sign **C**l**a**ssi**f**ier)

## Architectural Overview/Design
In Progress

## Objective
The main objective is to train the models on the training data.

In [0]:
from google.colab import drive
# Mount Google Drive; every data, model and log path in this notebook lives under it.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd

def log(log_file_path, message):
  """Append one 'info' row to the CSV log at ``log_file_path``.

  The row holds the current timestamp, the literal type ``'info'`` and
  ``message``. The header row is written only when the file does not exist yet.

  Args:
    log_file_path: Path of the CSV log file; it is created on first use.
    message: Text stored in the ``message`` column.
  """
  # Imported locally because the notebook only imports `os` and `dt` in a later
  # cell; relying on those globals makes log() raise NameError if it is called
  # before that cell has run.
  import os
  from datetime import datetime as dt

  row = pd.DataFrame([[str(dt.now()), 'info', message]], columns=['timestamp', 'type', 'message'])
  # Append mode also creates a missing file, so a single call covers both the
  # first write (with header) and every later write (without header).
  # NOTE: the existence check and the write are not atomic; two processes
  # logging to a brand-new file at the same moment may both emit a header.
  row.to_csv(log_file_path, mode='a', header=not os.path.exists(log_file_path))

def log_result(log_file_path, message):
  """Append the current timestamp immediately followed by ``message`` to a text file.

  No separator or newline is inserted between the timestamp and ``message``,
  or after ``message``; callers supply any spacing they need.

  Args:
    log_file_path: Path of the text file to append to; created if missing.
    message: Text written right after the timestamp.
  """
  # Imported locally: the notebook imports `dt` only in a later cell.
  from datetime import datetime as dt

  # The context manager closes (and flushes) the file even if a write fails.
  with open(log_file_path, "a") as f:
    f.write(str(dt.now()))
    f.write(message)

## Load training data

In [0]:
import numpy as np

# All training arrays sit in one Drive folder; only the file name differs.
TRAIN_DATA_DIR = "/content/drive/My Drive/documents/projects/DeCaf/data/train_data"

# Q split
X_Q_url = f"{TRAIN_DATA_DIR}/X_Q.npy"
Y_Q_url = f"{TRAIN_DATA_DIR}/Y_Q.npy"

# A split
X_A_url = f"{TRAIN_DATA_DIR}/X_A.npy"
Y_A_url = f"{TRAIN_DATA_DIR}/Y_A.npy"

# C split
X_C_url = f"{TRAIN_DATA_DIR}/X_C.npy"
Y_C_url = f"{TRAIN_DATA_DIR}/Y_C.npy"

In [0]:
# Read each split's X_* and Y_* arrays from Drive.
X_Q, Y_Q = np.load(X_Q_url), np.load(Y_Q_url)
X_A, Y_A = np.load(X_A_url), np.load(Y_A_url)
X_C, Y_C = np.load(X_C_url), np.load(Y_C_url)

# Stack the three splits along axis 0, keeping X and Y in the same Q, A, C order.
X = np.concatenate([X_Q, X_A, X_C], axis=0)
Y = np.concatenate([Y_Q, Y_A, Y_C], axis=0)

### Examine X and Y

In [0]:
# Show the shapes of the combined arrays, one per line.
print(X.shape, Y.shape, sep="\n")

(200000, 300)
(200000,)


## Training Traditional Data Mining Algorithms
- Before experimenting with deep learning, we start by training some traditional data mining algorithms. We use the following classifiers, listed with their notations and parameter configurations:
  - k-Nearest Neighbors (`knn`)
  - Decision Tree (`dt`)
  - Random Forest (`rf`)
  - Logistic Regression (`lr`)
  - Linear SVM (`lsvm`)
    - Regularization parameter (C) = 0.025
  - RBF SVM (`rbf_svm`)
    - Kernel coefficient (gamma) = 2
    - Regularization parameter (C) = 1 (the default)
  - Neural Net (`nn`)
  - AdaBoost (`ab`)
  - Naive Bayes (`gnb`)
  - QDA (`qda`)


### Libraries

In [0]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

### Initialization

In [0]:

# Seed shared by every estimator that draws random numbers, so that retraining
# from a fresh kernel produces the same models.
RANDOM_STATE = 42

# Folder on Drive where each fitted model is stored as "<stem>.joblib".
MODELS_DIR = "/content/drive/My Drive/documents/projects/DeCaf/models"

# Single source of truth: (display name, file stem, unfitted estimator).
# Keeping the three attributes on one row prevents the derived lists below from
# drifting out of alignment when a model is added, removed or reordered.
_MODEL_SPECS = [
    ("Nearest Neighbors", "knn", KNeighborsClassifier(n_jobs=-1)),
    ("Decision Tree", "dt", DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ("Random Forest", "rf", RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)),
    ("Logistic Regression", "lr", LogisticRegression(max_iter=50000)),
    ("Gaussian Naive Bayes", "gnb", GaussianNB()),
    ("Neural Net", "nn", MLPClassifier(max_iter=4000, random_state=RANDOM_STATE)),
    ("AdaBoost", "ab", AdaBoostClassifier(random_state=RANDOM_STATE)),
    ("QDA", "qda", QuadraticDiscriminantAnalysis()),
    ("Linear SVM", "lsvm", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", "rbf_svm", SVC(gamma=2, C=1)),
]

# Index-aligned lists consumed by the training cells further down.
classifier_names = [name for name, _, _ in _MODEL_SPECS]
model_paths = [f"{MODELS_DIR}/{stem}.joblib" for _, stem, _ in _MODEL_SPECS]
classifiers = [estimator for _, _, estimator in _MODEL_SPECS]

### Combine the three training sets: `questions`, `answers` and `comments`

In [0]:
# Rebuild the combined arrays (the loading cell above performs the same
# concatenation); axis 0 is np.concatenate's default.
X = np.concatenate([X_Q, X_A, X_C])
Y = np.concatenate([Y_Q, Y_A, Y_C])

### Examine X and Y

In [0]:
# Show the shapes of the combined arrays, one per line.
print(X.shape, Y.shape, sep="\n")

(200000, 300)
(200000,)


### **Train** and **Save** the models to disk

In [0]:
from joblib import dump, load
import os
from datetime import datetime as dt

def train_dm_models(X, Y, process_number, classifier, classifier_name, model_path,
                    log_path="/content/drive/My Drive/documents/projects/DeCaf/logs/model_train.csv"):
  """Fit ``classifier`` on ``(X, Y)`` and store it at ``model_path`` unless a file is already there.

  Start time, end time and training duration are printed and, after the model
  has been stored, appended to the CSV log through ``log``.

  Args:
    X: Training inputs passed to ``classifier.fit``.
    Y: Training targets passed to ``classifier.fit``.
    process_number: Unused; kept so existing callers keep working.
    classifier: Unfitted estimator exposing ``fit(X, Y)``.
    classifier_name: Human-readable name used in the printed and logged messages.
    model_path: Destination file for the fitted model; if it exists, training is skipped.
    log_path: CSV file that receives the training log entry.
  """
  if os.path.exists(model_path):
    print("Model already present at ", model_path)
    return

  logs = []
  start_time = dt.now()
  print(str(start_time) + " Started training model: ", classifier_name)
  logs.append(str(start_time) + " Started training model: " + classifier_name)

  model = classifier.fit(X, Y)

  end_time = dt.now()
  print(str(end_time) + " Finished training model: ", classifier_name)
  logs.append(str(end_time) + " Finished training model: " + classifier_name)
  print("Time to train model: " + classifier_name + ": " + str(end_time - start_time))
  logs.append("Time to train model: " + classifier_name + ": " + str(end_time - start_time))
  print("----------------------------------------------------------")

  # Dump to a process-specific temp file and rename it into place. Writing
  # straight to model_path would leave a truncated file behind if the process
  # is interrupted mid-dump, and the existence check above would then treat
  # that broken file as a finished model on the next run.
  tmp_path = model_path + "." + str(os.getpid()) + ".tmp"
  try:
    dump(model, tmp_path)
    os.replace(tmp_path, model_path)
  finally:
    # After a successful rename the temp file is gone; this only cleans up
    # after a failed or interrupted dump.
    if os.path.exists(tmp_path):
      os.remove(tmp_path)

  log(log_path, "\n".join(logs))

In [0]:
import multiprocessing as mp

print("Available CPUs: ", mp.cpu_count())

# One process per classifier, all started up front. Each process is named after
# its classifier so that failures can be attributed below.
jobs = []
for index, classifier in enumerate(classifiers):
  p = mp.Process(
      target=train_dm_models,
      args=(X, Y, index, classifier, classifier_names[index], model_paths[index]),
      name=classifier_names[index],
  )
  jobs.append(p)
  p.start()

for proc in jobs:
  proc.join()

# The parent otherwise never learns that a child crashed or was killed, so
# report every process that did not exit cleanly (exit code other than 0).
failed = [proc.name for proc in jobs if proc.exitcode != 0]
if failed:
  print("Training failed for: " + ", ".join(failed))

Available CPUs:  4
2020-04-05 11:14:13.383082 Started training model:  Nearest Neighbors
2020-04-05 11:14:13.455925 Started training model:  Decision Tree
2020-04-05 11:14:13.529344 Started training model:  Random Forest
2020-04-05 11:14:13.607297 Started training model:  Logistic Regression
2020-04-05 11:14:13.713409 Started training model:  Gaussian Naive Bayes
2020-04-05 11:14:13.786545 Started training model:  Neural Net
2020-04-05 11:14:13.909944 Started training model:  AdaBoost
2020-04-05 11:14:13.985187 Started training model:  QDA
2020-04-05 11:14:14.117301 Started training model:  Linear SVM
2020-04-05 11:14:14.236641 Started training model:  RBF SVM
2020-04-05 11:14:22.209939 Finished training model:  Gaussian Naive Bayes
Time to train model: Gaussian Naive Bayes: 0:00:08.496530
----------------------------------------------------------
2020-04-05 11:16:05.566355 Finished training model:  Logistic Regression
Time to train model: Logistic Regression: 0:01:51.959058
----------



2020-04-05 11:21:37.987047 Finished training model:  Neural Net
Time to train model: Neural Net: 0:07:24.200502


Process Process-145:


----------------------------------------------------------


Traceback (most recent call last):
KeyboardInterrupt
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-68-de3e54a1af8a>", line 11, in train_dm_models
    model = classifier.fit(X, Y)
  File "/usr/local/lib/python3.6/dist-packages/sklearn/ensemble/_forest.py", line 383, in fit
    for i, t in enumerate(trees))
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 1017, in __call__
    self.retrieve()
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 909, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 638, in get
    self.wait(timeout)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 635, in wait
    self._event.wait(timeout)
  File "/usr/lib/python3.6/threading.py", line 551, i

KeyboardInterrupt: ignored