# Import the required libraries and files

In [10]:
import pandas as pd
import numpy as np
import fasttext
from scipy.stats import uniform
import pickle
import re
import os

# scikit-learn
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Dense

# Local Packages
import hats.config
import hats.utility as ut
from hats.data_preprocessing import Preprocessing
import hats.ml_model as ml
from hats.config import CONFIG

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

%reload_ext autoreload
%autoreload 2


In [2]:
home_data = pd.read_csv('../dataset/dataset.csv', sep=';')
translate_data = pd.read_csv('../dataset/translations_data.csv', sep=';')
sms_translations_data = pd.read_csv('../dataset/sms_translations.csv', sep=';')

stop_words = []
with open('../dataset/stop_words.txt') as f:
  stop_words = f.readlines()
  stop_words = [word.replace('\n', '') for word in stop_words]

# Fasttext Model Training 

In [13]:
data_proprocess: Preprocessing = Preprocessing(stop_words, sms_translations_data)
home_data_preprocessed = data_proprocess.preprocessing(home_data.copy())
data_proprocess.saveToCsv(home_data_preprocessed)

# Process the output file to remove double quotes ("")
!sed -i 's/"//g' ../output/comm_preprocessed.txt

In [14]:
# Train a fasttext model in supervised fashion
ft_model = ml.createFasttextModel(CONFIG.OUTPUT_DATASET_FILE)

In [15]:
# Testing the fasttext model
ft_model.get_nearest_neighbors('light')

[(0.9943791627883911, 'batti'),
 (0.9886794090270996, 'balab'),
 (0.974006175994873, 'lights'),
 (0.9539253115653992, 'balb'),
 (0.9465795755386353, 'blb'),
 (0.9458802938461304, 'btti'),
 (0.9401957988739014, 'bulb'),
 (0.7651400566101074, 'jla'),
 (0.6618971824645996, 'dark'),
 (0.650171160697937, '</s>')]

In [16]:
ft_model.predict('mai chahta hu ki tum geyser band kr do')

(('__label__geyser_off',), array([0.99944645]))

In [17]:
ft_model.get_sentence_vector('mai chahta hu ki tum geyser band kr do').shape

(150,)

# Create additional columns to preprocessed dataset

## 1. Create <i><b>sent_vec</b></i> column in main dataset for sentence vectors

In [18]:
home_data_vectorized = data_proprocess.convertCommandToVector(home_data_preprocessed, ft_model)
home_data_vectorized.head(10)

Unnamed: 0,commands,label,sent_vec
0,batti bujha,__label__light_off,"[0.09129511, -0.17886868, 0.013367534, -0.0127..."
1,balab bujha,__label__light_off,"[0.043475877, -0.17572588, -0.015810117, -0.02..."
2,balab band,__label__light_off,"[0.06676102, -0.18724361, 0.15825129, 0.110632..."
3,light band,__label__light_off,"[0.080121726, -0.20910013, 0.120552175, 0.0859..."
4,light bujha,__label__light_off,"[0.056836586, -0.19758242, -0.053509228, -0.04..."
5,light off,__label__light_off,"[0.070388824, -0.18570364, 0.093462825, 0.0803..."
6,dark,__label__light_off,"[0.11362806, -0.17577863, -0.0051939115, -0.04..."
7,off light,__label__light_off,"[0.070388824, -0.18570364, 0.093462825, 0.0803..."
8,off light,__label__light_off,"[0.070388824, -0.18570364, 0.093462825, 0.0803..."
9,batti bujhaw,__label__light_off,"[0.08482814, -0.13813066, -0.010202765, -0.037..."


## 2. Add a column for each class using OVR scheme

After adding the columns, create a single layer perceptron model with 150 inputs and 1 output with sigmoid activation. 

Total number of such models will be equal to the number of classes in the dataset. This is to train multiple models using the OVR technique and while predicting, we will use all the models to predict the final class label of the test command.

In [19]:
home_data_ovr = ut.add_class_ovr_cols(home_data_vectorized.copy())

# Tensorflow Model

In [11]:
models = ml.createPerceptronModels(home_data_ovr['label'].unique())

In [12]:
for m_name in models.keys():
    models[m_name].compile(optimizer=keras.optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
for X_train, X_test, y_train, y_test, label in ut.data_split_classwise(home_data_ovr):
    history = models[label].fit(X_train, y_train, batch_size=20, epochs=50, validation_data=(X_test, y_test))
    ut.plot(history.history, 'Model ' + label, 'plot_' + label + '.png')
    break

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50

KeyboardInterrupt: 

# Scikit-learn models

Grid search using SVC

In [20]:
X_train, X_test, y_train, y_test = ut.data_split(home_data_ovr)

In [21]:
parameters = dict(C = [10e-5, 10e-4, 10e-3, 10e-2, 10e-1, 1, 10e1, 10e2, 10e4],\
    kernel=['linear', 'rbf', 'poly'])
svc_model = SVC(probability=True, random_state=40)

In [24]:
clf = GridSearchCV(estimator=svc_model, param_grid=parameters)
clf.fit(X_train, y_train)
pickle.dump(clf, open('../' + CONFIG.OUTPUT_DIRECTORY_NAME + CONFIG.SVM_MODEL_SAVEFILE, 'wb'))

In [25]:
clf.best_estimator_

SVC(C=0.1, kernel='linear', probability=True, random_state=40)

In [26]:
clf.score(X_train, y_train)

1.0

In [27]:
test_data = np.reshape(ft_model.get_sentence_vector('mai chahta hu ki tum geyser band kr do'), (1, -1))
print(clf.predict_proba(test_data))
print(clf.predict(test_data))

[[0.01886121 0.00526422 0.05359809 0.00883506 0.80203765 0.0244259
  0.03267593 0.007419   0.03890349 0.00797945]]
['__label__geyser_off']


In [28]:
clf.classes_

array(['__label__ac_off', '__label__ac_on', '__label__fan_off',
       '__label__fan_on', '__label__geyser_off', '__label__geyser_on',
       '__label__light_off', '__label__light_on', '__label__tv_off',
       '__label__tv_on'], dtype=object)

In [29]:
clf.classes_[np.argmax(clf.predict_proba(test_data)[0])]

'__label__geyser_off'

In [30]:
filename = '../' + CONFIG.OUTPUT_DATASET_FILE + CONFIG.SVM_MODEL_SAVEFILE
ml.predict('tu pahal hai bhai geyser', ft_model, filename)

'Other'