In [1]:
import fasttext
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Preprocessing (to be done only once)

Do some preprocessing and save the preprocessed data into a separate file. Need to execute only once.

In [2]:
col_names = ['command', 'label']
df = pd.read_csv('./Utilities/Datasets/home_appliance_commands.csv', header=None, names=col_names)

# Convert all sentences of command into lowercase
commands = [item.lower() for item in df['command']]
df['command'] = commands

# Save to a file
df.to_csv('./Utilities/Datasets/Preprocessed/home_appliance.csv', index=False)

<b>Remove the stop words from the dataset</b>

In [3]:
# Function to remove the stop words from a string
def remove_stop_words(string, stop_words):
    new_string = ''
    for word in string.split():
        if word not in stop_words:
            new_string += word + ' '
            
    return new_string.strip()

In [4]:
# Read the stop words from the file ./Utilities/Preprocessing Utilities/stop_words.txt
with open('./Utilities/Preprocessing Utilities/stop_words.txt', 'r') as f:
    stop_words = f.readlines()
    stop_words = [word.replace('\n', '') for word in stop_words]
    
# Remove the stop words from the dataset
df = pd.read_csv('./Utilities/Datasets/Preprocessed/home_appliance.csv')
df['command'] = df['command'].apply(lambda x: remove_stop_words(x, stop_words))

# Save the DataFrame to a file
df.to_csv('./Utilities/Datasets/Preprocessed/home_appliance.csv', index=False)

# Word vector learning by FastTex

In [5]:
ft_model = fasttext.train_unsupervised('./Utilities/Datasets/Preprocessed/home_appliance.csv', dim=100)

Save the word vectors in a file to retrieve them later

In [6]:
# Save the model into a binary file (Folder should be already present)
ft_model.save_model('./Result/home_appliance1.bin')

### Retrieve the learned vectors

In [7]:
ft_model = fasttext.load_model('./Result/home_appliance1.bin')



# Get the dictionary data for translation

In [8]:
trans_dict = pd.read_csv('./Utilities/Translations/translations_data.csv')
trans_dict.head()

Unnamed: 0,light,geyser,off,on,water,hot
0,batti,heater,band,chalu,paani,garam
1,bulb,machiniya,bujha,jalaiye,,
2,balab,machine,dark,jalaw,,
3,,,bujhaw,kara,,
4,,,bujhaiye,jala,,


### Create a custom dictionary to hold the vectors learnt by fasttext

In [9]:
word_vecs = dict((word, ft_model.get_word_vector(word)) for word in ft_model.get_words())

In [10]:
# Function to replace the word vectors
def replace_word_vec(main_dict, word_list, root_word):
    for word in word_list:
        main_dict[word] = ft_model.get_word_vector(root_word)

In [11]:
# Replace the word vectors for the words in the translation dictionary to get perfect similarity
for indx, col in trans_dict.iteritems():
    replace_word_vec(word_vecs, col, indx)
    
# Add the root words to the custom word_vector dictionary
for word in trans_dict.columns:
    word_vecs[word] = ft_model.get_word_vector(word)

<b>Save the new dictionary into a file</b>

In [12]:
import _pickle as pickle

wordVecDict = {'wordVecDict': word_vecs}

with open('./Utilities/Result Vectors/word_vectors.bin', 'wb') as f:
    f.write(pickle.dumps(wordVecDict))

<b>Read the word vectors from the file</b>

In [13]:
with open('./Utilities/Result Vectors/word_vectors.bin', 'rb') as f:
    word_vecs = pickle.load(f)
    word_vecs = word_vecs['wordVecDict']

# Get sentence vectors by simple averaging of word vectors

Since there is no api in the fastText library to calculate the sentence vectors, we will do a custom implementation of the same by averaging the word vectors 

In [14]:
def get_l2_norm(word_vec):
    return np.sqrt(np.sum(word_vec ** 2))

def get_l2_normed_vector(word_vec):
    l2_norm = get_l2_norm(word_vec)
    if l2_norm > 0:
        return word_vec * (1.0 / l2_norm)
    else:
        return word_vec

def get_sentence_vector(sentence):
    sentence_vector = np.zeros(100)  # because the word vectors are 100-dimentional
    for word in sentence.split():
        word_vec = word_vecs[word]
        sentence_vector = np.add(sentence_vector, get_l2_normed_vector(word_vec))
        
    return np.divide(sentence_vector, len(sentence.split()))

In [15]:
import _pickle as pickle

# Read in the preprocessed dataset
df = pd.read_csv('./Utilities/Datasets/Preprocessed/home_appliance.csv')

# Get sentence vectors for each command and add them to a dictionary
sentence_vector = dict()
for sent in df['command']:
    sentence_vector[sent] = get_sentence_vector(sent)

<b> Save the sentence vector dictionary to a file</b>

In [16]:
with open('./Utilities/Result Vectors/sentence_vectors.bin', 'wb') as f:
    sentDict = {'sentDict': sentence_vector}
    f.write(pickle.dumps(sentDict))

<b> Retrieve the sentence vectors </b>

In [17]:
with open('./Utilities/Result Vectors/sentence_vectors.bin', 'rb') as f:
    sentence_vector = pickle.load(f)
    sentence_vector = sentence_vector['sentDict']

# Apply KNN

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=4)

In [19]:
df['sent_vec'] = df['command'].apply(lambda sent: sentence_vector[sent])

<b> Use LabelEncoder to transform the labels to numericalvalues </b>

In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'].tolist())

<b> Split the dataset </b>

In [21]:
X = df[df.columns[-2]]
y = df[df.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

<b> Train the KNN Classifier </b>

In [23]:
knn_clf.fit(X_train.tolist(), y_train.tolist())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

<b> Predict the results for the test data </b>

In [24]:
predictions = knn_clf.predict(X_test.tolist())

<b> Get the results </b>

In [25]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [26]:
print(classification_report(y_test.tolist(), predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         2

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



In [27]:
print(accuracy_score(y_test.tolist(), predictions))

1.0


# Apply KMeans

In [28]:
from sklearn.cluster import KMeans

km_clr = KMeans(n_clusters=4)

In [29]:
km_clr.fit(X_train.tolist())

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [30]:
predictions_km = km_clr.predict(X_test.tolist())

In [31]:
predictions_km

array([3, 2, 2, 3, 1, 2, 0, 1, 2], dtype=int32)