## <span style="text-decoration: underline">Import packages and data</span>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import sys, os, json
# Absolute path of the parent of the current working directory; it is added to
# the import path (once) so that project packages such as `utils` resolve.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Project sub-directories, each kept with a trailing slash so that file names
# can be appended directly.
dbs_path = module_path + '/query_dbs/'
models_path = module_path + '/models/'
graphs_path = module_path + '/graphs/'

In [3]:
from utils.classif_utils import flatten, multi_label
# Each `<name>.json` file of the query database directory defines one query
# type called `<name>`. Any other directory entry (hidden file, checkpoint
# folder, ...) is skipped: it would otherwise produce a query type whose
# `<name>.json` file does not exist when the datasets are loaded.
# NOTE: os.listdir returns entries in arbitrary order, and the position of a
# query type in `list_query_type` is what the rest of the notebook uses as its
# integer label.
json_suffix = '.json'
list_files = [file_name for file_name in os.listdir(dbs_path) if file_name.endswith(json_suffix)]
# Strip only the trailing extension, never a '.json' found inside the name.
list_query_type = [file_name[:-len(json_suffix)] for file_name in list_files]

In [None]:
# Load one JSON document per query type.
datasets = []
for index, query_type in enumerate(list_query_type):
    # NOTE(review): the file at position 3 is decoded as latin-1, every other
    # file with the platform default encoding. This ties the choice to the
    # order of `list_query_type` — confirm which file actually needs latin-1.
    file_encoding = 'latin-1' if index == 3 else None
    # The context manager closes the file handle as soon as it is parsed.
    with open(dbs_path + query_type + '.json', encoding=file_encoding) as json_file:
        pre_data = json.load(json_file)
    datasets.append(pre_data)

# Only the first key/value pair of each document is used: the key becomes the
# query type, the value is iterated as the collection of its examples.
new = {
    list(query_type_dico.keys())[0]: list(query_type_dico.values())[0]
    for query_type_dico in datasets
}
# One record per example: its query type and the text produced by the project
# helper `flatten`.
new = [
    {'query_type': query_type, 'text_full': flatten(dict_data)}
    for query_type, data_query_type in new.items()
    for dict_data in data_query_type
]
data_label = pd.DataFrame(new)

In [None]:
# Integer label of a query type = its position in `list_query_type`.
queries = dict(zip(list_query_type, range(len(list_query_type))))
label_column = data_label['query_type'].map(queries)
data_label['label'] = label_column
data_label.head()

#### Multilabel target

In [None]:
# Build the multi-label target with the project helper `multi_label`, applied
# to every integer label. (No NaN pre-initialisation of the column is needed:
# the assignment below creates it.)
# NOTE(review): presumably `multi_label` returns a one-hot style vector —
# confirm in utils.classif_utils.
data_label["label_multi"] = data_label["label"].apply(multi_label)
# Discard every row that has a missing value in any column.
data_label.dropna(inplace = True)

### Deep learning model

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding,SpatialDropout1D,LSTM,Dense, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import confusion_matrix

In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the network.
nb_words_max = 50_000  # `num_words` of the Tokenizer, first argument of the Embedding layer
max_seq_length = 40    # `maxlen` given to pad_sequences
embedding_dim = 100    # second argument of the Embedding layer

In [None]:
# Fit the tokenizer on the full corpus.
# Raw string: the filter list contains a backslash, which must reach the
# Tokenizer as a literal character and not be parsed as an escape sequence.
tokenizer = Tokenizer(num_words=nb_words_max, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data_label['text_full'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Texts -> integer sequences, all brought to `max_seq_length` by pad_sequences.
X = tokenizer.texts_to_sequences(data_label['text_full'].values)
X = pad_sequences(X, maxlen=max_seq_length)
print('Shape of data tensor:', X.shape)

# Stack the per-row target vectors into a single 2-D array.
Y = np.vstack(data_label['label_multi'].values)
print('Shape of label tensor:', Y.shape)

# Hold out 10% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

In [None]:
# Persist the fitted tokenizer so that inference can reuse the exact same
# vocabulary (the `predict` function reloads this file).
tokenizer_file = f'{models_path}tokenizer.pickle'
with open(tokenizer_file, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Embedding -> spatial dropout -> LSTM -> dropout -> 7-unit softmax output.
network_layers = [
    Embedding(nb_words_max, embedding_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(10, dropout=0.3, recurrent_dropout=0.2),
    Dropout(0.3),
    Dense(7, activation='softmax'),
]
model = Sequential(network_layers)

optimizer = Adam(lr = 0.0005)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

In [None]:
# Train for 10 epochs; the held-out split is passed as validation data, so
# its loss and accuracy are recorded in `history` after every epoch.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    batch_size=16,
    epochs=10,
)

In [None]:
# Store the trained network in the models directory (reloaded by `predict`).
model_file = f"{models_path}lstm_classif_type.h5"
model.save(model_file)

In [None]:
# Plot the training curves: loss on the left, accuracy on the right.

# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*";
# fall back to the new name when the historical one is not listed.
style_name = "seaborn-darkgrid"
if style_name not in plt.style.available:
    style_name = "seaborn-v0_8-darkgrid"
plt.style.use(style_name)

# Depending on the Keras version, the 'accuracy' metric is recorded under
# "acc" (older releases) or "accuracy"; use whichever key is present.
acc_key = "acc" if "acc" in history.history else "accuracy"
val_acc_key = f"val_{acc_key}"

plt.subplot(121)
plt.plot(history.history["loss"], label = "Loss")
plt.plot(history.history["val_loss"], label = "Validation loss")
plt.xlabel("Number of epochs")
plt.ylabel("Cross entropy")
plt.legend()

plt.subplot(122)
plt.plot(history.history[acc_key], label = "Accuracy")
plt.plot(history.history[val_acc_key], label = "Validation accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Number of epochs")
plt.legend()
plt.tight_layout()

plt.savefig(f'{graphs_path}loss_accuracy.png')

In [None]:
import itertools
from random import *
from sklearn.metrics import accuracy_score

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix. Rows are labelled "True label" and columns
        "Predicted label" on the figure.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool
        When True, each row is divided by its sum before printing/plotting.
        A row whose sum is zero is left at zero instead of producing NaN.
    title : str
        Title of the figure.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    The figure is drawn on the current matplotlib axes; nothing is returned.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; the pre-filled zeros
        # stay in place for empty rows (no division-by-zero, no NaN cells).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for readability.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Evaluate the network on the held-out split and plot its confusion matrix.
pred = model.predict(X_test)
n = len(X_test)

# Index of the largest entry of each row: predicted class from the network
# output, true class from the target vectors.
lab_pred = np.argmax(pred, axis=1)
lab_true = np.argmax(Y_test, axis=1)

# One class per output unit of the network. Passing the full label list keeps
# the matrix square over all classes even when a class is absent from this
# split, so that its rows/columns always line up with the tick labels.
class_ids = np.arange(pred.shape[1])
plot_confusion_matrix(confusion_matrix(lab_true, lab_pred, labels=class_ids), classes = class_ids)

In [None]:
def multi_lab_to_query_type(lab, query_types=None):
    """Return the query type matching one label vector.

    Parameters
    ----------
    lab : sequence of numbers
        A single label vector; the position of its largest entry selects the
        query type.
    query_types : sequence, optional
        Names to pick from. Defaults to the notebook-level `list_query_type`.

    Returns
    -------
    The element of `query_types` located at ``argmax(lab)``.
    """
    if query_types is None:
        query_types = list_query_type
    return query_types[np.argmax(lab)]


# Batch version: maps each row of a 2-D array of label vectors to its query type.
# - `signature='(n)->()'` makes numpy hand whole rows to the function; without
#   it the function would be called on every scalar, where argmax is always 0.
# - `otypes=[object]` avoids a fixed-width string dtype inferred from the first
#   result, which would truncate longer names.
# - `query_types` is excluded from vectorisation so it can be passed by keyword.
multi_lab_to_query_type_vect = np.vectorize(
    multi_lab_to_query_type,
    otypes=[object],
    signature='(n)->()',
    excluded={'query_types'},
)

def predict(query_batch):
    """Predict the class index of each query in `query_batch`.

    The tokenizer and the trained network are reloaded from `models_path` on
    every call, the queries are tokenised and padded like the training data,
    and the raw network output is printed before being reduced to one class
    index per query.

    Parameters
    ----------
    query_batch : sequence of str
        Raw query texts.

    Returns
    -------
    numpy.ndarray
        1-D float array holding, for each query, the index of the highest
        network output.
    """
    # loading tokenizer
    # NOTE: pickle.load can execute arbitrary code; only load a tokenizer file
    # that was written by this notebook.
    with open(f"{models_path}tokenizer.pickle", 'rb') as handle:
        tokenizer = pickle.load(handle)

    X = tokenizer.texts_to_sequences(query_batch)
    # Same padded length as the one used when the network was trained.
    X = pad_sequences(X, maxlen = 40)

    model = load_model(f"{models_path}lstm_classif_type.h5")
    pred = model.predict(X)
    print(pred)

    # Index of the largest output per row, kept as floats as before.
    return np.argmax(pred, axis=1).astype(float)

In [None]:
# Quick manual check of `predict` on a handful of hand-written queries.
test = np.array([
    "Francis petite frappe",
    "I want to listen to techno music",
    "Rate Harry Potter movie",
    "What will the weather be like tomorow mother fucker ?",
    'I want to book a table for six',
])
predict(test)

In [None]:
# Display the query types; the position of a type in this list is the integer
# label it was given in the `queries` mapping.
list_query_type