In [53]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Activation, Input, Dropout
from tensorflow.keras.initializers import glorot_uniform

In [6]:
words, vectors = [], []
with open("data/metadata_norm_sentence_length_6_256_v2.tsv", mode="r") as words_file, \
open("data/vectors_norm_sentence_length_6_256_v2.tsv", mode="r") as vectors_file:
    for word in words_file:
        if not word.strip():
            continue
        words.append(word.strip())
    for vector in vectors_file:
        if not vector.strip():
            continue
        vectors.append([float(vec) for vec in vector.strip().split()]) 

embedding = {word: vectors[idx] for idx, word in enumerate(words)}

In [4]:
UNK = '[UNK]'
sentence_length = 8
embedding_vectors_dim = 256

In [9]:
result = {}
samples = {}
filename = "data/multiclassification_dataset_new.csv"
with open(filename, mode="r") as file:
    for line in file:
        sample, class_ = line[:-1].split(";")
        class_ = "_".join(class_.split())
        if class_ not in result:
            result[class_] = []
            samples[class_] = []
        samples[class_].append(sample)
        sentence_vector = []
        words = sample.strip().split()[:sentence_length]
        for word in words:
            sentence_vector.append(embedding[word] if word in embedding else embedding[UNK])
        if len(sentence_vector) < sentence_length:
            for _ in range(sentence_length-len(sentence_vector)):
                sentence_vector.append(embedding_vectors_dim*[0]) 
        if words:    
            result[class_].append(sentence_vector)

In [10]:
result.keys()

dict_keys(['connect_to_inet', 'no_internet', 'finance', 'support', 'consult'])

In [15]:
def one_hot_encoder(Y, C):
    Y = np.eye(C, dtype=np.int)[Y.reshape(-1)]
    return Y

In [20]:
X, y = [], []
classes = list(result.keys())
for class_, samples in result.items():    
    for sample in samples:
        X.append(sample)
        y.append(classes.index(class_))

In [32]:
X = np.array(X)
y_oh = one_hot_encoder(np.array(y), 5)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y_oh, test_size=0.2)

In [35]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5588, 8, 256), (5588, 5), (1398, 8, 256), (1398, 5))

In [73]:
def ClassificationLSTMModel(input_shape):
    inputs = Input(input_shape)
    X = LSTM(units=128)(inputs)
    X = Dropout(rate=0.5)(X)
    X = Dense(units=5)(X)
    X = Activation('softmax')(X)
    model = Model(inputs=inputs, outputs=X)
    return model

In [74]:
model = ClassificationLSTMModel((sentence_length, embedding_vectors_dim))
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 8, 256)]          0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 128)               197120    
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 645       
_________________________________________________________________
activation_5 (Activation)    (None, 5)                 0         
Total params: 197,765
Trainable params: 197,765
Non-trainable params: 0
_________________________________________________________________


In [75]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [76]:
model.fit(X_train, y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f9c2cf83fd0>

In [77]:
model.evaluate(X_test, y_test)



[0.4552132487297058, 0.9091559648513794]

In [78]:
y_test_pred = model.predict(X_test)

In [79]:
print(classification_report(np.argmax(y_test_pred, axis=1), np.argmax(y_test, axis=1)))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       124
           1       0.87      0.81      0.84        96
           2       0.93      0.94      0.94       374
           3       0.92      0.94      0.93       369
           4       0.91      0.88      0.90       435

    accuracy                           0.91      1398
   macro avg       0.89      0.89      0.89      1398
weighted avg       0.91      0.91      0.91      1398

