# LSTM Sequence Classification
Reference: https://towardsdatascience.com/word-bags-vs-word-sequences-for-text-classification-e0222c21d2ec

Import Libraries:

In [2]:
import numpy as np
import os
import time
import pandas as pd
import json
import re
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
import random as rn
import keras
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

startTime = time.time()
print("Tensor Flow Version: ",tf.__version__)

Tensor Flow Version:  2.2.0-rc2


In [3]:
#All this for reproducibility
np.random.seed(1)
rn.seed(1)
tf.random.set_seed(1)

In [4]:
# Build the corpus and sequences
labelToName = { 0 : 'Rejected', 1 : 'Allowed' }
namesInLabelOrder = ['Rejected', 'Allowed']
os.chdir('D:\\PhD\\Dataset')
df=pd.read_csv('CriminalBailApplication.csv')
X=df.text.tolist()
labels=df.loc[:,['label']]


def preprocess(temp):
    temp=re.sub("\s\s+"," ",temp)    #Replacing multiple spaces with one
    temp=temp.replace(u'\xa0', ' ').encode('utf-8')  #Replacing non-breaking space    
    temp=temp.decode().split(' ')    #Converting a string into list of words seperated by space charcter
    return temp

#print(X[1])
X = [preprocess(t) for t in X]  
#print("\n\n\n After Preprocess:\n\n",X[1])


sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1).split(X, labels)
train_indices, test_indices = next(sss)


def seqlengths(l):
    if isinstance(l,list):
        yield len(l)
        for y in l:
            yield from seqlengths(y)

maxsequenceLength=max(seqlengths(X))
print("Max sequence length:",maxsequenceLength)


Max sequence length: 1502


In [5]:
# Encode the documents
kTokenizer = keras.preprocessing.text.Tokenizer() 
kTokenizer.fit_on_texts(X)
Xencoded = np.array([np.array(xi) for xi in kTokenizer.texts_to_sequences(X)])                

print("Shape Before Padding:",np.shape(Xencoded[22]))
Xencoded=keras.preprocessing.sequence.pad_sequences(Xencoded, maxlen=maxsequenceLength, truncating='post')
print("\nShape After Padding:",np.shape(Xencoded[22]))

labels = np.array(labels)

nWords=len(kTokenizer.word_index)
print("\n nWords:",nWords)

Shape Before Padding: (782,)

Shape After Padding: (1502,)

 nWords: 8793


In [6]:
# Build the LSTM model
def getModel():
    units1, units2 = int (nWords/4), int (nWords/8)
    model = keras.models.Sequential()
    model.add(keras.layers.embeddings.Embedding(input_dim = nWords+1,output_dim=units1,input_length=maxsequenceLength, trainable=True))               
    model.add(keras.layers.LSTM(units = units2, return_sequences =False))                              
    model.add(keras.layers.Dense(len(labelToName), activation ='softmax'))                           
    model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics=['acc'])
    return model

In [7]:
train_x = Xencoded[train_indices]
test_x = Xencoded[test_indices]
train_labels = keras.utils.to_categorical(labels[train_indices], len(labelToName))
test_labels = keras.utils.to_categorical(labels[test_indices], len(labelToName))

In [8]:
# Train and test over multiple train/validation sets
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=2, mode='auto', restore_best_weights=False)                         
sss2 = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=1).split(train_x, train_labels)          
for i in range(1):
    train_indices_2, val_indices = next(sss2)
    model = getModel()
    model.summary()
    history=model.fit(x=train_x[train_indices_2], y=train_labels[train_indices_2], epochs=50, batch_size=32, shuffle=True, validation_data = (train_x[val_indices], train_labels[val_indices]), verbose=2, callbacks=[early_stop], use_multiprocessing=True)
    test_loss, test_accuracy = model.evaluate(test_x, test_labels, verbose=2)                                
    print ("\ntest_loss:",test_loss, "\ntest_accuracy:",test_accuracy)
    predicted = model.predict(test_x, verbose=2)
    predicted_labels = predicted.argmax(axis=1)        
    print ("\n\nConfusion Matrix:\n",confusion_matrix(labels[test_indices], predicted_labels))
    print ("\n\nClassification Report:\n\n",classification_report(labels[test_indices], predicted_labels, digits=4, target_names=namesInLabelOrder))
    

endTime = time.time()
hours, rem = divmod(endTime-startTime, 3600)
minutes, seconds = divmod(rem, 60)
print("\n\nTotal time taken: ")
print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1502, 2198)        19329212  
_________________________________________________________________
lstm_1 (LSTM)                (None, 1099)              14498008  
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 2200      
Total params: 33,829,420
Trainable params: 33,829,420
Non-trainable params: 0
_________________________________________________________________




Train on 210 samples, validate on 53 samples
Epoch 1/50
 - 1012s - loss: 0.8624 - acc: 0.7571 - val_loss: 0.4978 - val_acc: 0.7358
Epoch 2/50
 - 1101s - loss: 0.3049 - acc: 0.9000 - val_loss: 0.3842 - val_acc: 0.7925
Epoch 3/50
 - 1101s - loss: 0.0999 - acc: 0.9762 - val_loss: 0.3372 - val_acc: 0.8679
Epoch 4/50
 - 1089s - loss: 0.0209 - acc: 0.9905 - val_loss: 0.4003 - val_acc: 0.8113
Epoch 5/50
 - 1119s - loss: 0.0068 - acc: 1.0000 - val_loss: 0.4652 - val_acc: 0.7925
Epoch 6/50
 - 1058s - loss: 0.0018 - acc: 1.0000 - val_loss: 0.6123 - val_acc: 0.7925
Epoch 7/50
 - 1091s - loss: 5.3524e-04 - acc: 1.0000 - val_loss: 0.7244 - val_acc: 0.7925
Epoch 8/50
 - 1099s - loss: 0.0015 - acc: 1.0000 - val_loss: 0.6938 - val_acc: 0.7736
Epoch 00008: early stopping
0.2359690691316218 0.9242424368858337
[[49  2]
 [ 3 12]]
              precision    recall  f1-score   support

    Rejected     0.9423    0.9608    0.9515        51
     Allowed     0.8571    0.8000    0.8276        15

    accuracy  

In [10]:
import matplotlib.pyplot as plt
loss_train = history.history['train_loss']
loss_val = history.history['val_loss']
epochs = range(1,10)
plt.plot(epochs, loss_train, 'g', label='Training loss')
plt.plot(epochs, loss_val, 'b', label='validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

TypeError: 'History' object is not subscriptable

In [None]:
loss_train = history.history['acc']
loss_val = history.history['val_acc']
epochs = range(1,10)
plt.plot(epochs, loss_train, 'g', label='Training accuracy')
plt.plot(epochs, loss_val, 'b', label='validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()