## LSTM for Multiclass Classification

In [None]:
import csv  
import numpy as np 
import pandas as pd
import re
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, ConfusionMatrixDisplay
from keras.layers import Dropout
from sklearn.utils import class_weight

# Text-encoding hyper-parameters.
MAX_NB_WORDS = 50000        # Tokenizer `num_words` and Embedding input dimension
MAX_SEQUENCE_LENGTH = 400   # `maxlen` handed to pad_sequences
EMBEDDING_DIM = 100         # Embedding output dimension

# Training grid: every batch size is combined with every epoch count.
BATCH_SIZES = [16, 32, 64]
EPOCHS = [6, 8, 10]

# Text columns that can be fed to the model.
_QUESTION = "processed_question_locution"
_RESPONSE = "processed_response_locutions"
_PRECEDING = "processed_preceding_locution"

# Each entry lists the columns that are concatenated (in this order) into
# one input text for a single experiment.
CONFIGS = [
    [_QUESTION],
    [_RESPONSE],
    [_PRECEDING],
    [_QUESTION, _RESPONSE],
    [_QUESTION, _PRECEDING],
    [_PRECEDING, _RESPONSE],
    [_QUESTION, _PRECEDING, _RESPONSE],
]

In [None]:
def create_X_and_Y(config, data):
    """Encode the text columns named in *config* and one-hot encode the labels.

    The columns listed in ``config`` are cast to ``str`` and joined with a
    single space (in the given order). A fresh ``Tokenizer`` is fitted on the
    joined text, the text is converted to integer sequences, and the
    sequences are passed through ``pad_sequences`` with
    ``maxlen=MAX_SEQUENCE_LENGTH``.

    NOTE: the tokenizer vocabulary and the label columns are both derived
    from *data* on every call. Two separate calls (e.g. one for a training
    frame and one for a test frame) therefore produce token ids, and possibly
    label columns, that are NOT comparable with each other. Encode rows that
    must share a vocabulary in a single call.

    Args:
        config: Non-empty sequence of column names present in ``data``.
        data: DataFrame holding the columns in ``config`` plus a
            ``question_type`` label column.

    Returns:
        Tuple ``(X, y)``: ``X`` is the padded sequence array returned by
        ``pad_sequences``; ``y`` is ``pd.get_dummies(data['question_type'])``
        as an array, i.e. one column per distinct label value found in
        ``data``.

    Raises:
        ValueError: If ``config`` is empty.
    """
    if not config:
        raise ValueError('config must name at least one text column')

    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence "\]" (a SyntaxWarning on recent Python versions). The
    # characters handed to the Tokenizer are unchanged.
    tokenizer = Tokenizer(
        num_words=MAX_NB_WORDS,
        filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
        lower=True,
    )

    # astype(str) returns a new Series, so ``data`` itself is never modified.
    combined_text = data[config[0]].astype(str)
    for column in config[1:]:
        combined_text = combined_text + ' ' + data[column].astype(str)

    tokenizer.fit_on_texts(combined_text)
    X = tokenizer.texts_to_sequences(combined_text.values)
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

    y = pd.get_dummies(data['question_type']).values
    return X, y

In [None]:
def create_model(X):
    """Build and compile the LSTM classifier used for every experiment.

    The network is: Embedding -> SpatialDropout1D(0.2) -> LSTM(100) ->
    Dense(3, softmax), compiled with categorical cross-entropy, the Adam
    optimizer and accuracy as the reported metric.

    Args:
        X: Padded input array; only ``X.shape[1]`` (the sequence length) is
            read, to set the Embedding layer's ``input_length``.

    Returns:
        The compiled ``Sequential`` model (not yet trained).
    """
    sequence_length = X.shape[1]
    layer_stack = [
        Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=sequence_length),
        SpatialDropout1D(0.2),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        # Three output units: one per class of the one-hot label vector.
        Dense(3, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [None]:
def create_confusion_matrix_and_accuracy(Y_predicted, y_test, batch_size, epochs, config):
    """Score predictions and save the confusion-matrix plot for one run.

    Args:
        Y_predicted: 2-D array of per-class scores, one row per sample (the
            output of ``model.predict``).
        y_test: 2-D one-hot array of true labels, same row order.
        batch_size: Batch size of the run; only used in the file name.
        epochs: Epoch count of the run; only used in the file name.
        config: List of column names of the run; joined with spaces to form
            the output sub-folder name.

    Returns:
        Tuple ``(accuracy, balanced_accuracy)`` as computed by scikit-learn.

    Side effects:
        Writes ``path/confusion matrices/LSTM-multi/<config>/batch<B> epochs<E>.png``
        (creating the folder if needed) and closes the figure afterwards.
    """
    import os  # local import: only needed for creating the output folder

    # The arg-max of the raw scores is the predicted class index; building an
    # intermediate one-hot matrix first yields exactly the same indices.
    y_pred = np.argmax(Y_predicted, axis=1)
    y_true = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_true, y_pred)
    accuracy_balanced = balanced_accuracy_score(y_true, y_pred)

    cm = confusion_matrix(y_true, y_pred)
    display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()

    folder_name = ' '.join(entry.split()[0] for entry in config)
    output_dir = 'path/confusion matrices/LSTM-multi/' + folder_name
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    display.figure_.savefig(output_dir + '/batch' + str(batch_size) + ' epochs' + str(epochs) + '.png')

    # One figure is created per run; close it so figures do not pile up in
    # memory across the whole hyper-parameter grid.
    plt.close(display.figure_)

    return accuracy, accuracy_balanced

In [None]:
def _clean_locution_columns(frame):
    """Normalise the free-text columns of *frame* in place.

    - ``processed_preceding_locution``: values are cast to ``str``; a value
      that is exactly ``nan`` (a missing cell after the cast) becomes a
      single space.
    - ``processed_response_locutions``: values are cast to ``str``; square
      brackets, quotes and commas are replaced by spaces.
    """
    frame['processed_preceding_locution'] = frame['processed_preceding_locution'].apply(
        lambda value: re.sub(r'^nan$', ' ', str(value)))
    frame['processed_response_locutions'] = frame['processed_response_locutions'].apply(
        lambda value: re.sub(r'[\[\]\'\",]', ' ', str(value)))


train_data = pd.read_csv('path/QT-Questions-train-over.csv')
test_data = pd.read_csv('path/QT-Questions-test.csv')

for frame in (train_data, test_data):
    _clean_locution_columns(frame)

# create_X_and_Y fits a new tokenizer (and derives the one-hot label columns)
# from whatever frame it receives. Encoding train and test separately would
# give the two sets unrelated token ids, so the test sequences would not
# match the embedding rows learned during training. Encode both in one call
# and split afterwards. Only the token->id mapping and the label column
# layout are shared; the model is still fitted on the training rows alone.
combined_data = pd.concat([train_data, test_data], ignore_index=True)
n_train = len(train_data)

# newline='' is what the csv module expects for files it writes; without it
# extra blank rows can appear on platforms that translate line endings.
with open('path/results/LSTM-multi.csv', 'w', encoding='UTF8', newline='') as file:

    writer = csv.writer(file)
    writer.writerow(['Configuration', 'Batches', 'Epochs', 'Accuracy', 'Balanced accuracy'])

    for config in CONFIGS:
        # The encoded data depends only on the column configuration, so it is
        # computed once per config rather than once per grid point.
        X_all, y_all = create_X_and_Y(config, combined_data)
        X_train, y_train = X_all[:n_train], y_all[:n_train]
        X_test, y_test = X_all[n_train:], y_all[n_train:]

        for batch_size in BATCH_SIZES:
            for epochs in EPOCHS:
                print('Now testing model with config: ' + str(config) + ', batch_size: ' + str(batch_size) + ', epochs: ' + str(epochs))

                # A fresh, untrained model for every grid point.
                model = create_model(X_train)

                model.fit(
                    X_train,
                    y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
                )
                Y_predicted = model.predict(X_test)
                accuracy, accuracy_balanced = create_confusion_matrix_and_accuracy(Y_predicted, y_test, batch_size, epochs, config)
                writer.writerow([config, batch_size, epochs, accuracy, accuracy_balanced])
                # Persist each result immediately so a crash late in the grid
                # does not lose the rows already computed.
                file.flush()