In [None]:
import csv  
import re
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import Dropout
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score

# --- Tokenisation / model hyper-parameters ---------------------------------
MAX_NB_WORDS = 50000         # passed to Tokenizer(num_words=...) and used as the Embedding input size
MAX_SEQUENCE_LENGTH = 200    # maxlen handed to pad_sequences
EMBEDDING_DIM = 100          # output size of the Embedding layer

# --- Training settings ------------------------------------------------------
BATCH_SIZE = 32
EPOCHS = 4

# --- Input-column configurations -------------------------------------------
# Each entry lists the DataFrame columns whose text is joined (in this order)
# to form the model input for one run.
CONFIGS = [
    ['L_node_1', 'L_node_2'],
    ['context_L_node', 'L_node_1', 'L_node_2'],
    ['I_node_1', 'I_node_2'],
    ['context_I_node', 'I_node_1', 'I_node_2'],
]

In [None]:
def create_X_and_Y(config, data):
    """Encode the text columns named in ``config`` and the ``prop_rel`` labels.

    The selected columns are cast to ``str`` and joined with single spaces,
    giving one combined string per row. A new ``Tokenizer`` is created and
    fitted on those strings on every call, so the word indices in ``X`` are
    specific to the ``data`` passed in; likewise the columns of ``y`` are
    derived from the label values present in ``data``.

    Args:
        config: Non-empty sequence of column names in ``data``; the columns
            are joined in the order given.
        data: DataFrame containing every column in ``config`` plus a
            ``prop_rel`` label column.

    Returns:
        Tuple ``(X, y)``. ``X`` is the result of ``pad_sequences`` with
        ``maxlen=MAX_SEQUENCE_LENGTH`` applied to the tokenised rows; ``y``
        is ``pd.get_dummies(data['prop_rel']).values``.
    """
    # The backslash is escaped explicitly: a bare '\]' is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions. The
    # resulting filter characters are unchanged.
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=filters, lower=True)

    # NOTE(review): astype(str) turns missing values into the literal text
    # 'nan', which is then tokenised like any other word -- confirm intended.
    combined_text = data[config[0]].astype(str)
    for column in config[1:]:
        # Rebind instead of using += so no Series is ever modified in place.
        combined_text = combined_text + ' ' + data[column].astype(str)
    texts = combined_text.values

    # Fit and transform on the very same array of strings.
    tokenizer.fit_on_texts(texts)
    X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)

    y = pd.get_dummies(data['prop_rel']).values
    return X, y

In [None]:
def create_model(X, num_classes=4):
    """Build and compile the Embedding -> LSTM -> softmax classifier.

    Args:
        X: Padded sequence array. Only ``X.shape[1]`` is read; it is passed
            to the Embedding layer as ``input_length``.
        num_classes: Number of units in the softmax output layer. Defaults
            to 4. It has to match the number of columns of the one-hot
            label array the model is trained on.

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss,
        the Adam optimizer and accuracy as the tracked metric.
    """
    model = Sequential([
        Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
        SpatialDropout1D(0.2),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def create_confusion_matrix_and_fscore(Y_predicted, y_test, config):
    """Plot a confusion matrix for the predictions and return the macro F1 score.

    Args:
        Y_predicted: 2-D array of per-class scores, one row per sample; the
            column with the highest score is taken as the predicted class.
        y_test: 2-D one-hot array of true labels, one row per sample; the
            column holding the maximum is taken as the true class.
        config: Sequence of column names identifying the run; used as the
            title of the confusion-matrix plot.

    Returns:
        The macro-averaged F1 score computed by ``f1_score``.
    """
    # The arg-max of each score row is the predicted class index. Expanding
    # the rows to one-hot vectors first and taking the arg-max again would
    # give exactly the same indices, so that round trip is skipped.
    y_pred = np.argmax(Y_predicted, axis=1)
    y_true = np.argmax(y_test, axis=1)

    f_score = f1_score(y_true, y_pred, average='macro')

    cm = confusion_matrix(y_true, y_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    # Label the figure with the run's columns so plots from different
    # configurations can be told apart.
    cm_display.ax_.set_title(', '.join(str(entry) for entry in config))

    return f_score

In [None]:
# Raw strings keep the Windows-style backslashes literal. Without the r-prefix
# '\r' in the results path is read as a carriage return, and '\d' / '\L' are
# invalid escape sequences.
train_data = pd.read_csv(r'path\dataQT30.csv')
test_data = pd.read_csv(r'path\dataQT31-40.csv')

# create_X_and_Y fits a fresh tokenizer and derives the one-hot label columns
# from whatever frame it receives. Encoding train and test in separate calls
# therefore gives the test set different word indices (and possibly different
# label columns) than the model was trained on. Encoding both frames in one
# call and splitting by position keeps the two sets consistent.
# NOTE(review): this builds the vocabulary from train + test text; if strict
# separation is required, fit the tokenizer on the training text only.
all_data = pd.concat([train_data, test_data], ignore_index=True)
n_train = len(train_data)

# newline='' is what the csv module expects for files handed to csv.writer;
# without it, extra blank rows appear on platforms that translate '\n'.
with open(r'path\results\LSTM_results.csv', 'w', encoding='UTF8', newline='') as file:

    writer = csv.writer(file)
    writer.writerow(['Configuration', 'Batches', 'Epochs', 'F-score'])

    # One train/evaluate run per input-column configuration.
    for config in CONFIGS:
        print('Now testing model with config: ' + str(config) + ', batch_size: ' + str(BATCH_SIZE) + ', epochs: ' + str(EPOCHS))

        X_all, y_all = create_X_and_Y(config, all_data)
        X_train, y_train = X_all[:n_train], y_all[:n_train]
        X_test, y_test = X_all[n_train:], y_all[n_train:]

        model = create_model(X_train)

        history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)
        Y_predicted = model.predict(X_test)
        f_score = create_confusion_matrix_and_fscore(Y_predicted, y_test, config)
        # config is written as the string form of the list of column names.
        writer.writerow([config, BATCH_SIZE, EPOCHS, f_score])