In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
from keras.models import load_model
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
from Saves.HelperFunctions import *
from Preprocessing import preprocess_text, load_glove_embeddings

In [4]:
# Read the test split from disk; the cells below derive everything from this frame.
test_csv_path = 'Data/test.csv'
data = pd.read_csv(test_csv_path)

In [5]:
# Drop rows whose 'Discussion' text is missing (NaN) and report the frame
# shape before and after, so any removed rows are visible.
# The frame is read from Data/test.csv, hence the "test_data" label.
print(f"test_data.shape before {data.shape}")
data = data.dropna(subset=['Discussion'])
print(f"test_data.shape after {data.shape}")

train_data.shape before (10557, 2)
train_data.shape after (10557, 2)


In [6]:
# Pull out the text column and cast every entry to str before it is handed
# to the preprocessing step.
discussion_column = data['Discussion']
X_test = discussion_column.astype(str)

In [7]:
# Sanity check: show the first two discussions.
X_test.iloc[:2]

0    Managing cash flow effectively is crucial for ...
1    Civic engagement plays a key role in a democra...
Name: Discussion, dtype: object

In [8]:
# Folder name used below both under 'Saves/' (pickled vectorizer / tokenizers)
# and as the prefix of the save_csv file names.
# NOTE(review): 'Delivaries' looks like a misspelling of 'Deliveries', but the
# value is used verbatim in paths — confirm against the folder on disk before renaming.
saves_dir = 'Delivaries'

# Preprocessing

In [9]:
# Clean every discussion with the project's preprocess_text helper.
# pre_method selects the preprocessing variant — presumably it must match the
# variant used when the loaded models were trained; TODO confirm.
pre_method = 2
test_Discussion_preprocessed = [
    preprocess_text(raw_text, pre_method)
    for raw_text in X_test
]

# FFNN

In [None]:
# Restore the saved TF-IDF vectorizer and use it to vectorize the cleaned test text.
# NOTE: pickle.load can execute arbitrary code — only unpickle files you trust.
tfidf_pickle_path = f'Saves/{saves_dir}/tfidf_vectorizer01.pkl'
with open(tfidf_pickle_path, 'rb') as pickle_file:
    vectorizer = pickle.load(pickle_file)
print('vectorizer loaded successfully...')

ffnn_X_test = vectorizer.transform(test_Discussion_preprocessed)

In [None]:
# Load the saved FFNN model from its .h5 file.
# NOTE(review): unlike the GRU / Transformer files, this path has no directory
# prefix, so it resolves against the current working directory — confirm.

# ffnn_model_path = 'FFNN-m4-e2-a92.h5'
ffnn_model_path = 'FFNN-m2-e3-a93.h5'
ffnn_model = load_model(ffnn_model_path)

In [None]:
# Run the FFNN on the TF-IDF features.
ffnn_predictions = ffnn_model.predict(ffnn_X_test)

# Take the position of the largest value along axis 1 as the predicted class
# (assumes one row per sample with one score per class).
ffnn_Y_pred = np.asarray(ffnn_predictions).argmax(axis=1)

In [None]:
# Ask before writing: save_csv is only called when the user answers exactly '0'.
save_choice = input('Press 0 to save the predictions')
if save_choice == '0':
    save_csv(
        data=ffnn_Y_pred,
        file_name=f'{saves_dir}/FFNN',
        header=['SampleID', 'Category'],
        numbering=True,
    )

# GRU

In [None]:
# load tokenizer
with open(f'Saves/{saves_dir}/gru_tokenizer01.pkl', 'rb') as file:
    gru_tokenizer = pickle.load(file)
    print('tokenizer loaded successfully...')

In [None]:
# Encode the cleaned discussions with the GRU tokenizer, then bring every
# sequence to exactly seq_len entries (padding is appended at the end
# because of padding='post').
seq_len = 100
X_test_seq = gru_tokenizer.texts_to_sequences(test_Discussion_preprocessed)
gru_X_test = pad_sequences(X_test_seq, maxlen=seq_len, padding='post')

In [None]:
# Load the saved GRU model.
# Forward slashes keep the path portable (and consistent with the other paths
# in this notebook): backslash separators are only understood on Windows.

# Alternative checkpoints:
# gru_model_path = 'Models/RNN/GRU-e10-a81.h5'
# gru_model_path = 'Models/RNN/GRU-m2-e9-a85'
gru_model_path = 'Models/RNN/GRU-m2-e9-a80.h5'
gru_model = load_model(gru_model_path)

In [None]:
# Run the GRU model on the padded test sequences.
gru_predictions = gru_model.predict(gru_X_test)

# Take the position of the largest value along axis 1 as the predicted class
# (assumes one row per sample with one score per class).
gru_Y_pred = np.asarray(gru_predictions).argmax(axis=1)

In [None]:
# Ask before writing: save_csv is only called when the user answers exactly '0'.
save_choice = input('Press 0 to save the predictions')
if save_choice == '0':
    save_csv(
        data=gru_Y_pred,
        file_name=f'{saves_dir}/GRU',
        header=['SampleID', 'Category'],
        numbering=True,
    )

# Transformers

In [10]:
# Restore the tokenizer saved for the Transformer model.
# NOTE: pickle.load can execute arbitrary code — only unpickle files you trust.
trans_tokenizer_path = f'Saves/{saves_dir}/trans_tokenizer02.pkl'
with open(trans_tokenizer_path, 'rb') as pickle_file:
    trans_tokenizer = pickle.load(pickle_file)
print('tokenizer loaded successfully...')

tokenizer loaded successfully...


In [11]:
# Encode the cleaned discussions with the Transformer tokenizer, then bring
# every sequence to exactly seq_len entries (padding is appended at the end
# because of padding='post').
seq_len = 100
# seq_len = 839 # max
X_test_seq = trans_tokenizer.texts_to_sequences(test_Discussion_preprocessed)
trans_X_test = pad_sequences(X_test_seq, maxlen=seq_len, padding='post')

In [16]:
# Build the embedding matrix for the Transformer from the GloVe vector file.
# embedding_dim matches the "100d" GloVe file that is loaded.
embedding_dim = 100
glove_path = 'Models/Transformers/glove.6B.100d.txt'

# The tokenizer's word_index is handed to the project helper together with
# the GloVe file path and the vector size.
word_index = trans_tokenizer.word_index
embedding_matrix = load_glove_embeddings(
    glove_path,
    word_index,
    embedding_dim,
)

In [18]:
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.models import Model

In [20]:
# Hyper-parameters consumed by the Transformer model definition below.
num_heads = 4      # passed to MultiHeadAttention
ff_dim = 128       # units of the first feed-forward Dense layer
num_classes = 5    # units of the softmax output layer
print(f'seq_len = {seq_len}')

seq_len = 100


In [21]:
# Rebuild the Transformer classifier architecture so that saved weights can be
# loaded into it afterwards. The order in which layers are created is kept
# fixed on purpose: do not reorder these statements.
input_layer = Input(shape=(seq_len,))

# Embedding layer initialised with the GloVe matrix and kept frozen.
# The sequence length is already fixed by the Input layer above, so the
# deprecated `input_length` argument is not passed.
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    trainable=False
)(input_layer)

# Transformer block: self-attention (query, key and value are all the
# embeddings), followed by a residual connection and layer normalisation.
attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(
    query=embedding_layer, key=embedding_layer, value=embedding_layer
)
attention_output = LayerNormalization(epsilon=1e-6)(attention_output + embedding_layer)

# Position-wise feed-forward part: expand to ff_dim, project back to
# embedding_dim, then a second residual connection and layer normalisation.
ff_output = Dense(ff_dim, activation='relu')(attention_output)
ff_output = Dense(embedding_dim)(ff_output)
ff_output = LayerNormalization(epsilon=1e-6)(ff_output + attention_output)

# Pooling and output: average over the sequence axis, apply dropout, and
# classify into num_classes with a softmax layer.
global_avg_pool = GlobalAveragePooling1D()(ff_output)
dropout_layer = Dropout(0.3)(global_avg_pool)
output_layer = Dense(num_classes, activation='softmax')(dropout_layer)

trans_model = Model(inputs=input_layer, outputs=output_layer)
trans_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
trans_model.summary()

In [32]:
# Load saved weights into the Transformer architecture built above.
# Forward slashes keep the path portable (and consistent with glove_path,
# which points into the same folder): backslash separators are only
# understood on Windows.

# Alternative: load a full saved model instead of weights only.
# trans_model_path = 'Models/Transformers/Trans-m2-e9-a69.h5'
# trans_model_path = 'Models/Transformers/Trans-m2-e15-a72.h5'
# trans_model = load_model(trans_model_path)

trans_weight_path = 'Models/Transformers/Trans-m2-e9-a69.weights.h5'
trans_model.load_weights(trans_weight_path)

In [None]:
# trans_model.save_weights('Models\\Transformers\\Trans-m2-e9-a69.weights.h5')

In [33]:
# Run the Transformer model on the padded test sequences.
trans_predictions = trans_model.predict(trans_X_test)

# Take the position of the largest value along axis 1 as the predicted class
# (one softmax score per class for each sample).
trans_Y_pred = np.asarray(trans_predictions).argmax(axis=1)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step


In [None]:
# Ask before writing: save_csv is only called when the user answers exactly '0'.
save_choice = input('Press 0 to save the predictions')
if save_choice == '0':
    save_csv(
        data=trans_Y_pred,
        file_name=f'{saves_dir}/Transformers',
        header=['SampleID', 'Category'],
        numbering=True,
    )