# Import

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
import tensorflow as tf
from tensorflow.keras import layers, models
# from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.models import Model

In [2]:
from Saves.HelperFunctions import *
from Preprocessing import preprocess_text, category_encoding, load_glove_embeddings

In [10]:
# Master switch for the optional test-set path: every `if test_bool:` branch
# below (loading Data/test.csv, preprocessing, tokenizing, predicting) is
# skipped while this is False.
test_bool = False # set to True to include the test

In [4]:
# Load the labelled training data. The test CSV is only read when test_bool is
# enabled, so `test_data` is left undefined otherwise.
train_data = pd.read_csv('Data/train.csv')
if test_bool:
    test_data = pd.read_csv('Data/test.csv')

In [5]:
# Preview the first two rows of the training frame.
train_data.head(2)

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM


In [6]:
# Preview the first three test rows (only when the test set was loaded).
if test_bool:
    print(test_data.head(3))

   SampleID                                         Discussion
0         1  Managing cash flow effectively is crucial for ...
1         2  Civic engagement plays a key role in a democra...
2         3  Proper warm-ups and cool-downs are essential t...


# Preprocessing

In [7]:
# Drop rows whose 'Discussion' text is missing, printing the frame shape
# before and after so the number of removed rows is visible.
print(f"train_data.shape before {train_data.shape}")
train_data = train_data[train_data['Discussion'].notna()]
print(f"train_data.shape after {train_data.shape}")

train_data.shape before (24989, 3)
train_data.shape after (24646, 3)


In [8]:
# Preprocessing variant forwarded to preprocess_text for every document.
pre_method = 2

# NLP preprocessing on text: clean every training discussion, and every test
# discussion when the test path is enabled.
train_Discussion_preprocessed = [
    preprocess_text(text, pre_method)
    for text in train_data['Discussion']
]
if test_bool:
    test_Discussion_preprocessed = [
        preprocess_text(text, pre_method)
        for text in test_data['Discussion']
    ]

In [9]:
# Feature Extraction

# Vocabulary is learned from the training text only, with no cap on its size.
# (Capped alternative: Tokenizer(num_words=20000).)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_Discussion_preprocessed)

# Fixed sequence length fed to the model.
# (Alternative: seq_len = max(len(seq) for seq in X_train_seq).)
seq_len = 100

# Training text -> integer id sequences -> fixed-length matrix.
# Short sequences are zero-padded at the end; long ones are cut from the front
# (truncating='pre' is the pad_sequences default, spelled out here).
X_train_seq = tokenizer.texts_to_sequences(train_Discussion_preprocessed)
X_train = pad_sequences(X_train_seq, maxlen=seq_len, padding='post', truncating='pre')

# Same transformation for the test text, reusing the training vocabulary.
if test_bool:
    X_test_seq = tokenizer.texts_to_sequences(test_Discussion_preprocessed)
    X_test = pad_sequences(X_test_seq, maxlen=seq_len, padding='post', truncating='pre')

In [None]:
# import pickle
# with open('Saves/Delivaries/trans_tokenizer02.pkl', 'wb') as file:
#     pickle.dump(tokenizer, file)

In [16]:
# Load GloVe embeddings
# The vectors file name indicates 100-dimensional GloVe vectors, matching
# embedding_dim below.
glove_path = 'Models/Transformers/glove.6B.100d.txt'
embedding_dim = 100

# word -> integer id mapping built by the tokenizer during fit_on_texts.
word_index = tokenizer.word_index
# NOTE(review): presumably returns a 2-D matrix whose rows line up with the
# word_index ids and whose width is embedding_dim — confirm against
# Preprocessing.load_glove_embeddings.
embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim)

In [17]:
# Encoding Y_train
# Translate each 'Category' value into its encoded label via category_encoding
# (imported from Preprocessing).
Y_train = train_data['Category'].map(category_encoding)

# Fail fast if any label could not be encoded: a missing or unmapped category
# would otherwise flow into training as NaN and silently corrupt the loss.
assert Y_train.notna().all(), "Unmapped or missing Category values produced NaN labels"

In [18]:
# Sanity check: X_train and Y_train must have the same number of rows, and
# X_train / X_test must share the same sequence length.
print(f"X_train.shape: {X_train.shape}")
if test_bool:    
    print(f"X_test.shape: {X_test.shape}")
print(f"Y_train.shape: {Y_train.shape}")

X_train.shape: (24646, 100)
X_test.shape: (10557, 100)
Y_train.shape: (24646,)


In [19]:
# Preview the first three encoded labels.
Y_train.head(3)

0    1
1    4
2    4
Name: Category, dtype: int64

# Transformers

In [10]:
# Transformer hyperparameters
num_heads = 4      # attention heads in the MultiHeadAttention layer
ff_dim = 128       # hidden width of the feed-forward Dense layer
num_classes = 5    # units in the softmax output layer
print(f'seq_len = {seq_len}')

seq_len = 839


In [21]:
# Token-id input: one padded sequence of length seq_len per sample.
input_layer = Input(shape=(seq_len,))

# Embedding layer with pre-trained GloVe embeddings, kept frozen during training.
# The deprecated `input_length` argument is intentionally omitted: the sequence
# length is already fixed by the Input layer above, and Keras 3 ignores the
# argument with a deprecation warning.
# NOTE(review): mask_zero is not set, so the zero-padded positions produced by
# pad_sequences take part in attention and in the average pooling below —
# confirm this is intended.
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    trainable=False
)(input_layer)

# Transformer block: self-attention (query = key = value = embeddings),
# followed by a residual connection and layer normalization.
attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(
    query=embedding_layer, key=embedding_layer, value=embedding_layer
)
attention_output = LayerNormalization(epsilon=1e-6)(attention_output + embedding_layer)

# Feed-forward sub-layer: expand to ff_dim, project back to embedding_dim so the
# residual addition is shape-compatible, then normalize.
ff_output = Dense(ff_dim, activation='relu')(attention_output)
ff_output = Dense(embedding_dim)(ff_output)
ff_output = LayerNormalization(epsilon=1e-6)(ff_output + attention_output)

# Pooling and output: average over the sequence axis, apply dropout, then
# classify with a softmax over num_classes.
global_avg_pool = GlobalAveragePooling1D()(ff_output)
dropout_layer = Dropout(0.3)(global_avg_pool)
output_layer = Dense(num_classes, activation='softmax')(dropout_layer)

# Sparse categorical cross-entropy matches the integer-encoded Y_train labels.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [22]:
# Training configuration passed to model.fit below; `epochs` is also embedded
# in the file names used when saving the model and the test predictions.
epochs = 9
batch_size = 32
validation_split = 0.2

In [19]:
# # Assuming Y_train contains labels in integer form (e.g., [0, 1, 2, 3, 4])
# Y_train_categorical = to_categorical(Y_train, num_classes=num_classes)

In [23]:
# Train the classifier on the padded sequences, holding out a
# `validation_split` fraction of the training data for validation.
model.fit(
    x=X_train,
    y=Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split,
)

Epoch 1/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 93ms/step - accuracy: 0.5087 - loss: 1.2777 - val_accuracy: 0.6387 - val_loss: 0.9167
Epoch 2/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 95ms/step - accuracy: 0.6406 - loss: 0.9232 - val_accuracy: 0.6485 - val_loss: 0.8912
Epoch 3/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 94ms/step - accuracy: 0.6661 - loss: 0.8642 - val_accuracy: 0.6552 - val_loss: 0.8821
Epoch 4/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 93ms/step - accuracy: 0.6645 - loss: 0.8546 - val_accuracy: 0.6619 - val_loss: 0.8666
Epoch 5/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 115ms/step - accuracy: 0.6776 - loss: 0.8380 - val_accuracy: 0.6671 - val_loss: 0.8560
Epoch 6/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 133ms/step - accuracy: 0.6775 - loss: 0.8200 - val_accuracy: 0.6588 - val_loss: 0.8812
Epoch 7/9
[1m617/61

<keras.src.callbacks.history.History at 0x1f3f06bfc40>

## Evaluation

### Train

In [24]:
# Predict the labels for the test set
train_predictions = model.predict(X_train)
train_predictions

# If it's a multi-class classification task, get the predicted class for each sample
Y_train_pred = np.argmax(train_predictions, axis=1)

[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 33ms/step


array([[1.0309905e-02, 6.2808865e-01, 2.5196627e-02, 8.3644941e-02,
        2.5275987e-01],
       [6.4566001e-02, 1.2638511e-01, 2.4156246e-01, 2.2584188e-01,
        3.4164461e-01],
       [7.2269194e-04, 2.5822196e-04, 5.4064617e-03, 3.1870892e-03,
        9.9042553e-01],
       ...,
       [1.6632278e-01, 2.2475778e-01, 2.7396584e-01, 2.2523674e-01,
        1.0971674e-01],
       [2.0351753e-01, 6.2246877e-03, 2.9357573e-01, 4.8846051e-01,
        8.2216123e-03],
       [4.2860672e-02, 1.3005133e-02, 8.7519377e-01, 3.9751612e-02,
        2.9188806e-02]], dtype=float32)

In [26]:
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
# This is computed over all of X_train, which includes the rows held out by
# validation_split during fit, so it is not a pure training-subset score.
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Train Accuracy: {train_accuracy}")

Train Accuracy: 0.6986529254240039


In [27]:
# Optionally persist the trained model. input() blocks until the user replies,
# and only the exact reply '0' triggers the save.
if input('Press 0 to save the model') == '0':
    # File name encodes the preprocessing method, the epoch count, and the
    # train accuracy truncated to a whole percent.
    file_name = f'Trans-m{pre_method}-e{epochs}-a{int(train_accuracy*100)}'
    model.save(f'Models/Transformers/{file_name}.h5')
    print(f'{file_name} saved successfully')



Trans-m2-e9-a69 saved successfully


### Test

In [28]:
if test_bool:
    # Class probabilities for every test sample
    test_predictions = model.predict(X_test)

    # Index of the most probable class per sample
    Y_test_pred = test_predictions.argmax(axis=1)

    # Only the exact reply '0' writes the predictions out via the save_csv helper
    if input('Press 0 to save the test predictions') == '0':
        file_name = f'Trans-m{pre_method}-e{epochs}'
        save_csv(
            data=Y_test_pred,
            file_name=file_name,
            header=['SampleID', 'Category'],
            numbering=True,
        )

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step
Data saved to Saves/Trans-m2-e9.csv
