# Import

In [31]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import GRU

In [30]:
from Saves.HelperFunctions import *
from Preprocessing import preprocess_text, category_encoding

In [25]:
# Switch for the optional test-set path: when True, Data/test.csv is loaded,
# preprocessed, vectorised and predicted on by the guarded cells further down.
test_bool = True # set to True to include the test

In [26]:
# Load the labelled training set; the test set is read only when the test path is enabled.
train_data = pd.read_csv('Data/train.csv')
if test_bool:
    # NOTE: test_data stays undefined when test_bool is False, so every later use must be guarded by it.
    test_data = pd.read_csv('Data/test.csv')

In [6]:
# Preview the first two training rows (bare expression, so the notebook renders the frame).
train_data.head(2)

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM


In [27]:
# Preview the first three test rows; guarded because test_data only exists when test_bool is True.
if test_bool:
    print(test_data.head(3))

   SampleID                                         Discussion
0         1  Managing cash flow effectively is crucial for ...
1         2  Civic engagement plays a key role in a democra...
2         3  Proper warm-ups and cool-downs are essential t...


# Preprocessing

In [32]:
# Remove rows whose 'Discussion' text is missing and report the row count before/after.
print('Drop Nan...')
print(f"\ttrain_data.shape before {train_data.shape}")
# reset_index relabels the surviving rows 0..n-1. Without it the frame (and the
# label Series derived from it) keeps gaps in its index, while the feature
# matrix and predictions built later are purely positional, so label-based and
# positional lookups of the same row would disagree.
train_data = train_data.dropna(subset=['Discussion']).reset_index(drop=True)
print(f"\ttrain_data.shape after {train_data.shape}")

Drop Nan...
	train_data.shape before (24989, 3)
	train_data.shape after (24646, 3)


In [33]:
# Preprocessing variant, forwarded to preprocess_text as its second argument.
pre_method = 2

# Run the NLP preprocessing over every discussion text; the test texts are
# processed only when the test path is enabled.
train_Discussion_preprocessed = [
    preprocess_text(text, pre_method)
    for text in train_data['Discussion']
]
if test_bool:
    test_Discussion_preprocessed = [
        preprocess_text(text, pre_method)
        for text in test_data['Discussion']
    ]

In [34]:
# Feature extraction: turn each preprocessed text into a fixed-length row of word indices.

num_words = 20000  # vocabulary cap handed to the Tokenizer
seq_len = 100      # length every sequence is brought to by pad_sequences

# The vocabulary is fitted on the training texts only and reused for the test texts.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_Discussion_preprocessed)

# Training texts -> index sequences -> matrix with padding added at the end ('post').
X_train_seq = tokenizer.texts_to_sequences(train_Discussion_preprocessed)
X_train = pad_sequences(X_train_seq, maxlen=seq_len, padding='post')

# Same two steps for the test texts when the test path is enabled.
if test_bool:
    X_test_seq = tokenizer.texts_to_sequences(test_Discussion_preprocessed)
    X_test = pad_sequences(X_test_seq, maxlen=seq_len, padding='post')

In [11]:
# Encode the category names as class ids using category_encoding (imported from Preprocessing).
Y_train = train_data['Category'].map(category_encoding)

# Series.map leaves NaN for any value it cannot translate (a missing or unknown
# category). Stop here with a clear message instead of handing corrupt labels
# to to_categorical further down.
unmapped_labels = Y_train.isna()
if unmapped_labels.any():
    raise ValueError(
        f"{int(unmapped_labels.sum())} training rows have a missing or unmapped Category"
    )

In [35]:
# Report the shapes of the model inputs and labels (test matrix only when enabled).
shape_report = [f"X_train.shape: {X_train.shape}"]
if test_bool:
    shape_report.append(f"X_test.shape: {X_test.shape}")
shape_report.append(f"Y_train.shape: {Y_train.shape}")
print("\n".join(shape_report))

X_train.shape: (24646, 100)
X_test.shape: (10557, 100)
Y_train.shape: (24646,)


In [13]:
# Peek at the first three encoded labels.
Y_train.head(3)

0    1
1    4
2    4
Name: Category, dtype: int64

# RNN

In [14]:
# Echo the two sizes the embedding layer is configured with: vocabulary cap and sequence length.
for setting_name, setting_value in (("num_words", num_words), ("seq_len", seq_len)):
    print(f"{setting_name}: {setting_value}")

num_words: 20000
seq_len: 100


In [15]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, LayerNormalization

num_classes = 5
weight_decay = 0.0005  # L2 factor shared by the embedding, recurrent and dense kernels

# Text classifier: embedding -> three stacked bidirectional GRUs -> two dense layers -> softmax.
model = Sequential([
    # Trainable 128-dimensional vector per token id, for sequences of seq_len ids.
    Embedding(
        input_dim=num_words,
        output_dim=128,
        input_length=seq_len,
        trainable=True,
        embeddings_regularizer=l2(weight_decay),
    ),

    # The first two recurrent blocks return the full sequence so the next GRU can consume it.
    Bidirectional(GRU(256, return_sequences=True, kernel_regularizer=l2(weight_decay))),
    LayerNormalization(),
    Dropout(0.3),

    Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(weight_decay))),
    LayerNormalization(),
    Dropout(0.3),

    # The last recurrent block returns only its final output (return_sequences=False).
    Bidirectional(GRU(64, return_sequences=False, kernel_regularizer=l2(weight_decay))),
    Dropout(0.3),

    # Fully connected head with dropout after each hidden layer.
    Dense(512, activation='relu', kernel_regularizer=l2(weight_decay)),
    Dropout(0.4),
    Dense(256, activation='relu', kernel_regularizer=l2(weight_decay)),
    Dropout(0.4),

    # One softmax output per class.
    Dense(num_classes, activation='softmax'),
])

# categorical_crossentropy takes one-hot targets, so the integer labels must be
# converted with to_categorical before fitting.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary.
model.summary()





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          2560000   
                                                                 
 bidirectional (Bidirection  (None, 100, 512)          592896    
 al)                                                             
                                                                 
 layer_normalization (Layer  (None, 100, 512)          1024      
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 100, 512)          0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 100, 256)          493056    
 onal)                                                           
                                                     

In [16]:
# Training hyperparameters passed to model.fit.
# Number of passes over the training data (also embedded in the saved file names).
epochs = 9
# Number of samples per batch.
batch_size = 32
# Fraction of the training data that model.fit sets aside for validation.
validation_split = 0.2

In [17]:
# One-hot encode the integer class ids into num_classes columns; the model is
# compiled with categorical_crossentropy, which takes targets in this form.
# Assumes Y_train holds integer labels in the range [0, num_classes).
Y_train_categorical = to_categorical(Y_train, num_classes=num_classes)

In [18]:
# Train the model. The History object returned by fit is bound to `history` so
# the per-epoch training/validation metrics stay available for later cells
# instead of being lost once the cell's output repr is displayed.
history = model.fit(
    X_train,
    Y_train_categorical,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split,
)

Epoch 1/9


Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.src.callbacks.History at 0x18587082850>

## Evaluation

### Train

In [19]:
# Class probabilities for every training row (one column per class).
train_predictions = model.predict(X_train)

# Predicted label = index of the most probable class in each row.
Y_train_pred = train_predictions.argmax(axis=1)



In [20]:
# accuracy_score's signature is (y_true, y_pred): ground-truth labels go first.
# NOTE: X_train still contains the rows that model.fit set aside through
# validation_split, so this figure covers the whole labelled set rather than
# only the samples the weights were fitted on.
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Train Accuracy: {train_accuracy}")

Train Accuracy: 0.8052422299764668


In [21]:
# Saving is opt-in: only an answer of '0' at the prompt writes the model to disk.
save_requested = input('Press 0 to save the model') == '0'
if save_requested:
    # The file name records the preprocessing method, the epoch count and the
    # training accuracy truncated to a whole percentage.
    accuracy_pct = int(train_accuracy*100)
    file_name = f'GRU-m{pre_method}-e{epochs}-a{accuracy_pct}'
    model.save(f'Models/RNN/{file_name}.h5')
    print(f'{file_name} saved successfully')

GRU-m2-e9-a80 saved successfully


  saving_api.save_model(


### Test

In [37]:
if test_bool:
    # Class probabilities for every test row, reduced to the most probable class index.
    test_predictions = model.predict(X_test)
    Y_test_pred = test_predictions.argmax(axis=1)

    # Exporting is opt-in: only an answer of '0' at the prompt writes the predictions.
    export_requested = input('Press 0 to save the test predictions') == '0'
    if export_requested:
        file_name = f'GRU-m{pre_method}-e{epochs}-predictions'
        # save_csv is presumably provided by the star import from Saves.HelperFunctions;
        # how it uses header/numbering is not visible here — verify against the helper.
        save_csv(
            data=Y_test_pred,
            file_name=file_name,
            header=['SampleID', 'Category'],
            numbering=True,
        )

Data saved to Saves/GRU-m2-e9-predictions.csv
