# Import

In [10]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import GRU

In [2]:
from Saves.HelperFunctions import *
from Preprocessing import preprocess_text, category_encoding

In [3]:
test_bool = False # set to True to include the test

In [4]:
train_data = pd.read_csv('Data/train.csv')
if test_bool:
    test_data = pd.read_csv('Data/test.csv')

In [5]:
train_data.head(2)

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM


In [6]:
if test_bool:
    print(test_data.head(3))

# Preprocessing

In [7]:
print('Drop Nan...')
print(f"\ttrain_data.shape before {train_data.shape}")
train_data = train_data.dropna(subset=['Discussion'])
print(f"\ttrain_data.shape after {train_data.shape}")

Drop Nan...
	train_data.shape before (24989, 3)
	train_data.shape after (24646, 3)


In [8]:
pre_method = 2

# NLP preprocessing on text
train_Discussion_preprocessed = [preprocess_text(discussion, pre_method) for discussion in train_data['Discussion']]
if test_bool:
    test_Discussion_preprocessed = [preprocess_text(discussion, pre_method) for discussion in test_data['Discussion']]

start preprocessing...


In [None]:
# Feature Extraction

num_words = 20000

tokenizer = Tokenizer(num_words=num_words)  # Set max vocabulary size
tokenizer.fit_on_texts(train_Discussion_preprocessed) # Fit tokenizer on training data 

X_train_seq = tokenizer.texts_to_sequences(train_Discussion_preprocessed)
if test_bool:    
    X_test_seq = tokenizer.texts_to_sequences(test_Discussion_preprocessed)

seq_len = 100

# Padding
X_train = pad_sequences(X_train_seq, maxlen=seq_len, padding='post')
if test_bool:    
    X_test = pad_sequences(X_test_seq, maxlen=seq_len, padding='post')

	Num of words: 20000
Tokenizer...
finish fitting...


In [12]:
# Encoding Y_train
Y_train = train_data['Category'].map(category_encoding)

Encoding Y_train...


In [13]:
print(f"X_train.shape: {X_train.shape}")
if test_bool:    
    print(f"X_test.shape: {X_test.shape}")
print(f"Y_train.shape: {Y_train.shape}")

X_train.shape: (24646, 100)
Y_train.shape: (24646,)


In [14]:
Y_train.head(3)

0    1
1    4
2    4
Name: Category, dtype: int64

# RNN

In [15]:
print(f"num_words: {num_words}")
print(f"seq_len: {seq_len}")

num_words: 20000
seq_len: 100


In [17]:
num_classes = 5
# Define the model
model = Sequential()

# Embedding layer to convert word indices into dense vectors of fixed size
model.add(Embedding(input_dim=num_words, output_dim=128, input_length=seq_len))

if False:
    # RNN layer (SimpleRNN)
    model.add(SimpleRNN(units=64, return_sequences=False))
elif True:
    # GRU layer
    model.add(GRU(units=64, return_sequences=False, activation='tanh'))

# Dropout for regularization
model.add(Dropout(0.5))

# Fully connected layer for classification
model.add(Dense(5, activation='softmax'))  # Adjust output units for the number of classes

# Explicitly build the model
model.build(input_shape=(None, seq_len))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()



In [18]:
epochs = 9
batch_size = 32
validation_split = 0.2

In [19]:
# Assuming Y_train contains labels in integer form (e.g., [0, 1, 2, 3, 4])
Y_train_categorical = to_categorical(Y_train, num_classes=num_classes)

In [None]:
model.fit(X_train, Y_train_categorical, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

Epoch 1/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 67ms/step - accuracy: 0.2275 - loss: 1.6013 - val_accuracy: 0.2325 - val_loss: 1.5984
Epoch 2/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 66ms/step - accuracy: 0.2455 - loss: 1.5860 - val_accuracy: 0.3114 - val_loss: 1.4894
Epoch 3/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 67ms/step - accuracy: 0.3260 - loss: 1.4624 - val_accuracy: 0.4138 - val_loss: 1.3500
Epoch 4/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 68ms/step - accuracy: 0.5424 - loss: 1.1460 - val_accuracy: 0.6535 - val_loss: 0.9212
Epoch 5/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 70ms/step - accuracy: 0.7537 - loss: 0.7139 - val_accuracy: 0.6807 - val_loss: 0.8752
Epoch 6/9
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 70ms/step - accuracy: 0.8277 - loss: 0.5073 - val_accuracy: 0.6621 - val_loss: 0.9607
Epoch 7/9
[1m617/617

<keras.src.callbacks.history.History at 0x1cbaf7c0b50>

## Evaluation

### Train

In [None]:
# Predict the labels for the test set
train_predictions = model.predict(X_train)

# If it's a multi-class classification task, get the predicted class for each sample
Y_train_pred = np.argmax(train_predictions, axis=1)

[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step


In [22]:
train_accuracy = accuracy_score(Y_train_pred, Y_train)
print(f"Train Accuracy: {train_accuracy}")

Train Accuracy: 0.8743406637993995


In [None]:
if input('Press 0 to save the model') == '0':
    file_name = f'GRU-m{pre_method}-e{epochs}-a{int(train_accuracy*100)}'
    model.save(f'Models/RNN/{file_name}.h5')
    print(f'{file_name} saved successfully')

### Test

In [None]:
if test_bool:    
    # Predict the labels for the test set
    test_predictions = model.predict(X_test)

    # If it's a multi-class classification task, get the predicted class for each sample
    Y_test_pred = np.argmax(test_predictions, axis=1)

    if input('Press 0 to save the test predictions') == '0':
        file_name = f'GRU-m{pre_method}-e{epochs}'
        save_csv(data=Y_test_pred, file_name=file_name, header=['SampleID', 'Category'], numbering=True)