# Import

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score

In [2]:
from Saves.HelperFunctions import *
from Preprocessing import preprocess

In [3]:
# Read the training and test CSVs from the local Data/ directory.
# Alternative kept from the original cell: test_data = None
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('Data/train.csv', 'Data/test.csv')
)

In [4]:
# Quick look at the first two rows (columns: SampleID, Discussion, Category).
train_data.head(n=2)

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM


In [5]:
# test_data.head(3)

# Preprocessing

In [6]:
# Preprocessing variant forwarded to the project's `preprocess` helper.
pre_method = 2

# `preprocess` is defined in the project's Preprocessing module; the meaning of
# `pre_method` and `fx_opt` is not visible from this notebook.
# Its log output suggests it drops NaN rows, encodes the labels, counts unique
# words and tokenizes the text — confirm against the helper's source.
(
    X_train_padded,
    Y_train,
    X_test_padded,
    num_unique_words,
    max_sequence_length,
) = preprocess(
    train_data=train_data,
    test_data=test_data,
    pre_method=pre_method,
    fx_opt=2,
)

Drop Nan...
	train_data.shape before (24989, 3)
	train_data.shape after (24646, 3)
start preprocessing...
Encoding Y_train...
Calc unique words...
	Num of Unique words: 16458
Tokenizer...
avg_sequence_length = 40
max_sequence_length = 745


In [7]:
# Report the shapes of the padded inputs and the label vector, one per line.
print(
    f"X_train.shape: {X_train_padded.shape}",
    f"X_test.shape: {X_test_padded.shape}",
    f"Y_train.shape: {Y_train.shape}",
    sep="\n",
)

X_train.shape: (24646, 100)
X_test.shape: (10557, 100)
Y_train.shape: (24646,)


In [8]:
# Show the first three encoded labels (integer class ids).
Y_train.head(n=3)

0    1
1    4
2    4
Name: Category, dtype: int64

# RNN

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout
# from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import GRU

In [12]:
# Take the model's sequence length from the width of the padded input matrix
# instead of hard-coding it, so the two can never drift apart.
# The value returned by `preprocess` is presumably the longest raw sequence
# (its log reports 745), which is not the width the model is fed.
# Assign before printing so the reported value is the one actually used.
max_sequence_length = X_train_padded.shape[1]

print(f"num_unique_words: {num_unique_words}")
print(f"max_sequence_length: {max_sequence_length}")

num_unique_words: 16458
max_sequence_length: 100


In [14]:
# Recurrent text classifier: Embedding -> GRU (or SimpleRNN) -> Dropout -> softmax.
num_classes = 5   # number of output categories; also used when one-hot encoding Y_train
use_gru = True    # set to False to use a plain SimpleRNN layer instead of the GRU

model = Sequential()

# Embedding layer to convert word indices into dense vectors of fixed size.
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and the
# input shape is fixed by the explicit `model.build(...)` call below.
# NOTE(review): if the tokenizer inside `preprocess` assigns indices
# 1..num_unique_words (0 reserved for padding), input_dim must be
# num_unique_words + 1 — confirm against the preprocessing code.
model.add(Embedding(input_dim=num_unique_words, output_dim=128))

# Recurrent encoder; return_sequences=False keeps only the final output.
if use_gru:
    model.add(GRU(units=64, return_sequences=False, activation='tanh'))
else:
    # Referenced through `layers` because SimpleRNN is not imported by name.
    model.add(layers.SimpleRNN(units=64, return_sequences=False))

# Dropout for regularization
model.add(Dropout(0.5))

# Fully connected classification head: one softmax unit per class.
model.add(Dense(num_classes, activation='softmax'))

# Explicitly build the model so summary() can report concrete output shapes.
model.build(input_shape=(None, max_sequence_length))

# Labels are one-hot encoded before training, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

In [15]:
# One-hot encode the integer class labels (e.g. 1 -> [0, 1, 0, 0, 0]) so they
# match the softmax output and the categorical cross-entropy loss.
# Assumes Y_train holds integer ids in the range [0, num_classes).
Y_train_categorical = to_categorical(
    Y_train,
    num_classes=num_classes,
)

In [32]:
# Training hyperparameters consumed by the model.fit cell.
epochs, batch_size = 3, 32
validation_split = 0.2  # fraction of the training rows held out for validation

In [40]:
# Train the model and keep the returned History so per-epoch metrics
# (history.history) stay available for inspection or plotting.
# Keras takes the validation rows from the *last* `validation_split` fraction
# of the data, before shuffling, so the split is only representative if the
# rows are not ordered.
# Calling fit again continues from the current weights rather than restarting;
# re-run the model-definition cell first for a fresh training run.
history = model.fit(
    X_train_padded,
    Y_train_categorical,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split,
)

Epoch 1/3
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 64ms/step - accuracy: 0.9143 - loss: 0.2301 - val_accuracy: 0.5746 - val_loss: 1.9273
Epoch 2/3
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 63ms/step - accuracy: 0.9182 - loss: 0.2199 - val_accuracy: 0.5669 - val_loss: 1.8391
Epoch 3/3
[1m617/617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 64ms/step - accuracy: 0.9180 - loss: 0.2093 - val_accuracy: 0.5694 - val_loss: 2.0709


<keras.src.callbacks.history.History at 0x2b4dc423ee0>

## Evaluation

### Train

In [None]:
# from tensorflow.keras.models import load_model
# model_loaded = load_model('Models/RNN/RNN-m2-e5-a83.h5') if os.path.exists('Models/RNN/RNN-m2-e5-a83.h5') else model

In [41]:
# Class probabilities for every training sample (one column per class).
train_predictions = model.predict(X_train_padded)

# Pick the most probable class index for each sample.
Y_train_pred = train_predictions.argmax(axis=1)

[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step


In [42]:
# accuracy_score expects (y_true, y_pred) in that order.
# Note: X_train_padded also contains the rows Keras held out through
# validation_split, so this figure mixes fitted and held-out samples.
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Train Accuracy: {train_accuracy}")

Train Accuracy: 0.8585165949849874


In [43]:
# The file name encodes the architecture, an epoch tag and the train accuracy
# truncated to a whole percent.
# NOTE(review): the "e18" tag is hard-coded and does not follow `epochs`;
# presumably it counts cumulative epochs over repeated fit runs — confirm.
accuracy_pct = int(train_accuracy * 100)
file_name = f'GRU-e18-a{accuracy_pct}'
model.save(f'Models/RNN/{file_name}.h5')



### Test

In [44]:
# Class probabilities for every test sample (one column per class).
test_predictions = model.predict(X_test_padded)

# Pick the most probable class index for each sample.
Y_test_pred = test_predictions.argmax(axis=1)

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step


In [45]:
# Write the predicted test labels through the project's save_csv helper
# (its log reports the destination as Saves/GRU-e18.csv).
# NOTE(review): Y_test_pred holds encoded class indices, not the original
# category names — confirm which form the consumer of this file expects.
file_name = 'GRU-e18'
save_csv(
    data=Y_test_pred,
    file_name=file_name,
    header=['SampleID', 'Category'],
    numbering=True,
)

Data saved to Saves/GRU-e18.csv
