# Import

In [26]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
from Saves.HelperFunctions import *
from Preprocessing import preprocess_text, category_encoding

In [28]:
# Switch for the optional test-set pipeline: when True, Data/test.csv is loaded,
# preprocessed, vectorized and predicted by the guarded cells further down.
test_bool = False # set to True to include the test

In [29]:
# Read the labelled training set from disk.
train_data = pd.read_csv(filepath_or_buffer='Data/train.csv')

# The test set is optional and is only read when the toggle is switched on.
if test_bool:
    test_data = pd.read_csv(filepath_or_buffer='Data/test.csv')

In [30]:
# Preview the first two training rows to check the expected columns are present.
train_data.head(2)

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM


In [31]:
# Preview the first rows of the test set; skipped when the test set was not loaded.
if test_bool:
    print(test_data.head(3))

# Preprocessing

In [32]:
# Remove rows whose 'Discussion' text is missing, reporting the shape on both
# sides of the filter so the number of discarded rows is visible.
print(f"train_data.shape before {train_data.shape}")
train_data = train_data[train_data['Discussion'].notna()]
print(f"train_data.shape after {train_data.shape}")

Drop Nan...
	train_data.shape before (24989, 3)
	train_data.shape after (24646, 3)


In [33]:
# Preprocessing variant forwarded to preprocess_text; it is also embedded in the
# file names used when saving the model and the predictions.
pre_method = 2

# Run the NLP preprocessing over every training discussion.
train_Discussion_preprocessed = list(
    map(lambda text: preprocess_text(text, pre_method), train_data['Discussion'])
)

# NOTE(review): only train_data had rows with a missing 'Discussion' dropped;
# test_data is passed through unfiltered -- confirm preprocess_text copes with NaN.
if test_bool:
    test_Discussion_preprocessed = list(
        map(lambda text: preprocess_text(text, pre_method), test_data['Discussion'])
    )

start preprocessing...


In [34]:
# Feature extraction with TF-IDF.
# The vectorizer is fitted on the training corpus only; the same fitted
# instance is then used to transform both the training and the test text.
vectorizer = TfidfVectorizer().fit(train_Discussion_preprocessed)

X_train = vectorizer.transform(train_Discussion_preprocessed)
if test_bool:
    X_test = vectorizer.transform(test_Discussion_preprocessed)

TF-IDF...


In [35]:
# Encode the 'Category' column through category_encoding.
Y_train = train_data['Category'].map(category_encoding)

# Series.map yields NaN for any value it cannot translate (and for missing
# categories). Fail loudly here instead of letting NaN labels reach the
# one-hot encoding step further down.
if Y_train.isna().any():
    unknown = sorted(set(train_data.loc[Y_train.isna(), 'Category'].astype(str)))
    raise ValueError(f"Category values without an encoding: {unknown}")

Encoding Y_train...


In [36]:
# Sanity check: report the shapes of the TF-IDF feature matrices and of the
# encoded label Series (the row counts of X_train and Y_train should agree).
print(f"X_train.shape: {X_train.shape}")
if test_bool:
    print(f"X_test.shape: {X_test.shape}")
print(f"Y_train.shape: {Y_train.shape}")

X_train.shape: (24646, 42192)
Y_train.shape: (24646,)


In [37]:
# Peek at the first few encoded labels.
Y_train.head(3)

0    1
1    4
2    4
Name: Category, dtype: int64

# Feedforward Neural Network (FFNN)

In [38]:
num_classes = 5
input_size = X_train.shape[1]  # one input unit per TF-IDF feature column

# Seed the Python, NumPy and backend random generators before any weights are
# created, so initialisation and batch shuffling are repeatable across runs.
# (Bit-exact reproducibility on GPU may additionally need deterministic ops.)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Define the model: two ReLU hidden layers followed by a softmax output with
# one unit per class.
model = models.Sequential([
    layers.Input(shape=(input_size,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

# Compile the model. categorical_crossentropy expects one-hot encoded targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [39]:
# One-hot encode the integer labels so they match the categorical_crossentropy
# loss. Assumes every label lies in the range [0, num_classes).
Y_train_categorical = to_categorical(Y_train.to_numpy(), num_classes)

In [40]:
# Training hyper-parameters; epochs is also embedded in the saved file names.
epochs = 2
batch_size = 32

# Train on the full training set (no validation split is used here).
model.fit(
    x=X_train,
    y=Y_train_categorical,
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/2
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 82ms/step - accuracy: 0.5592 - loss: 1.1254
Epoch 2/2
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 80ms/step - accuracy: 0.8571 - loss: 0.4309


<keras.src.callbacks.history.History at 0x2ca881ab040>

## Evaluation

### Train

In [41]:
# Class probabilities for the *training* set (one row per sample).
train_predictions = model.predict(X_train)

# The predicted class is the index of the highest probability in each row.
Y_train_pred = train_predictions.argmax(axis=1)

[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step


In [42]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the predictions second.
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print(f"Train Accuracy: {train_accuracy}")

Train Accuracy: 0.9353647650734399


In [None]:
# Ask before writing anything to disk; any answer other than '0' skips saving.
save_answer = input('Press 0 to save the model')
if save_answer == '0':
    # The file name records the preprocessing method, the number of epochs and
    # the train accuracy truncated to a whole percentage.
    accuracy_pct = int(train_accuracy*100)
    file_name = f'FFNN-m{pre_method}-e{epochs}-a{accuracy_pct}'
    model.save(f'Models/FFNN/{file_name}.h5')
    print(f'{file_name} saved successfully')

### Test

In [None]:
if test_bool:
    # Class probabilities for the test set (one row per sample).
    test_predictions = model.predict(X_test)

    # The predicted class is the index of the highest probability in each row.
    Y_test_pred = test_predictions.argmax(axis=1)

    # Ask before exporting; any answer other than '0' skips the export.
    export_answer = input('Press 0 to save the test predictions')
    if export_answer == '0':
        file_name = f'FFNN-m{pre_method}-e{epochs}'
        save_csv(
            data=Y_test_pred,
            file_name=file_name,
            header=['SampleID', 'Category'],
            numbering=True,
        )