# Import

In [1]:
import pandas as pd
import numpy as np
import csv
import os

In [2]:
# nlp
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# # Define preprocessing functions
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [4]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [5]:
train_data = pd.read_csv('train.csv')

In [6]:
train_data.head(2)

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM


# Helper Functions

In [7]:
def save_preprocessed_data(data, pre_method):
    file_path = f"Pre_method_{pre_method}.csv"
    with open(file_path, mode="w", newline="") as file:
        writer = csv.writer(file)
        for item in data:  # Start numbering from 1
            writer.writerow([item])

    print(f"Data saved to {file_path}")

In [8]:
def load_preprocessed_data(pre_method):
    file_path = f"Pre_method_{pre_method}.csv"
    loaded_data = []
    with open(file_path, mode="r") as file:
        reader = csv.reader(file)
        for row in reader:
            loaded_data.append(row[0])
        
    return loaded_data

# Preprocessing

## Functions

In [9]:
def lemmatization(tokens, tagging=False):
    lemmatized_tokens = []
    if tagging:
        tagged_tokens = nltk.pos_tag(tokens)
        for word, tag in tagged_tokens:
            if tag.startswith('NN'):    # Nouns
                lemma = lemmatizer.lemmatize(word, pos='n')
            elif tag.startswith('VB'):  # Verbs
                lemma = lemmatizer.lemmatize(word, pos='v')
            elif tag.startswith('JJ'):  # Adjectives
                lemma = lemmatizer.lemmatize(word, pos='a')
            else:
                lemma = lemmatizer.lemmatize(word)
            lemmatized_tokens.append(lemma)
    else:
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return lemmatized_tokens

In [10]:
def stemming(tokens):
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

In [11]:
def cleanText(tokens, choice=2):
    cleaned_text = ""

    if choice == 1: # Steaming only
        stemmed_tokens = stemming(tokens)
        cleaned_text = ' '.join(stemmed_tokens)

    elif choice == 2: # Lemmatization without tagging
        lemmatized_tokens = lemmatization(tokens, False)
        cleaned_text = ' '.join(lemmatized_tokens)

    elif choice == 3: # Lemmatization with tagging
        lemmatized_tokens = lemmatization(tokens, True)
        cleaned_text = ' '.join(lemmatized_tokens)

    elif choice == 4: # Lemmatization without tagging and Steaming
        lemmatized_tokens = lemmatization(tokens, False)
        stemmed_tokens = stemming(lemmatized_tokens)
        cleaned_text = ' '.join(stemmed_tokens)

    elif choice == 5: # Lemmatization with tagging and Steaming
        lemmatized_tokens = lemmatization(tokens, True)
        stemmed_tokens = stemming(lemmatized_tokens)
        cleaned_text = ' '.join(stemmed_tokens)


    return cleaned_text

In [12]:
def preprocess_text(text, pre_method=2):
    # Tokenization
    text = text.replace('\\n', ' ')
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in string.punctuation and token not in stop_words]
    
    cleaned_text = cleanText(tokens, pre_method)
    return cleaned_text

## Drop NaN

In [13]:
print(train_data.shape)
train_data = train_data.dropna(subset=['Discussion'])
print(train_data.shape)

(24989, 3)
(24646, 3)


## Column Discussion

In [14]:
# train_Discussion = train_data['Discussion']

In [15]:
pre_method = 2
if os.path.exists(f"Pre_method_{pre_method}.csv"):
    print("load file, no need to preprocess")
    train_Discussion_preprocessed = load_preprocessed_data(pre_method)
else:
    train_Discussion_preprocessed = [preprocess_text(discussion, pre_method) for discussion in train_data['Discussion']]
    save_preprocessed_data(train_Discussion_preprocessed, pre_method)

load file, no need to preprocess


In [16]:
# # Convert to lowercase, split into words, and add to a set
# unique_words = set()
# for sentence in train_Discussion_preprocessed:
#     words = sentence.split()  # Split
#     unique_words.update(words)       # Add words to the set

# print("Num of Unique words:", len(unique_words))

In [17]:
# save_preprocessed_data(unique_words, 0)

### Feature Extraction (TF-IDF)

In [18]:
# Compute TF-IDF for training and test data
vectorizer = TfidfVectorizer()
vectorizer.fit(train_Discussion_preprocessed)

In [19]:
X_train = vectorizer.transform(train_Discussion_preprocessed)

In [20]:
X_train.shape

(24646, 49464)

## Column Category

In [21]:
category_encoding = {
    "Politics":0,
    "Sports":1,
    "Media":2,
    "Market & Economy":3,
    "STEM":4
}

In [None]:
Y_train = train_data['Category'].map(category_encoding)

In [23]:
Y_train.head(3)

0    1
1    4
2    4
Name: Category, dtype: int64

In [24]:
Y_train.shape

(24646,)

# Models 

## Logistic Regression (just for experience)

In [26]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score

In [27]:
logistic_regression_model = linear_model.LogisticRegression()
logistic_regression_model.fit(X_train, Y_train)

Y_train_pred = logistic_regression_model.predict(X_train)
print(f"Train Accuracy: {accuracy_score(Y_train, Y_train_pred)}")

Train Accuracy: 0.852430414671752


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Feedforward Neural Network (FFNN)

In [28]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [29]:
# Define the input size and number of classes
input_size = X_train.shape[1]  # Number of features in each input (e.g., vocabulary size or TF-IDF dimension)
num_classes = 5  

# Define the model
model = models.Sequential([
    layers.Input(shape=(input_size,)),  # Input layer with the specified input size
    layers.Dense(128, activation='relu'),  # Hidden layer 1 (with 128 neurons)
    layers.Dense(64, activation='relu'),   # Hidden layer 2 (with 64 neurons)
    layers.Dense(num_classes, activation='softmax')  # Output layer with softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy',  # For multi-class classification
              metrics=['accuracy'])

# Summary of the model
model.summary()

In [30]:
from tensorflow.keras.utils import to_categorical

# Assuming Y_train contains labels in integer form (e.g., [0, 1, 2, 3, 4])
Y_train = to_categorical(Y_train, num_classes=5)

In [None]:
# Example training (replace X_train, y_train with your actual data)
epochs = 4 #10
model.fit(X_train, Y_train, epochs=epochs, batch_size=32)

Epoch 1/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 93ms/step - accuracy: 0.5623 - loss: 1.1142
Epoch 2/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 100ms/step - accuracy: 0.8736 - loss: 0.3840
Epoch 3/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 92ms/step - accuracy: 0.9408 - loss: 0.1800
Epoch 4/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 101ms/step - accuracy: 0.9626 - loss: 0.1086
Epoch 5/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 101ms/step - accuracy: 0.9731 - loss: 0.0737
Epoch 6/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 103ms/step - accuracy: 0.9789 - loss: 0.0558
Epoch 7/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 103ms/step - accuracy: 0.9834 - loss: 0.0442
Epoch 8/10
[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 102ms/step - accuracy: 0.9840 - loss: 0.0412
Epoch 9/10
[1m771

<keras.src.callbacks.history.History at 0x24d4716ba00>

### Save model

In [None]:
model.save('FFNN.h5')



### Test

In [39]:
Y = train_data['Category']

In [None]:
from tensorflow.keras.models import load_model

model_loaded = load_model('FFNN.h5') if os.path.exists('FFNN.h5') else model



In [None]:
start = 100
for i in range(start, start+5):
    samples = train_Discussion_preprocessed[i]
    print(samples)
    samples_transformed = vectorizer.transform([samples])
    prediction = model_loaded.predict(samples_transformed)
    predicted_class = np.argmax(prediction)
    predicted_class = next((k for k, v in category_encoding.items() if v == predicted_class), None)
    print("Predicted class:", predicted_class)
    print("Actual class:", Y[i])
    print("==========================")

In [60]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_train, Y_train)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m 19/771[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12s[0m 17ms/step - accuracy: 0.9908 - loss: 0.0208

[1m771/771[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.9885 - loss: 0.0279
Test Loss: 0.03108193911612034
Test Accuracy: 0.9873407483100891
