# Text Preprocessing

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from gensim.models import Word2Vec
import re

class TextPreprocessor:
    def __init__(self):
        self.bow_vectorizer = CountVectorizer(max_features = 5000)
        self.tfidf_vectorizer = TfidfVectorizer(max_features = 5000)
        # pass
        
    def clean_text(self, text):
        if isinstance(text, float):
            return ""
        text = text.lower()
        # https://medium.com/@siddharthgov01/regular-expressions-from-a-za-z-88cf9cf0abac
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text
    
    def train_bow_vectorizer(self, train_texts):
        self.bow_vectorizer.fit(train_texts)
    
    def train_tfidf_vectorizer(self, train_texts):
        self.tfidf_vectorizer.fit(train_texts)
    

    # https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html#bag-of-words-using-scikit-learn
    def get_bow_features(self, texts, max_features=5000):
        return self.bow_vectorizer.transform(texts)
        # vectorizer = CountVectorizer(max_features=max_features)
        # return vectorizer.fit_transform(texts)

    # https://pages.github.rpi.edu/kuruzj/website_introml_rpi/notebooks/08-intro-nlp/03-scikit-learn-text.html#tf-idf-encoding
    def get_tfidf_features(self, texts, max_features=5000):
        return self.tfidf_vectorizer.transform(texts)
        # vectorizer = TfidfVectorizer(max_features=max_features)
        # return vectorizer.fit_transform(texts)
 
    # https://radimrehurek.com/gensim/models/word2vec.html
    def get_word2vec_features(self, texts, vector_size=100, window=5, min_count=1, workers=4):
        processed_texts = []
        for text in texts:
            if isinstance(text, str):
                cleaned = self.clean_text(text)
                tokens = cleaned.split()
                processed_texts.append(tokens)

        model = Word2Vec(
            sentences=processed_texts,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers
        )
        
        return model

    def get_text_vector(self, text, word2vec_model):
        tokens = self.clean_text(text).split()
        vectors = []
        for token in tokens:
            if token in word2vec_model.wv:
                vectors.append(word2vec_model.wv[token])
                
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(word2vec_model.vector_size)

import pandas as pd
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Logistic Regression with TF-IDF Word Processor

In [19]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

preprocessor = TextPreprocessor()
scaler = StandardScaler()
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

preprocessor.train_tfidf_vectorizer(X_train)
X_train_tfidf = preprocessor.get_tfidf_features(X_train)
X_test_tfidf = preprocessor.get_tfidf_features(X_test)

X_train_tfidf_scaled = scaler.fit_transform(X = pd.DataFrame(X_train_tfidf.todense()))
X_test_tfidf_scaled = scaler.transform(X = pd.DataFrame(X_test_tfidf.todense()))
best_iteration = None
best_train_accuracy = None
best_test_accuracy = None
train_accuracies = []
test_accuracies = []

for i in range(1, 51):
    lr_classifier = LogisticRegression(max_iter=i, C=5.0, penalty='l2', random_state=42 )


    lr_classifier.fit(X_train_tfidf_scaled, y_train)


    y_train_pred = lr_classifier.predict(X_train_tfidf_scaled)
    y_test_pred = lr_classifier.predict(X_test_tfidf_scaled)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)
    
    if best_test_accuracy == None or test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        best_train_accuracy = train_accuracy
        best_iteration = i

print("Best Number Iterations:", best_iteration)
print(f"Train Accuracy: {best_train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {best_test_accuracy * 100:.2f}%")

lr_classifier = LogisticRegression(max_iter=best_iteration, C=5.0, penalty='l2', random_state=42 )
lr_classifier.fit(X_train_tfidf_scaled, y_train)
y_test_pred = lr_classifier.predict(X_test_tfidf_scaled)

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

pd.DataFrame({"train accuracy": train_accuracies, "test accuracy": test_accuracies}).to_csv("./results/logistic_regression_tfidf.csv", index = False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Number Iterations: 6
Train Accuracy: 98.94%
Test Accuracy: 94.25%

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2077
           1       0.94      0.94      0.94      2083

    accuracy                           0.94      4160
   macro avg       0.94      0.94      0.94      4160
weighted avg       0.94      0.94      0.94      4160



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Logistic Regression with BOW

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

preprocessor = TextPreprocessor()
scaler = StandardScaler()

train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

preprocessor.train_bow_vectorizer(X_train)
X_train_bow = preprocessor.get_bow_features(X_train)
X_test_bow = preprocessor.get_bow_features (X_test)

X_train_bow_scaled = scaler.fit_transform(X = pd.DataFrame(X_train_bow.todense()))
X_test_bow_scaled = scaler.transform(X = pd.DataFrame(X_test_bow.todense()))

best_iteration = None
best_train_accuracy = None
best_test_accuracy = None
train_accuracies = []
test_accuracies = []

for i in range(1, 51):
    lr_classifier = LogisticRegression(max_iter=i, C=5.0, penalty='l2', random_state=42 )


    lr_classifier.fit(X_train_bow_scaled, y_train)


    y_train_pred = lr_classifier.predict(X_train_bow_scaled)
    y_test_pred = lr_classifier.predict(X_test_bow_scaled)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

    if best_test_accuracy == None or test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        best_train_accuracy = train_accuracy
        best_iteration = i

print("Best Number Iterations:", best_iteration)
print(f"Train Accuracy: {best_train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {best_test_accuracy * 100:.2f}%") 

lr_classifier = LogisticRegression(max_iter=best_iteration, C=5.0, penalty='l2', random_state=42 )

lr_classifier.fit(X_train_bow_scaled, y_train)
y_test_pred = lr_classifier.predict(X_test_bow_scaled)


print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))
pd.DataFrame({"train accuracy": train_accuracies, "test accuracy": test_accuracies}).to_csv("./results/logistic_regression_bow.csv", index = False)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Number Iterations: 10
Train Accuracy: 99.22%
Test Accuracy: 94.28%

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      2077
           1       0.93      0.96      0.94      2083

    accuracy                           0.94      4160
   macro avg       0.94      0.94      0.94      4160
weighted avg       0.94      0.94      0.94      4160



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Logistic Regression with Word2vec Embedding

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


preprocessor = TextPreprocessor()
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

X_train_text_vectors = []
X_test_text_vectors = []

word2vec_model = preprocessor.get_word2vec_features(X_train)

for text in X_train:
    text_vector = preprocessor.get_text_vector(text, word2vec_model)
    X_train_text_vectors.append(text_vector)
    
for text in X_test:
    text_vector = preprocessor.get_text_vector(text, word2vec_model)
    X_test_text_vectors.append(text_vector)

X_train_text_vectors = pd.DataFrame(X_train_text_vectors)
X_test_text_vectors = pd.DataFrame(X_test_text_vectors)

lr_classifier = LogisticRegression(max_iter=1000, C=5.0, penalty='l2', random_state=42)

lr_classifier.fit(X_train_text_vectors, y_train)

y_train_pred = lr_classifier.predict(X_train_text_vectors)
y_test_pred = lr_classifier.predict(X_test_text_vectors)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Train Accuracy: 89.84%
Test Accuracy: 88.51%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      2077
           1       0.89      0.88      0.88      2083

    accuracy                           0.89      4160
   macro avg       0.89      0.89      0.89      4160
weighted avg       0.89      0.89      0.89      4160





# MLP Model with BOW

In [20]:
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout,BatchNormalization
from keras.models import Sequential
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Source https://www.tensorflow.org/text/tutorials/text_classification_rnn

preprocessor = TextPreprocessor()
scaler = StandardScaler()

train_df = pd.read_csv('train.csv')
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']


X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

preprocessor.train_bow_vectorizer(X_train)
X_train_bow = preprocessor.get_bow_features(X_train)
X_test_bow = preprocessor.get_bow_features(X_test)

X_train_bow_scaled = scaler.fit_transform(X = pd.DataFrame(X_train_bow.todense()))
X_test_bow_scaled = scaler.transform(X = pd.DataFrame(X_test_bow.todense()))

num_classes = len(set(train_labels))
y_train_encoded = keras.utils.to_categorical(y_train, num_classes)
y_test_encoded = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_bow_scaled.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer= "Adam", metrics=['accuracy'])


training = model.fit(
    X_train_bow_scaled,
    y_train_encoded,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)


y_pred_prob = model.predict(X_test_bow_scaled)  
y_pred = np.argmax(y_pred_prob, axis=1)   
y_test_labels = np.argmax(y_test_encoded, axis=1) 

print("Accuracy:",  accuracy_score(y_test_labels, y_pred))
print(classification_report(y_test_labels, y_pred))

pd.DataFrame(training.history).to_csv("./results/mlp_bow.csv", index = False)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.8238 - loss: 0.4092 - val_accuracy: 0.9177 - val_loss: 0.2172
Epoch 2/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9392 - loss: 0.1714 - val_accuracy: 0.9177 - val_loss: 0.2253
Epoch 3/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9508 - loss: 0.1472 - val_accuracy: 0.9225 - val_loss: 0.2211
Epoch 4/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.9526 - loss: 0.1391 - val_accuracy: 0.9138 - val_loss: 0.2351
Epoch 5/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.9519 - loss: 0.1344 - val_accuracy: 0.9261 - val_loss: 0.2231
Epoch 6/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9564 - loss: 0.1305 - val_accuracy: 0.9210 - val_loss: 0.2185
Epoch 7/10
[1m416/41

# MLP Model with TF-IDF 

In [21]:
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout,BatchNormalization
from keras.models import Sequential
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Source https://www.tensorflow.org/text/tutorials/text_classification_rnn

preprocessor = TextPreprocessor()
scaler = StandardScaler()

train_df = pd.read_csv('train.csv')
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']


X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

preprocessor.train_tfidf_vectorizer(X_train)
X_train_tfidf = preprocessor.get_tfidf_features(X_train)
X_test_tfidf = preprocessor.get_tfidf_features(X_test)

X_train_tfidf_scaled = scaler.fit_transform(X = pd.DataFrame(X_train_tfidf.todense()))
X_test_tfidf_scaled = scaler.transform(X = pd.DataFrame(X_test_tfidf.todense()))

num_classes = len(set(train_labels))
y_train_encoded = keras.utils.to_categorical(y_train, num_classes)
y_test_encoded = keras.utils.to_categorical(y_test, num_classes)


model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_tfidf_scaled.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


training = model.fit(
    X_train_tfidf_scaled,
    y_train_encoded,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)


y_pred_prob = model.predict(X_test_tfidf_scaled)  
y_pred = np.argmax(y_pred_prob, axis=1)   
y_test_labels = np.argmax(y_test_encoded, axis=1) 

print("Accuracy:",  accuracy_score(y_test_labels, y_pred))
print(classification_report(y_test_labels, y_pred))

pd.DataFrame(training.history).to_csv("./results/mlp_tfidf.csv", index = False)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.8285 - loss: 0.3864 - val_accuracy: 0.9270 - val_loss: 0.1853
Epoch 2/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9551 - loss: 0.1236 - val_accuracy: 0.9240 - val_loss: 0.2006
Epoch 3/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9620 - loss: 0.1023 - val_accuracy: 0.9318 - val_loss: 0.1908
Epoch 4/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9679 - loss: 0.0894 - val_accuracy: 0.9246 - val_loss: 0.2061
Epoch 5/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9694 - loss: 0.0889 - val_accuracy: 0.9258 - val_loss: 0.2137
Epoch 6/10
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9754 - loss: 0.0686 - val_accuracy: 0.9294 - val_loss: 0.2230
Epoch 7/10
[1m416/41

# MLP Model with Word2Vec

In [6]:
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Dense, Dropout, BatchNormalization
from keras.models import Sequential
from sklearn.metrics import accuracy_score, classification_report

# Source https://www.tensorflow.org/text/tutorials/text_classification_rnn

preprocessor = TextPreprocessor()

train_df = pd.read_csv('train.csv')
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']


X_train, X_test, y_train, y_test = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

X_train_text_vectors = []
X_test_text_vectors = []

word2vec_model = preprocessor.get_word2vec_features(X_train)

for text in X_train:
    text_vector = preprocessor.get_text_vector(text, word2vec_model)
    X_train_text_vectors.append(text_vector)
    
for text in X_test:
    text_vector = preprocessor.get_text_vector(text, word2vec_model)
    X_test_text_vectors.append(text_vector)

X_train_text_vectors = pd.DataFrame(X_train_text_vectors)
X_test_text_vectors = pd.DataFrame(X_test_text_vectors)

X_train_word2Vec = X_train_text_vectors.to_numpy()
X_test_word2Vec = X_test_text_vectors.to_numpy()




num_classes = len(set(train_labels))
y_train_encoded = keras.utils.to_categorical(y_train, num_classes)
y_test_encoded = keras.utils.to_categorical(y_test, num_classes)



model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_word2Vec.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(num_classes, activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.fit(
    X_train_word2Vec,
    y_train_encoded,
    batch_size=32,
    epochs=50,
    validation_split=0.2
)


y_pred_prob = model.predict(X_test_word2Vec) 
y_pred = np.argmax(y_pred_prob, axis=1)   
y_test_labels = np.argmax(y_test_encoded, axis=1) 


print("Accuracy:", accuracy_score(y_test_labels, y_pred))
print(classification_report(y_test_labels, y_pred))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.7840 - loss: 0.4904 - val_accuracy: 0.8606 - val_loss: 0.3326
Epoch 2/50
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.8565 - loss: 0.3398 - val_accuracy: 0.8657 - val_loss: 0.3136
Epoch 3/50
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.8583 - loss: 0.3319 - val_accuracy: 0.8594 - val_loss: 0.3213
Epoch 4/50
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.8618 - loss: 0.3231 - val_accuracy: 0.8492 - val_loss: 0.3298
Epoch 5/50
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.8636 - loss: 0.3283 - val_accuracy: 0.8612 - val_loss: 0.3220
Epoch 6/50
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.8556 - loss: 0.3389 - val_accuracy: 0.8777 - val_loss: 0.3054
Epoch 7/50
[1m416/416[0m [32m━

# BiLSTM Model

In [None]:
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM
from keras.models import Sequential
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Source https://www.tensorflow.org/text/tutorials/text_classification_rnn

preprocessor = TextPreprocessor()
tokenizer = Tokenizer()
max_sequence_length = 200

train_df = pd.read_csv('train.csv')
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']


X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_sequences = pad_sequences(X_train_sequences, maxlen=max_sequence_length)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_sequences = pad_sequences(X_test_sequences, maxlen=max_sequence_length)


X_train_sequences = pd.DataFrame(X_train_sequences)
X_test_sequences = pd.DataFrame(X_test_sequences)

num_classes = len(set(train_labels))
y_train_encoded = keras.utils.to_categorical(y_train, num_classes)
y_test_encoded = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))  
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])


model.fit(
    X_train_sequences,
    y_train_encoded,
    batch_size=32,
    epochs=3,
    validation_split=0.2
)


y_pred_prob = model.predict(X_test_sequences)  
y_pred = np.argmax(y_pred_prob, axis=1)   
y_test_labels = np.argmax(y_test_encoded, axis=1) 

print("Accuracy:",  accuracy_score(y_test_labels, y_pred))
print(classification_report(y_test_labels, y_pred))


Epoch 1/3




[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 460ms/step - accuracy: 0.7829 - loss: 0.4229 - val_accuracy: 0.9303 - val_loss: 0.1759
Epoch 2/3
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 410ms/step - accuracy: 0.9734 - loss: 0.0802 - val_accuracy: 0.9351 - val_loss: 0.1999
Epoch 3/3
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 384ms/step - accuracy: 0.9950 - loss: 0.0182 - val_accuracy: 0.9264 - val_loss: 0.2844
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 70ms/step
Accuracy: 0.9305288461538461
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      2077
           1       0.97      0.89      0.93      2083

    accuracy                           0.93      4160
   macro avg       0.93      0.93      0.93      4160
weighted avg       0.93      0.93      0.93      4160



In [41]:
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM
from keras.models import Sequential
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Source https://www.tensorflow.org/text/tutorials/text_classification_rnn

preprocessor = TextPreprocessor()
tokenizer = Tokenizer()
max_sequence_length = 200

train_df = pd.read_csv('train.csv')
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']


X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_sequences = pad_sequences(X_train_sequences, maxlen=max_sequence_length)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_sequences = pad_sequences(X_test_sequences, maxlen=max_sequence_length)


X_train_sequences = pd.DataFrame(X_train_sequences)
X_test_sequences = pd.DataFrame(X_test_sequences)

num_classes = len(set(train_labels))
y_train_encoded = keras.utils.to_categorical(y_train, num_classes)
y_test_encoded = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(5, return_sequences=False)))
model.add(Dropout(0.3))
model.add(Dense(5, activation='relu'))
model.add(Dropout(0.3))  
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer = "adam", metrics=['accuracy'])


model.fit(
    X_train_sequences,
    y_train_encoded,
    batch_size=32,
    epochs=3,
    validation_split=0.2
)


y_pred_prob = model.predict(X_test_sequences)  
y_pred = np.argmax(y_pred_prob, axis=1)   
y_test_labels = np.argmax(y_test_encoded, axis=1) 

print("Accuracy:",  accuracy_score(y_test_labels, y_pred))
print(classification_report(y_test_labels, y_pred))


Epoch 1/3




[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 120ms/step - accuracy: 0.7196 - loss: 0.5299 - val_accuracy: 0.8915 - val_loss: 0.2555
Epoch 2/3
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 118ms/step - accuracy: 0.9244 - loss: 0.2158 - val_accuracy: 0.9053 - val_loss: 0.2363
Epoch 3/3
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 119ms/step - accuracy: 0.9759 - loss: 0.0871 - val_accuracy: 0.9198 - val_loss: 0.2324
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step
Accuracy: 0.9225961538461539
              precision    recall  f1-score   support

           0       0.91      0.94      0.92      2077
           1       0.94      0.90      0.92      2083

    accuracy                           0.92      4160
   macro avg       0.92      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



# Bert Model

In [7]:
# import keras
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizer

preprocessor = TextPreprocessor()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_df = pd.read_csv('train.csv')
train_texts = train_df['text'].apply(preprocessor.clean_text)
train_labels = train_df['label']


X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels)

X_train_tokenized = tokenizer(X_train.tolist(), truncation = True, padding = True, max_length = 300, return_tensors = "tf")
X_test_tokenized = tokenizer(X_test.tolist(), truncation = True, padding = True, max_length = 300, return_tensors = "tf")

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from transformers import AdamWeightDecay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Initialize the Hugging Face model
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

for layer in model.layers:
    if layer.name != "classifier":
        layer.trainable = False


# Define the optimizer
optimizer = AdamWeightDecay(learning_rate=1e-5, weight_decay_rate=0.01)

model.compile(optimizer=optimizer, 
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=['accuracy'])
# Train the model
training = model.fit(
    X_train_tokenized,
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2
)

y_test_pred_logits = model.predict(X_test_tokenized).logits
y_test_pred = np.argmax(y_test_pred_logits, axis = 1)  
print("Results on Test Data:")
print("Accuracy:",  accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print("Recall:", recall_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print("F1 Score:", f1_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print()

print(classification_report(y_test, y_test_pred))

pd.DataFrame(training.history).to_csv("./results/bert.csv", index = False)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Results on Test Data:
Accuracy: 0.6045673076923077
Precision: 0.6518810299712648
Recall: 0.6049683211711563
F1 Score: 0.5716626071884093

              precision    recall  f1-score   support

           0       0.57      0.88      0.69      2077
           1       0.74      0.33      0.45      2083

    accuracy                           0.60      4160
   macro avg       0.65      0.60      0.57      4160
weighted avg       0.65      0.60      0.57      4160



In [32]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from transformers import AdamWeightDecay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Initialize the Hugging Face model
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Freeze 6 layers of encoder
for layer in model.bert.encoder.layer[0:6]:
    layer.trainable = False


# Define the optimizer
optimizer = AdamWeightDecay(learning_rate=1e-5, weight_decay_rate=0.01)

model.compile(optimizer=optimizer, 
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=['accuracy'])
# Train the model
training = model.fit(
    X_train_tokenized,
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2
)

y_test_pred_logits = model.predict(X_test_tokenized).logits
y_test_pred = np.argmax(y_test_pred_logits, axis = 1)  
print("Results on Test Data:")
print("Accuracy:",  accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print("Recall:", recall_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print("F1 Score:", f1_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print()

print(classification_report(y_test, y_test_pred))

pd.DataFrame(training.history).to_csv("./results/bert2.csv", index = False)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Results on Test Data:
Accuracy: 0.9814903846153846
Precision: 0.9815049811617318
Recall: 0.9814948533315644
F1 Score: 0.9814903322061228

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2077
           1       0.98      0.98      0.98      2083

    accuracy                           0.98      4160
   macro avg       0.98      0.98      0.98      4160
weighted avg       0.98      0.98      0.98      4160



In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

y_test_pred_logits = model.predict(X_test_tokenized).logits
y_test_pred = np.argmax(y_test_pred_logits, axis = 1)  
print("Results on Test Data:")
print("Accuracy:",  accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print("Recall:", recall_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print("F1 Score:", f1_score(y_test, y_test_pred, average = "macro", zero_division = 0))
print()

print(classification_report(y_test, y_test_pred))

Results on Test Data:
Accuracy: 0.4206730769230769
Precision: 0.3942262643853256
Recall: 0.4203147611947233
F1 Score: 0.38236740760892596

              precision    recall  f1-score   support

           0       0.34      0.17      0.23      2077
           1       0.45      0.67      0.54      2083

    accuracy                           0.42      4160
   macro avg       0.39      0.42      0.38      4160
weighted avg       0.39      0.42      0.38      4160

