In [24]:
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds
import tensorflow as tf

In [25]:
train_df = pd.read_csv('/content/drive/MyDrive/College/Semester 5/NLP Project/NLP_Project-main/NLP_Project-main/Data/PreprocessedData/train_preprocessed.csv')

In [26]:
val_df = pd.read_csv('/content/drive/MyDrive/College/Semester 5/NLP Project/NLP_Project-main/NLP_Project-main/Data/PreprocessedData/val_preprocessed.csv')

In [27]:
train_X = train_df['preprocessed_text'].to_list()
train_Y = (train_df['label'].replace('OFF',1)).replace('NOT', 0).to_list()

In [28]:
val_X = val_df['preprocessed_text'].to_list()
val_Y = (val_df['label'].replace('OFF',1)).replace('NOT', 0).to_list()

In [29]:
VOCAB_SIZE = 10000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_X)

In [40]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=128,
        mask_zero=True),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [41]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(0.0001),
              metrics=['accuracy'])

In [42]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, None, 128)         1280000   
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 64)                12352     
                                                                 
 dense_8 (Dense)             (None, 64)                4160      
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,296,577
Trainable params: 1,296,577
Non-trainable params: 0
____________________________________________

In [43]:
history = model.fit(x = train_X, y = train_Y, epochs=10,
                    validation_data=(val_X, val_Y),
                    validation_steps=100)

Epoch 1/10



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [44]:
train_pred = model.predict(train_X)
val_pred = model.predict(val_X)



In [45]:
train_pred = np.where(train_pred > 0.5, 1, 0)
val_pred = np.where(val_pred > 0.5, 1, 0)

In [46]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

def computeAllScores(y_pred_train, y_pred_dev, train_labels, dev_labels):
    print("Accuracy Train: ", accuracy_score(train_labels, y_pred_train))
    print("Accuracy Dev: ", accuracy_score(dev_labels, y_pred_dev))
    print("Weighted F1 Train: ", f1_score(train_labels, y_pred_train, average='weighted'))
    print("Weighted F1 Dev: ", f1_score(dev_labels, y_pred_dev, average='weighted'))
    print("Macro F1 Train: ", f1_score(train_labels, y_pred_train, average='macro'))
    print("Macro F1 Dev: ", f1_score(dev_labels, y_pred_dev, average='macro'))
    print("Micro F1 Train: ", f1_score(train_labels, y_pred_train, average='micro'))
    print("Micro F1 Dev: ", f1_score(dev_labels, y_pred_dev, average='micro'))
    print("Weighted Recall Train: ", recall_score(train_labels, y_pred_train, average='weighted'))
    print("Weighted Recall Dev: ", recall_score(dev_labels, y_pred_dev, average='weighted'))
    print("Macro Recall Train: ", recall_score(train_labels, y_pred_train, average='macro'))
    print("Macro Recall Dev: ", recall_score(dev_labels, y_pred_dev, average='macro'))
    print("Micro Recall Train: ", recall_score(train_labels, y_pred_train, average='micro'))
    print("Micro Recall Dev: ", recall_score(dev_labels, y_pred_dev, average='micro'))
    # Confusion Matrix
    print("Confusion Matrix Train: ")
    print(confusion_matrix(train_labels, y_pred_train))
    print("Confusion Matrix Dev: ")
    print(confusion_matrix(dev_labels, y_pred_dev))

In [47]:
computeAllScores(train_pred, val_pred, train_Y, val_Y)

Accuracy Train:  0.9910309667673716
Accuracy Dev:  0.7050604229607251
Weighted F1 Train:  0.9910206913928981
Weighted F1 Dev:  0.6999076134016204
Macro F1 Train:  0.989820129897951
Macro F1 Dev:  0.6632554549725104
Micro F1 Train:  0.9910309667673716
Micro F1 Dev:  0.7050604229607251
Weighted Recall Train:  0.9910309667673716
Weighted Recall Dev:  0.7050604229607251
Macro Recall Train:  0.9887099610201029
Macro Recall Dev:  0.6591150883366599
Micro Recall Train:  0.9910309667673716
Micro Recall Dev:  0.7050604229607251
Confusion Matrix Train: 
[[7075   32]
 [  63 3422]]
Confusion Matrix Dev: 
[[1400  333]
 [ 448  467]]


In [None]:
val_pred = np.where(val_pred > 0.5, 1, 0)

In [None]:
from sklearn.metrics import f1_score

f1_score(val_Y, val_pred, average='macro')

0.6926490155178788

In [None]:
len(train_X)/len(val_X)

4.0

In [None]:
len(val_X)

2648