In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
!pip install tensorflow keras



In [29]:
import numpy as np
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Initialization

In [23]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import zipfile
import os
import re
import string

In [24]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
zip_file_path = '/content/drive/MyDrive/SEM7_SLP/sentence+classification.zip'

with zipfile.ZipFile(zip_file_path, 'r') as z:
    z.extractall('/content/extracted_data')

extracted_files = os.listdir('/content/extracted_data')
print("Extracted files:", extracted_files)

Extracted files: ['SentenceCorpus', '__MACOSX']


In [26]:
labeled_articles_dir = '/content/extracted_data/SentenceCorpus/labeled_articles'

In [27]:
data = []
for annot_file in os.listdir(labeled_articles_dir):
    file_path = os.path.join(labeled_articles_dir, annot_file)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        sentences = file.readlines()
        domain = annot_file.split('_')[0]
        for sentence in sentences:
            data.append([sentence.strip(), domain])

In [28]:
df = pd.DataFrame(data, columns=['text', 'domain'])
df.head()

Unnamed: 0,text,domain
0,### abstract ###,jdm
1,"MISC\tsimilar to research on risky choice, the...",jdm
2,MISC\tthe well-known allais paradox contradict...,jdm
3,AIMX\twe describe a violation of the law of di...,jdm
4,### introduction ###,jdm


#  Preprocessing

In [54]:
# Convert domain labels to integers
df['domain'] = df['domain'].astype('category').cat.codes

In [None]:
#print("Max length of an abstract:: ", len(max((x_train+x_test), key=len)))
#print("Min length of an abstract:: ", len(min((x_train+x_test), key=len)))

**num_words** : Limits the vocabulary size to the most frequent words in the dataset to reduce dimentionality and make model training more efficient

**max_words** : Defines the maximum length of the sequences (texts) after padding. Ensures that all input sequences (texts) have the same length, which is necessary for feeding them into neural networks. By padding shorter sequences and truncating longer ones, you standardize the input size.

In [45]:
def preprocess_data(texts, labels, num_words=5000, max_words=400):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    data = sequence.pad_sequences(sequences, maxlen=max_words)
    labels = np.array(labels)
    x_temp, x_test, y_temp, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
    x_train, x_val, y_train, y_val = train_test_split(x_temp, y_temp, test_size=0.2/(1-0.2), random_state=42) #test_size = val_size/1-test_size

    return x_train, x_val, x_test, y_train, y_val, y_test, tokenizer.word_index

In [55]:
x_train, x_val, x_test, y_train, y_val, y_test, word_index = preprocess_data(df['text'], df['domain'])

In [37]:
padded_lengths = [len(seq) for seq in np.concatenate([x_train, x_test])]
print("Length of a padded sequence:: ", max(padded_lengths))

Length of a padded sequence::  400


# Text Classification

## Simple RNN

In [58]:
# fixing every word's embedding size to be 32 i.e. vector length
embd_len = 32
vocab_size = len(word_index) + 1  # Add 1 for padding token

In [57]:
len(set(df['domain']))

4

In [59]:
RNN_model = Sequential(name="Simple_RNN")

#embedding layer that converts integer-encoded vocabulary indices into dense vectors of fixed size.
RNN_model.add(Embedding(vocab_size,
                        embd_len))
RNN_model.add(SimpleRNN(128,
                        activation='tanh',
                        return_sequences=False))
RNN_model.add(Dense(units=4, activation='softmax'))

print(RNN_model.summary())

None


**sparse_categorical_crossentropy**: This loss function is designed for integer-labeled classification problems. It expects the target labels to be integers, where each integer represents a class index. It efficiently handles multi-class classification tasks without needing to convert the labels to one-hot encoded vectors.

**categorical_crossentropy**: This loss function is used when the target labels are provided as one-hot encoded vectors. Each label is a vector where only one element is 1 (representing the class) and the rest are 0.

In [60]:
RNN_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [61]:
history = RNN_model.fit(x_train, y_train,
                        batch_size=64,
                        epochs=5,
                        verbose=1,
                        validation_data=(x_val, y_val))

Epoch 1/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 264ms/step - accuracy: 0.3457 - loss: 1.1934 - val_accuracy: 0.3712 - val_loss: 1.0534
Epoch 2/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 205ms/step - accuracy: 0.6075 - loss: 1.0276 - val_accuracy: 0.4894 - val_loss: 0.9758
Epoch 3/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 267ms/step - accuracy: 0.4338 - loss: 1.2161 - val_accuracy: 0.5652 - val_loss: 1.0114
Epoch 4/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 193ms/step - accuracy: 0.7997 - loss: 0.8370 - val_accuracy: 0.7833 - val_loss: 0.8328
Epoch 5/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 192ms/step - accuracy: 0.9164 - loss: 0.5109 - val_accuracy: 0.8742 - val_loss: 0.4411


In [62]:
print("Simple_RNN Score---> ", RNN_model.evaluate(x_test, y_test, verbose=0))

Simple_RNN Score--->  [0.45712900161743164, 0.8606060743331909]


**Hence, simple RNN gives us an accruracy of 86%.**

*Limitations of Simple RNN are it is unable to handle long sentences well because of its vanishing gradient problems.*

## Gated Recurrent Units (GRU)

In [63]:
gru_model = Sequential(name="GRU_Model")
gru_model.add(Embedding(vocab_size,
                        embd_len))
gru_model.add(GRU(128,
                  activation='tanh',
                  return_sequences=False))
gru_model.add(Dense(units=4, activation='softmax'))

print(gru_model.summary())

None


In [64]:
gru_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy']
)

In [66]:
history2 = gru_model.fit(x_train, y_train,
                         batch_size=64,
                         epochs=5,
                         verbose=1,
                         validation_data=(x_val, y_val))

Epoch 1/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1s/step - accuracy: 0.3409 - loss: 1.2932 - val_accuracy: 0.4030 - val_loss: 1.0688
Epoch 2/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 949ms/step - accuracy: 0.5122 - loss: 1.0237 - val_accuracy: 0.7727 - val_loss: 0.8642
Epoch 3/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 1s/step - accuracy: 0.8262 - loss: 0.7161 - val_accuracy: 0.8136 - val_loss: 0.5387
Epoch 4/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 1s/step - accuracy: 0.9275 - loss: 0.3392 - val_accuracy: 0.9227 - val_loss: 0.2272
Epoch 5/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1s/step - accuracy: 0.9583 - loss: 0.1343 - val_accuracy: 0.9364 - val_loss: 0.1823


In [67]:
print("GRU model Score---> ", gru_model.evaluate(x_test, y_test, verbose=0))

GRU model Score--->  [0.20839270949363708, 0.918181836605072]


**Hence, GRU produces an accuracy of 91.8%**

*GRU is a form of RNN that are better than simple RNN and are often faster than LSTM due to its relatively fewer training parameters.*

## LSTM

In [76]:
lstm_model = Sequential(name="LSTM_Model")
lstm_model.add(Embedding(vocab_size,
                         embd_len))
lstm_model.add(LSTM(128,
                    activation='tanh',
                    return_sequences=False))
lstm_model.add(Dense(units=4, activation='softmax'))

print(lstm_model.summary())

None


In [77]:
lstm_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy']
)

In [78]:
history3 = lstm_model.fit(x_train, y_train,
                         batch_size=64,
                         epochs=5,
                         verbose=1,
                         validation_data=(x_val, y_val))

Epoch 1/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 941ms/step - accuracy: 0.3534 - loss: 1.2715 - val_accuracy: 0.3652 - val_loss: 1.1013
Epoch 2/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 909ms/step - accuracy: 0.4849 - loss: 1.0515 - val_accuracy: 0.3848 - val_loss: 1.1333
Epoch 3/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1s/step - accuracy: 0.6356 - loss: 0.8441 - val_accuracy: 0.8682 - val_loss: 0.5071
Epoch 4/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 946ms/step - accuracy: 0.8984 - loss: 0.3779 - val_accuracy: 0.6833 - val_loss: 0.7209
Epoch 5/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1s/step - accuracy: 0.9287 - loss: 0.2101 - val_accuracy: 0.9485 - val_loss: 0.1386


In [82]:
print("LSTM model Score---> ", lstm_model.evaluate(x_test, y_test, verbose=0))

LSTM model Score--->  [0.15326866507530212, 0.9424242377281189]


**Hence, LSTM produces an accuracy of 94.2% in text classification**

LSTM is better at caputing sequential information in memory than SimpleRNN.

## Bidirectional LSTM

In [72]:
bi_lstm_model = Sequential(name="Bidirectional_LSTM")
bi_lstm_model.add(Embedding(vocab_size,
                            embd_len))
bi_lstm_model.add(Bidirectional(LSTM(128,
                                     activation='tanh',
                                     return_sequences=False)))
bi_lstm_model.add(Dense(4, activation='softmax'))

print(bi_lstm_model.summary())

In [73]:
bi_lstm_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy']
)

In [79]:
history4 = bi_lstm_model.fit(x_train, y_train,
                         batch_size=64,
                         epochs=5,
                         verbose=1,
                         validation_data=(x_val, y_val))

Epoch 1/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2s/step - accuracy: 0.3551 - loss: 1.2205 - val_accuracy: 0.4348 - val_loss: 1.0716
Epoch 2/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2s/step - accuracy: 0.4710 - loss: 1.0179 - val_accuracy: 0.6788 - val_loss: 0.7637
Epoch 3/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 2s/step - accuracy: 0.8228 - loss: 0.5436 - val_accuracy: 0.8864 - val_loss: 0.3678
Epoch 4/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2s/step - accuracy: 0.9503 - loss: 0.2109 - val_accuracy: 0.9273 - val_loss: 0.2525
Epoch 5/5
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 2s/step - accuracy: 0.9520 - loss: 0.1190 - val_accuracy: 0.9318 - val_loss: 0.2350


In [80]:
print("Bidirectional LSTM model Score---> ",
      bi_lstm_model.evaluate(x_test, y_test, verbose=0))

Bidirectional LSTM model Score--->  [0.16641320288181305, 0.9363636374473572]


**Hence, Bidirectional LSTM produces an accuracy of 93.6%**

*Here, two LSTMs are used to capture both the forward and backward sequences of the input. This helps in capturing the context better than normal LSTM*

## Overall

LSTM has the highest accuracy for this dataset at 94.2%