### proof of concept project for sequential to classification

## Dataset Information

#### [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/competitions/nlp-getting-started/data)

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
import random
import re
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
word_token_vector_size = 100

## LSTM Classifier based with bidirectional=False and num_layers=1

### Model definition

In [3]:
class LSTMClassifierNetwork(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, num_classes, bidirectional):
        super().__init__()
        self.num_classes = num_classes
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, bidirectional=bidirectional, batch_first=True)
        self.linear_input_size = hidden_size * 2 if self.bidirectional else hidden_size
        self.linear_net = nn.Sequential(
            nn.Linear(self.linear_input_size, 16),
            nn.LeakyReLU(),
            nn.Linear(16, 8),
            nn.LeakyReLU(),
            nn.Linear(8, self.num_classes),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        out, (h, c) = self.lstm(x)
        if self.bidirectional:
            h_reshaped_bidirectional = h.reshape(x.shape[0], self.linear_input_size)
            res = self.linear_net(h_reshaped_bidirectional)
            return res

        else:
            res = self.linear_net(h.squeeze(0))
            return res


### Model initialization

In [4]:
lcn = LSTMClassifierNetwork(input_size=word_token_vector_size, hidden_size=16, num_layers=1, num_classes=2, bidirectional=False)

### Text Preprocessing - Removing: words that start with symbols (@, #), words that start with links (http, https)

In [5]:
def remove_symbol_words(text):
    # Define the regex pattern to match words starting with symbols
    pattern = r"[^\w\s]"  # Matches any character that is not alphanumeric or whitespace
    result = re.sub(pattern, "", text)
    return result.lower()

def remove_http(sentence):
    words = sentence.split()
    return " ".join(["" if word.startswith("http") else word for word in words]).strip()

# Download stopwords list
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

In [6]:
df = pd.read_csv('./train.csv')
df.drop(columns=['keyword', 'location', 'id'], inplace=True)
remove_symbol_words('Our deeds are the Reason of this #earthquake.')

# removing symbols and links
cleaned_sentences = []
for sentence in df['text']:
    # cleaned_sentence = remove_stopwords(sentence)
    cleaned_sentence = remove_symbol_words(sentence)
    cleaned_sentence = remove_http(cleaned_sentence)
    cleaned_sentences.append(cleaned_sentence)

# for each cleaned sentence, form a list of words representing a sentence
# for eg: sentence 1 = ['today', 'we', 'went', 'to', 'the', 'beach']
corpus_sentences = []
for sentence in cleaned_sentences:
    split_sentence = sentence.split()
    corpus_sentences.append(split_sentence)


In [7]:
corpus_sentences[0]

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'of',
 'this',
 'earthquake',
 'may',
 'allah',
 'forgive',
 'us',
 'all']

In [8]:
word2vec_model = gensim.models.Word2Vec(sentences=corpus_sentences, vector_size=word_token_vector_size)
# word2vec_model = gensim.models.Word2Vec.load('GoogleNews-vectors-negative300.bin.gz')

In [9]:
sentence_tensor_list = []
max_sentence_tensor_len = 0

# for each word list (sentence)
for sentence in corpus_sentences:
    sentence_vector_list = []
    print(f'WORKING ON SENTENCE {sentence}')

    # for each word in the word list
    for word in sentence:
        try:
            # each word vector
            # print(f'Converting word {word} to vector ')
            # convert each word to vector format using word2vec
            word_vector = word2vec_model.wv[word].copy()

            # list of word vectors, forming list of tensors / vectors for a sentence
            sentence_vector_list.append(torch.from_numpy(word_vector))
        except:
            pass
    
    # each sentence_tensor => list of tensors
    # i.e each sentence now is a list of tensors
    sentence_tensor = torch.from_numpy(np.array(sentence_vector_list))

    if sentence_tensor.shape[0] > max_sentence_tensor_len:
        max_sentence_tensor_len = sentence_tensor.shape[0]

    # and each sentence_tensor is stored in a parent list => sentence_tensor_list
    sentence_tensor_list.append(sentence_tensor)

WORKING ON SENTENCE ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']
WORKING ON SENTENCE ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']
WORKING ON SENTENCE ['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']
WORKING ON SENTENCE ['13000', 'people', 'receive', 'wildfires', 'evacuation', 'orders', 'in', 'california']
WORKING ON SENTENCE ['just', 'got', 'sent', 'this', 'photo', 'from', 'ruby', 'alaska', 'as', 'smoke', 'from', 'wildfires', 'pours', 'into', 'a', 'school']
WORKING ON SENTENCE ['rockyfire', 'update', 'california', 'hwy', '20', 'closed', 'in', 'both', 'directions', 'due', 'to', 'lake', 'county', 'fire', 'cafire', 'wildfires']
WORKING ON SENTENCE ['flood', 'disaster', 'heavy', 'rain', 'causes', 'flash', 'flooding', 'of', 'streets', 'in', 'manitou', 'colorado', 'sp

In [10]:
# each sentence_tensor element has a shape of [x, 100], where x denotes the number of words in that sentence
# however, with varying sequence lengths, we cannot harness the power of vectorisation during training and we would have to
# iterate over every sentence_tensor individually
sentence_tensor_list[0].shape

torch.Size([11, 100])

In [11]:
# we create a pad tensor, which is a buffer word, to pad the sentence sequence length to the max seen sentence sequence length
pad_tensor = torch.zeros(1, word_token_vector_size)

In [12]:
padded_sentence_tensor_tensor = []

# for each sentence_tensor
for sentence_tensor in sentence_tensor_list:
    to_be_added = max_sentence_tensor_len - sentence_tensor.shape[0]

    # we repeatedly add the pad (buffer) tensor to make the length of each sentence tensor the same 
    # which is the max seen sequence length, which in this case is 31
    for i in range(to_be_added):
        sentence_tensor = torch.cat((sentence_tensor, torch.zeros(1, word_token_vector_size)))

    padded_sentence_tensor_tensor.append(sentence_tensor)

# converting to tensor of sentence_tensor from list of sentence_tensor
padded_sentence_tensor_tensor = torch.cat([t.unsqueeze(0) for t in padded_sentence_tensor_tensor], dim=0)
print(padded_sentence_tensor_tensor.shape)


torch.Size([7613, 31, 100])


#### We now have a batch of 7613 sentences, with 31 words of vector length 100

### Assigning X and Y , their respective values for training

In [13]:
X = padded_sentence_tensor_tensor
Y = torch.from_numpy(df['target'].to_numpy())

### Splitting into train test splits

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, train_size=0.9)

In [15]:
X.shape[0]

7613

In [16]:
# defining the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lcn.parameters(), lr=3e-4)

### Training loop

In [17]:
epochs = 5000
for i in range(epochs):
    loss = loss_fn(lcn(X_train.float()), Y_train)
    print(f'Epoch {i + 1} : Loss {loss}')
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Epoch 1 : Loss 0.6851736903190613
Epoch 2 : Loss 0.6851325035095215
Epoch 3 : Loss 0.6850917935371399
Epoch 4 : Loss 0.6850513815879822
Epoch 5 : Loss 0.6850115060806274
Epoch 6 : Loss 0.6849719882011414
Epoch 7 : Loss 0.6849336624145508
Epoch 8 : Loss 0.6848958134651184
Epoch 9 : Loss 0.6848583817481995
Epoch 10 : Loss 0.6848214864730835
Epoch 11 : Loss 0.6847848296165466
Epoch 12 : Loss 0.6847488284111023
Epoch 13 : Loss 0.6847133636474609
Epoch 14 : Loss 0.684678316116333
Epoch 15 : Loss 0.6846437454223633
Epoch 16 : Loss 0.6846096515655518
Epoch 17 : Loss 0.6845762133598328
Epoch 18 : Loss 0.6845433115959167
Epoch 19 : Loss 0.6845112442970276
Epoch 20 : Loss 0.6844804286956787
Epoch 21 : Loss 0.6844501495361328
Epoch 22 : Loss 0.6844202876091003
Epoch 23 : Loss 0.6843908429145813
Epoch 24 : Loss 0.6843618154525757
Epoch 25 : Loss 0.6843333840370178
Epoch 26 : Loss 0.6843051910400391
Epoch 27 : Loss 0.6842774748802185
Epoch 28 : Loss 0.6842502951622009
Epoch 29 : Loss 0.684223532676

In [18]:
X_train.shape[0]

6851

In [19]:
Y_train.shape[0]

6851

In [20]:
df['target'].shape

(7613,)

In [21]:

def accuracy(output, target):
    """
    Computes the accuracy for a 2-class classification problem.
    
    Parameters:
    - output (torch.Tensor): Model outputs in softmax format of shape (batch_size, 2).
    - target (torch.Tensor): True labels of shape (batch_size,) with values 0 or 1.
    
    Returns:
    - accuracy (float): The accuracy of the predictions.
    """
    # Get the predicted classes from the softmax output
    pred = torch.argmax(output, dim=1)
    
    # Compare predicted classes to the true labels
    correct = (pred == target).sum().item()
    
    # Calculate accuracy
    accuracy = correct / target.size(0)
    
    return accuracy


### We achieve an accuracy of ~73% in our first pass without any hyperparameter tuning / optimization

In [26]:
# train accuracy
accuracy(lcn(X_train.float()), Y_train)

0.7734637279229309

In [27]:
# test accuracy
accuracy(lcn(X_test.float()), Y_test)

0.7322834645669292