### proof of concept project for sequential to classification

## Dataset Information

#### [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/competitions/nlp-getting-started/data)

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
import random
import re
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim.downloader as api

# Load the model
model = api.load('word2vec-google-news-300')


In [2]:
word_token_vector_size = 300

## LSTM Classifier based with bidirectional=False and num_layers=1

### Model definition

In [3]:
class LSTMClassifierNetwork(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, num_classes, bidirectional):
        super().__init__()
        self.num_classes = num_classes
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, bidirectional=bidirectional, batch_first=True)
        self.linear_input_size = hidden_size * 2 if self.bidirectional else hidden_size
        self.linear_net = nn.Sequential(
            nn.Linear(self.linear_input_size, 16),
            nn.LeakyReLU(),
            nn.Linear(16, 8),
            nn.LeakyReLU(),
            nn.Linear(8, self.num_classes),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        out, (h, c) = self.lstm(x)
        if self.bidirectional:
            h_reshaped_bidirectional = h.reshape(x.shape[0], self.linear_input_size)
            res = self.linear_net(h_reshaped_bidirectional)
            return res

        else:
            res = self.linear_net(h.squeeze(0))
            return res


### Model initialization

In [4]:
lcn = LSTMClassifierNetwork(input_size=word_token_vector_size, hidden_size=16, num_layers=1, num_classes=2, bidirectional=False)

### Text Preprocessing - Removing: words that start with symbols (@, #), words that start with links (http, https)

In [5]:
def remove_symbol_words(text):
    # Define the regex pattern to match words starting with symbols
    pattern = r"[^\w\s]"  # Matches any character that is not alphanumeric or whitespace
    result = re.sub(pattern, "", text)
    return result.lower()

def remove_http(sentence):
    words = sentence.split()
    return " ".join(["" if word.startswith("http") else word for word in words]).strip()

# Download stopwords list
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

In [6]:
df = pd.read_csv('./train.csv')
df.drop(columns=['keyword', 'location', 'id'], inplace=True)
remove_symbol_words('Our deeds are the Reason of this #earthquake.')

# removing symbols and links
cleaned_sentences = []
for sentence in df['text']:
    cleaned_sentence = remove_stopwords(sentence)
    cleaned_sentence = remove_symbol_words(cleaned_sentence)
    cleaned_sentence = remove_http(cleaned_sentence)
    cleaned_sentences.append(cleaned_sentence)

# for each cleaned sentence, form a list of words representing a sentence
# for eg: sentence 1 = ['today', 'we', 'went', 'to', 'the', 'beach']
corpus_sentences = []
for sentence in cleaned_sentences:
    split_sentence = sentence.split()
    corpus_sentences.append(split_sentence)


In [7]:
cleaned_sentences

['deeds reason earthquake may allah forgive us',
 'forest fire near la ronge sask canada',
 'residents asked shelter place notified officers evacuation shelter place orders expected',
 '13000 people receive wildfires evacuation orders california',
 'got sent photo ruby alaska smoke wildfires pours school',
 'rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires',
 'flood disaster heavy rain causes flash flooding streets manitou colorado springs areas',
 'm top hill see fire woods',
 's emergency evacuation happening building across street',
 'm afraid tornado coming area',
 'three people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gon na gon na fvck flooding',
 'raining flooding florida tampabay tampa 18 19 days ve lost count',
 'flood bago myanmar arrived bago',
 'damage school bus 80 multi car crash breaking',
 's man',
 'love fruits',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'ridiculous',
 '

In [8]:
word2vec_model = gensim.models.Word2Vec(sentences=corpus_sentences, vector_size=word_token_vector_size)
# word2vec_model = gensim.models.Word2Vec.load('GoogleNews-vectors-negative300.bin.gz')

In [9]:
sentence_tensor_list = []
max_sentence_tensor_len = 0

# for each word list (sentence)
for sentence in corpus_sentences:
    sentence_vector_list = []
    print(f'WORKING ON SENTENCE {sentence}')

    # for each word in the word list
    for word in sentence:
        try:
            # each word vector
            # print(f'Converting word {word} to vector ')
            # convert each word to vector format using word2vec
            word_vector = model[word].copy()

            # list of word vectors, forming list of tensors / vectors for a sentence
            sentence_vector_list.append(torch.from_numpy(word_vector))
        except:
            pass

    # each sentence_tensor => list of tensors
    # i.e each sentence now is a list of tensors
    sentence_tensor = torch.from_numpy(np.array(sentence_vector_list))

    if sentence_tensor.shape[0] > max_sentence_tensor_len:
        max_sentence_tensor_len = sentence_tensor.shape[0]

    # and each sentence_tensor is stored in a parent list => sentence_tensor_list
    sentence_tensor_list.append(sentence_tensor)

WORKING ON SENTENCE ['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']
WORKING ON SENTENCE ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']
WORKING ON SENTENCE ['residents', 'asked', 'shelter', 'place', 'notified', 'officers', 'evacuation', 'shelter', 'place', 'orders', 'expected']
WORKING ON SENTENCE ['13000', 'people', 'receive', 'wildfires', 'evacuation', 'orders', 'california']
WORKING ON SENTENCE ['got', 'sent', 'photo', 'ruby', 'alaska', 'smoke', 'wildfires', 'pours', 'school']
WORKING ON SENTENCE ['rockyfire', 'update', 'california', 'hwy', '20', 'closed', 'directions', 'due', 'lake', 'county', 'fire', 'cafire', 'wildfires']
WORKING ON SENTENCE ['flood', 'disaster', 'heavy', 'rain', 'causes', 'flash', 'flooding', 'streets', 'manitou', 'colorado', 'springs', 'areas']
WORKING ON SENTENCE ['m', 'top', 'hill', 'see', 'fire', 'woods']
WORKING ON SENTENCE ['s', 'emergency', 'evacuation', 'happening', 'building', 'across', 'street']
WORKING ON SENTENCE ['m', 

In [10]:
# each sentence_tensor element has a shape of [x, 100], where x denotes the number of words in that sentence
# however, with varying sequence lengths, we cannot harness the power of vectorisation during training and we would have to
# iterate over every sentence_tensor individually
sentence_tensor_list[0].shape

torch.Size([7, 300])

In [11]:
# we create a pad tensor, which is a buffer word, to pad the sentence sequence length to the max seen sentence sequence length
pad_tensor = torch.zeros(1, word_token_vector_size)

In [12]:
padded_sentence_tensor_tensor = []

# for each sentence_tensor
for sentence_tensor in sentence_tensor_list:
    to_be_added = max_sentence_tensor_len - sentence_tensor.shape[0]

    # we repeatedly add the pad (buffer) tensor to make the length of each sentence tensor the same 
    # which is the max seen sequence length, which in this case is 31
    for i in range(to_be_added):
        sentence_tensor = torch.cat((sentence_tensor, torch.zeros(1, word_token_vector_size)))

    padded_sentence_tensor_tensor.append(sentence_tensor)

# converting to tensor of sentence_tensor from list of sentence_tensor
padded_sentence_tensor_tensor = torch.cat([t.unsqueeze(0) for t in padded_sentence_tensor_tensor], dim=0)
print(padded_sentence_tensor_tensor.shape)


torch.Size([7613, 22, 300])


#### We now have a batch of 7613 sentences, with 31 words of vector length 100

### Assigning X and Y , their respective values for training

In [13]:
X = padded_sentence_tensor_tensor
Y = torch.from_numpy(df['target'].to_numpy())

### Splitting into train test splits

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, train_size=0.9)

In [15]:
X.shape[0]

7613

In [16]:
# defining the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lcn.parameters(), lr=3e-4)

### Training loop

In [17]:
epochs = 1000
for i in range(epochs):
    loss = loss_fn(lcn(X_train.float()), Y_train)
    print(f'Epoch {i + 1} : Loss {loss}')
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Epoch 1 : Loss 0.6841542720794678
Epoch 2 : Loss 0.6841447949409485
Epoch 3 : Loss 0.6841356158256531
Epoch 4 : Loss 0.6841266751289368
Epoch 5 : Loss 0.68411785364151
Epoch 6 : Loss 0.684109628200531
Epoch 7 : Loss 0.684101402759552
Epoch 8 : Loss 0.6840935349464417
Epoch 9 : Loss 0.6840859651565552
Epoch 10 : Loss 0.6840786337852478
Epoch 11 : Loss 0.6840718388557434
Epoch 12 : Loss 0.6840652227401733
Epoch 13 : Loss 0.6840588450431824
Epoch 14 : Loss 0.6840526461601257
Epoch 15 : Loss 0.6840466260910034
Epoch 16 : Loss 0.6840410232543945
Epoch 17 : Loss 0.6840357184410095
Epoch 18 : Loss 0.6840308308601379
Epoch 19 : Loss 0.6840262413024902
Epoch 20 : Loss 0.6840222477912903
Epoch 21 : Loss 0.6840192675590515
Epoch 22 : Loss 0.6840158104896545
Epoch 23 : Loss 0.6840123534202576
Epoch 24 : Loss 0.6840091347694397
Epoch 25 : Loss 0.6840059161186218
Epoch 26 : Loss 0.6840031147003174
Epoch 27 : Loss 0.6840000748634338
Epoch 28 : Loss 0.6839973330497742
Epoch 29 : Loss 0.683994889259338

In [18]:
# X_train.shape[0]

In [19]:
# Y_train.shape[0]

In [20]:
# df['target'].shape

In [21]:

def accuracy(output, target):
    """
    Computes the accuracy for a 2-class classification problem.
    
    Parameters:
    - output (torch.Tensor): Model outputs in softmax format of shape (batch_size, 2).
    - target (torch.Tensor): True labels of shape (batch_size,) with values 0 or 1.
    
    Returns:
    - accuracy (float): The accuracy of the predictions.
    """
    # Get the predicted classes from the softmax output
    pred = torch.argmax(output, dim=1)
    
    # Compare predicted classes to the true labels
    correct = (pred == target).sum().item()
    
    # Calculate accuracy
    accuracy = correct / target.size(0)
    
    return accuracy


### We achieve an accuracy of ~80% in our first pass without any hyperparameter tuning / optimization

In [22]:
# train accuracy
accuracy(lcn(X_train.float()), Y_train)

0.8080572179243906

In [23]:
# test accuracy
accuracy(lcn(X_test.float()), Y_test)

0.7860892388451444