
## LSTM Model

In [None]:
# Download necessary libraries
# !pip install transformers

# Importing necessary libraries 
import os
import re
import pandas as pd
import numpy as np
import string
import pandas as pd
import pickle
import gc
from tqdm import tqdm
from os import name
gc.collect()

# Model Creation and testing
import torch
from torch.utils import data
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.modules import padding
from torch.nn.modules.activation import Sigmoid


# Sklearn 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# For GPU specific run
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [5]:
# Constructing vocabulary using Bag-of-words
def construct_vocabulary(filename):
    data = pd.read_csv(filename)
    data.dropna(inplace=True)
    sentences = " ".join(data["new_sentence"].values).lower()
    unique_word = list(set(sentences.split()))
    print(f"Vocabulary: {len(unique_word)}")
    
    def store_pickle(data, name):
        pickle.dump(data,open(name, "wb"))
    # storing the dataset:
    store_pickle(unique_word, "data/unique_vocab.pickle")

# Split dataset into train and test Train size = 0.7, Test_size = 0.3
def splittFunction(filename):
    df = pd.read_csv(filename)[:20000]
    df.dropna(inplace=True)
    train_x, test_x, train_y, test_y = train_test_split(df["new_sentence"], df["Labels"], test_size=0.3, stratify=df["Labels"])
    train = pd.concat([train_x, train_y], axis=1)
    test  = pd.concat([test_x,  test_y], axis=1)
    train.to_csv("data/train_LSTM.csv", index=False)
    test.to_csv("data/test_LSTM.csv", index=False)


splittFunction("data/new_dataset_LSTM.csv")
construct_vocabulary("data/train_LSTM.csv")


Vocabulary: 125341


In [6]:
# Creating sequential learning dataset for LSTM model training
class dataset(Dataset):
    def __init__(self, pickleFile: str, filename: str   ) -> None:
        super().__init__()
        self.pickleFileName = pickleFile
        self.filename   = filename
        self.data = pd.read_csv(self.filename)
        self.unique_vocab = self._loadPickle(self.pickleFileName)
        self.pad_to_maxlength = 100
        self.cnt = 0

    def _loadPickle(self, filename):
        return pickle.load(open(filename,"rb"))
    
    def _convertNumeric(self, data):
        subsentence = []
        splitted_word = data.split()
        for word in splitted_word:
            if word in self.unique_vocab:
                subsentence.append(1)
            else:
                subsentence.append(0)
            

        if len(subsentence) < self.pad_to_maxlength:
            remaining_length = self.pad_to_maxlength - len(subsentence)
            subsentence = subsentence +[5]*remaining_length
        elif len(subsentence) > self.pad_to_maxlength:
            subsentence = subsentence[:self.pad_to_maxlength]

        assert len(subsentence)  == self.pad_to_maxlength
        return subsentence


    def __getitem__(self, index):
        subdata =self.data.iloc[index]
        sent, label = subdata["new_sentence"], subdata["Labels"]
        numeric_sent = self._convertNumeric(sent)
        self.cnt +=1
        return {
            "input_ids": numeric_sent,
            "label":  label
        }

    def __len__(self):
        return len(self.data)


def collate_fn(data):
    X, Y = [], []
    for dd in data:
        input_ids = dd["input_ids"]
        label    = dd["label"]
        X.append(input_ids)
        Y.append(label)
    
    return {
        "input_ids": torch.tensor(X),
        "label": torch.tensor(Y)
    }

train_dataset = dataset("data/unique_vocab.pickle", "data/train.csv")
# call the dataloader 
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True, collate_fn=collate_fn)



In [7]:
# Creating LSTM model
class MODEL(nn.Module):
    def __init__(self,  num_of_embeddings: int, \
        output_class: int, embedding_dim: int = 128, padding_idx: int = 5, hidden_size: int= 128 ) -> None:
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings = num_of_embeddings, embedding_dim = embedding_dim)
        self.dropout = nn.Dropout()
        #lstm layer 
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size=hidden_size)
     
        # dense layer 
        self.relu = nn.ReLU()
        self.linear = nn.Linear(in_features= hidden_size, out_features= hidden_size)
        self.output = nn.Linear(in_features= hidden_size, out_features= output_class)
    
    # forward layer loop 
    def forward(self,x):
        embedding_output = self.embeddings(x)
        embedding_output = self.dropout(embedding_output)
        output, (hn, cn) = self.lstm(embedding_output)
        output = torch.mean(output,dim=1)
        # apply linear layer 
        output = self.relu(self.linear(output))
        output = self.output(output)
        return output, torch.sigmoid(output)

In [None]:
# Examine F1_score and accuracy of the model
def f1Score(prediction, gt):
    threshold =0.5
    y_pred = [1 if out.item() > threshold else 0 for out in prediction]
    return f1_score(y_pred, gt.detach().cpu().numpy().tolist())
    
def accuracy(prediction, gt):
    threshold =0.5
    y_pred = [1 if out.item() > threshold else 0 for out in prediction]
    return accuracy_score(y_pred, gt.detach().cpu().numpy().tolist())

# Model Training Loop
def main_loop():
    num_of_embeddings = len(pickle.load(open("data/unique_vocab.pickle", "rb")))+1
    output_class  = 1
    model = MODEL(num_of_embeddings, output_class)
    model.to(device)

    # defining the loss function 
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=1e-5)

    # call the dataset:
    train_dataset = dataset("data/unique_vocab.pickle", "data/train_LSTM.csv")
    test_dataset  = dataset("data/unique_vocab.pickle", "data/test_LSTM.csv")

    # call the dataloader 
    trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True, collate_fn=collate_fn)
    testloader  = DataLoader(test_dataset, batch_size=16, shuffle=True, drop_last=True, collate_fn = collate_fn)

    for ep in tqdm(range(10)): #100 iterations
        model.train()
        total_loss = 0.0
        train_f1 = []
        train_acc = []
        for idx, data in tqdm(enumerate(trainloader)):
            optimizer.zero_grad()
            X, Y = data["input_ids"].to(device), data["label"].to(device)
            prediction, sigmoid_pred  = model(X.long())
            loss = loss_fn(prediction.squeeze(), Y.float())
            total_loss += loss.item()
            loss.backward(retain_graph=True)
            optimizer.step()
            train_f1.append(f1Score(sigmoid_pred, Y))
            train_acc.append(accuracy(sigmoid_pred, Y))

        if ep%2 == 0: # result after every 20 episodes
            with torch.no_grad():
                model.eval()
                total_eval_loss = 0.0
                test_f1 = []
                test_acc = [] 
                for idx, data in tqdm(enumerate(testloader)):
                    X, Y = data["input_ids"].to(device), data["label"].to(device)
                    prediction, sigmoid_pred = model(X.long())
                    loss = loss_fn(prediction.squeeze(), Y.float())
                    total_eval_loss += loss.item()
                    test_f1.append(f1Score(sigmoid_pred, Y))
                    test_acc.append(accuracy(sigmoid_pred, Y))
            
            print(f"Train Loss {total_loss/len(trainloader)} Test Loss {total_eval_loss/len(testloader)}")
            print(f"Train Accuracy {np.array(train_acc).mean()} Test Accuracy {np.array(test_acc).mean()}")
            print(f"Train F1 {np.array(train_f1).mean()}  Test F1 {np.array(test_f1).mean()}")

    torch.save({
        "model_state": model.state_dict()  
    }, "lstm_classification.pth")
    
main_loop()

In [None]:
# ### Result
# # LSTM Model iteration result/ summary with model f1_score and accuracy 
# """
# 10%|█         | 1/10 [1:09:09<10:22:24, 4149.40s/it]Train Loss 0.6916796985060669 Test Loss 0.690889467201803
# Train Accuracy 0.5109738372093023 Test Accuracy 0.529891304347826
# Train F1 0.013253995266739165  Test F1 0.1597046899356682

#  30%|███       | 3/10 [3:05:54<7:15:13, 3730.53s/it]Train Loss 0.6891263084356175 Test Loss 0.6882554937316023
# Train Accuracy 0.5587209302325581 Test Accuracy 0.5670855978260869
# Train F1 0.2776830265385991  Test F1 0.30498007382997794

# 50%|█████     | 5/10 [5:01:10<5:02:27, 3629.57s/it]Train Loss 0.6861516629540643 Test Loss 0.6854808743557205
# Train Accuracy 0.5723837209302326 Test Accuracy 0.567764945652174
# Train F1 0.34581146319766193  Test F1 0.32787235783054536

# 70%|███████   | 7/10 [6:57:09<3:00:20, 3606.73s/it]Train Loss 0.683653856709946 Test Loss 0.6834396180575308
# Train Accuracy 0.5734011627906976 Test Accuracy 0.5670855978260869
# Train F1 0.34622344949886935  Test F1 0.3257361990098563

# 90%|█████████  | 9/10 [7:48:09<1:08:28, 3582.77s/it]Train Loss 0.6812515148610264 Test Loss 0.6823069397065198
# Train Accuracy 0.56940980473025118 Test Accuracy 0.5658443697202368
# Train F1 0.34651397643247641  Test F1 0.3229318480364815
# """