In [23]:
%pip install pytorch-lightning -q -U
!pip install emoji==1.5.0


import torch

import pandas as pd
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

# Fetch the Punkt tokenizer data used by nltk.word_tokenize (called in process_data).
nltk.download('punkt')

import re
import pickle
import emoji


import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from sklearn.metrics import classification_report



[0mNote: you may need to restart the kernel to use updated packages.
[0m[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Train/test leakage check: inner-joining the test set with the training set on `text` should give an empty DataFrame

In [24]:
# Read both splits, then inner-join them on the raw text column. Every row that
# survives the join is a sentence present in both the training and the test data.
df_train = pd.read_csv("/kaggle/input/traindatadeslab/hindi_train_val.csv")
df_test = pd.read_csv("/kaggle/input/testdatadeslab/hindi_test.csv")

df_test = pd.merge(df_test, df_train, on="text")
df_test

Unnamed: 0,label_x,text,label_y


In [25]:
# Single switch for the whole notebook:
#   True  -> evaluate the previously saved model on the held-out test CSV
#   False -> train a new model on the train/validation CSV and save it
TEST = True

TEST_CSV = "/kaggle/input/testdatadeslab/hindi_test.csv"
TRAIN_CSV = "/kaggle/input/traindatadeslab/hindi_train_val.csv"

# The input path is derived from TEST so the flag and the file can never
# disagree (no commented-out alternatives to keep in sync by hand).
file = TEST_CSV if TEST else TRAIN_CSV

In [26]:
# Vocabulary shared by the mapping helpers. Id 0 is the value sentences are
# padded with, and id 1 is what word_mapping_test emits for unseen words.
# word_mapping_train adds new entries to this dict as it encounters new words.
word_to_index = {"<PAD>": 0, "<UNK>": 1}
# Every sentence is truncated / zero-padded to exactly this many token ids.
SEQ_LEN = 20


# Model sizes passed to ATDModel and settings used by the training loop.
EMBEDDING_DIM = 512   # width of each word-embedding vector
HIDDEN_DIM    = 256   # hidden size given to ATDModel (LSTM and first linear layer)
NUM_EPOCHS    = 100   # passed as max_epochs; early stopping may end training sooner
BATCH_SIZE    = 32    # batch size of the train, validation and test DataLoaders

# Helper Functions

In [27]:

def extract_emojis(s):
    """Return ``s`` with emoji isolated by spaces and converted to text.

    Each character found in ``emoji.UNICODE_EMOJI['en']`` is wrapped in single
    spaces so it becomes its own token. The string is then passed through
    ``emoji.demojize`` and every ':' is removed from the result, including
    colons that were already present in the original text.
    """
    pieces = []
    for ch in s:
        if ch in emoji.UNICODE_EMOJI['en']:
            pieces.append(' ' + ch + ' ')
        else:
            pieces.append(ch)
    spaced = ''.join(pieces)
    return emoji.demojize(spaced).replace(":", "")

def word_mapping_train(sentence, vocab=None, seq_len=None):
    """Encode a tokenised sentence as a fixed-length id array, growing the vocabulary.

    Args:
        sentence: iterable of tokens (hashable, normally ``str``).
        vocab: dict mapping token -> id. It is updated in place: every token
            not yet present receives the next free id ``len(vocab)``.
            Defaults to the module-level ``word_to_index``.
        seq_len: length of the returned array. Defaults to the module-level
            ``SEQ_LEN``.

    Returns:
        ``numpy.ndarray`` of dtype int64 and shape ``(seq_len,)`` holding the
        ids of the first ``seq_len`` tokens, right-padded with 0.

    Note:
        Tokens beyond ``seq_len`` are still added to the vocabulary even
        though they are cut from the returned array.
    """
    if vocab is None:
        vocab = word_to_index
    if seq_len is None:
        seq_len = SEQ_LEN

    ids = []
    for word in sentence:
        # Explicit membership test instead of a bare `except:` that would
        # also have hidden unrelated errors.
        if word not in vocab:
            vocab[word] = len(vocab)
        ids.append(vocab[word])

    # Start from an integer zero vector so an empty sentence also yields
    # int64 ids (np.array([]) would silently become float64).
    encoded = np.zeros(seq_len, dtype=np.int64)
    kept = ids[:seq_len]
    encoded[:len(kept)] = kept
    return encoded

def word_mapping_test(sentence, vocab=None, seq_len=None):
    """Encode a tokenised sentence as a fixed-length id array without changing the vocabulary.

    Args:
        sentence: iterable of tokens (hashable, normally ``str``).
        vocab: dict mapping token -> id; it is only read, never modified.
            Defaults to the module-level ``word_to_index``.
        seq_len: length of the returned array. Defaults to the module-level
            ``SEQ_LEN``.

    Returns:
        ``numpy.ndarray`` of dtype int64 and shape ``(seq_len,)`` holding the
        ids of the first ``seq_len`` tokens, right-padded with 0. Tokens
        missing from ``vocab`` are encoded as 1 (the ``<UNK>`` id).
    """
    if vocab is None:
        vocab = word_to_index
    if seq_len is None:
        seq_len = SEQ_LEN

    unk_id = 1  # id reserved for "<UNK>" in the initial vocabulary
    # dict.get replaces the bare `except:` fallback used for unknown words.
    ids = [vocab.get(word, unk_id) for word in sentence]

    # Integer zero vector so an empty sentence also yields int64 ids.
    encoded = np.zeros(seq_len, dtype=np.int64)
    kept = ids[:seq_len]
    encoded[:len(kept)] = kept
    return encoded


# Preprocessing

In [28]:
def load_data(path):
    """Read the CSV at ``path`` with pandas and return the resulting DataFrame."""
    return pd.read_csv(path)

def process_data(df, test=False):
    """Turn a raw DataFrame into model-ready tensors.

    The ``text`` column is passed through ``extract_emojis``, then
    ``nltk.word_tokenize``, then one of the word-mapping helpers
    (``word_mapping_train`` when ``test`` is false, which grows the vocabulary;
    ``word_mapping_test`` otherwise, which only reads it).

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column. It is not
            modified, so the same frame can be processed more than once.
        test: choose the read-only test mapping instead of the training one.

    Returns:
        ``(X, Y)`` where ``X`` is an int64 tensor with one row of token ids
        per input row, and ``Y`` is a tensor built from the ``label`` column.
    """
    mapper = word_mapping_test if test else word_mapping_train

    # Build a derived Series instead of writing back into df["text"], which
    # would leave the caller's frame holding id arrays instead of strings.
    token_ids = (
        df["text"]
        .apply(extract_emojis)
        .apply(nltk.word_tokenize)
        .apply(mapper)
    )

    # Convert to one contiguous integer matrix first; handing torch.tensor a
    # Python list of ndarrays is slow and infers the dtype per element.
    id_matrix = np.array(list(token_ids), dtype=np.int64)
    X = torch.from_numpy(id_matrix)

    Y = torch.tensor(list(df["label"]))
    return X, Y

# Model

In [29]:


class ATDModel(pl.LightningModule):
    """LSTM classifier: embedding -> LSTM -> two linear layers -> sigmoid.

    ``forward`` returns probabilities in [0, 1] of shape
    ``(batch, tagset_size)``; the train/validation/test steps score them with
    binary cross-entropy (``nn.BCELoss``) against the targets.

    Args:
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output units.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size and width of the first linear layer.
        num_layers: number of stacked LSTM layers.
        dropout: accepted for interface compatibility but currently unused;
            the dropout layers below are built with their own fixed rates.

    Note:
        ``dropout1``/``dropout2`` and ``batch_norm1``/``batch_norm2`` are
        registered but never called in ``forward``. They are kept so that the
        module's attributes and ``state_dict`` keys stay the same as those of
        already-saved models.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=False)
        self.dropout1 = nn.Dropout()  # not applied in forward
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.batch_norm1 = nn.BatchNorm1d(num_features=hidden_dim)  # not applied in forward

        self.dropout2 = nn.Dropout(0.8)  # not applied in forward
        self.fc2 = nn.Linear(hidden_dim, tagset_size)
        self.batch_norm2 = nn.BatchNorm1d(num_features=tagset_size)  # not applied in forward

        self.loss_fn = nn.BCELoss()

    def forward(self, x):
        """Map a batch of token-id sequences to per-sample probabilities.

        ``x`` is indexed as (batch, time) because the LSTM is built with
        ``batch_first=True``. Only the LSTM output at the last time step is
        fed to the linear layers.
        """
        embeds = self.embedding(x)

        lstm_out, _ = self.lstm(embeds)
        # Last time step only. NOTE(review): the word-mapping helpers pad on
        # the right, so for short sentences this position is padding.
        tag_space = self.fc1(lstm_out[:, -1, :])
        tag_space = self.fc2(tag_space)
        return torch.sigmoid(tag_space)

    def _shared_step(self, batch, log_name):
        """Compute the BCE loss for one batch and log it under ``log_name``."""
        x, y = batch
        y_hat = self(x)

        # .float() converts the dtype but keeps the tensor on its current
        # device; .type(torch.FloatTensor) would copy it to the CPU.
        loss = self.loss_fn(y_hat.float().view(-1), y.float().view(-1))

        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` (logged as ``train_loss``)."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for ``batch`` (logged as ``val_loss``)."""
        return self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Return the test loss for ``batch`` (logged as ``test_loss``)."""
        return self._shared_step(batch, 'test_loss')

    def configure_optimizers(self):
        """Optimise all parameters with Adam using its default hyperparameters."""
        return optim.Adam(self.parameters())

# Dataloaders

In [30]:
def train_dataloader():
    """Build the training and validation DataLoaders from the CSV at ``file``.

    The data is processed in training mode (the vocabulary grows while the
    text is encoded) and then split with ``train_test_split`` using a fixed
    seed and stratification on the labels.

    Returns:
        ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    features, labels = process_data(load_data(file))

    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, random_state=42, stratify=labels
    )

    train_loader = DataLoader(
        TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True
    )
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=BATCH_SIZE)

    return train_loader, val_loader

def test_dataloader():
    """Build the evaluation DataLoader from the CSV at ``file``.

    The data is processed in test mode, so the vocabulary is only read, and
    the loader iterates in file order without shuffling.
    """
    features, labels = process_data(load_data(file), test=True)
    return DataLoader(TensorDataset(features, labels), batch_size=BATCH_SIZE)

# Training

In [31]:
def traning():
    """Train an ATDModel and save it together with the vocabulary.

    Builds the loaders with ``train_dataloader``, fits the model on one GPU
    for at most ``NUM_EPOCHS`` epochs with early stopping on ``val_loss``
    (patience 3), then pickles the fitted model to ``lstm_model.sav`` and the
    vocabulary to ``word_to_index.sav`` in the working directory.
    """
    train_loader, val_loader = train_dataloader()

    # The vocabulary was filled while building the loaders, so its size is
    # only final at this point.
    model = ATDModel(vocab_size=len(word_to_index), tagset_size=1, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)
    early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")
    trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping], accelerator='gpu', devices=1)
    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

    # Context managers make sure both files are flushed and closed even if
    # pickling raises.
    with open("lstm_model.sav", 'wb') as model_file:
        pickle.dump(model, model_file)
    with open("word_to_index.sav", 'wb') as vocab_file:
        pickle.dump(word_to_index, vocab_file)

    print("Model Saved")
    

# Testing

In [32]:
def testing():
    """Evaluate the saved model on the data behind ``test_dataloader``.

    Restores the vocabulary and the model from ``word_to_index.sav`` and
    ``lstm_model.sav``, predicts a 0/1 label per sample (probability > 0.5),
    writes the predictions to ``resultLSTM.csv`` and prints a classification
    report against the true labels.
    """
    # The vocabulary must replace the module-level dict because
    # word_mapping_test reads it from there.
    global word_to_index
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that this notebook wrote itself.
    with open("word_to_index.sav", 'rb') as vocab_file:
        word_to_index = pickle.load(vocab_file)
    with open("lstm_model.sav", 'rb') as model_file:
        model = pickle.load(model_file)

    test_loader = test_dataloader()

    model.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for x, y in test_loader:
            # Forward pass: one probability per sample, in a trailing
            # dimension of size tagset_size (1 for the model trained here).
            y_hat = model(x)
            # Flatten before thresholding so predictions are plain 0/1 ints
            # instead of nested one-element lists such as [True].
            y_pred += (y_hat.reshape(-1) > 0.5).int().tolist()

            # Collect the true labels in the same order.
            y_true += y.tolist()

    pd.Series(y_pred).to_csv("resultLSTM.csv", index=False)
    print("Result saved in resultLSTM.csv")
    print(classification_report(y_true, y_pred))

In [33]:
# Dispatch on the notebook-wide TEST switch: evaluate the saved model when it
# is truthy, otherwise train a new model and save it.
if TEST:
    testing()
else:
    traning()
    



Result saved in resultLSTM.csv
              precision    recall  f1-score   support

           0       0.79      0.76      0.77      3496
           1       0.75      0.78      0.76      3232

    accuracy                           0.77      6728
   macro avg       0.77      0.77      0.77      6728
weighted avg       0.77      0.77      0.77      6728

