- Importing Libraries

In [1]:
!git clone https://github.com/arnav10goel/CSE556-NLP-Project.git
%cd CSE556-NLP-Project

fatal: destination path 'CSE556-NLP-Project' already exists and is not an empty directory.
/kaggle/working/CSE556-NLP-Project


In [2]:
import torch

from transformers import BertModel

import pandas as pd

from torch.utils.data import Dataset, DataLoader

import pandas as pd

import torch

from transformers import BertTokenizer

import string

### Load Dataset for the Particular Language

- Here insert the language initial you want to load the train and test CSVs for

- Language Codes:

    1. Hindi: `hi`

    2. Bengali: `bn`

    3. Marathi: `mr`

    4. Tamil: `ta`

    5. Telugu: `te`

In [3]:
# Define file paths for two languages

languages = ['Hindi', 'Bengali']

train_files = [f"/kaggle/working/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset/{lang}_train.csv" for lang in languages]

test_files = [f"/kaggle/working/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset/{lang}_test.csv" for lang in languages]



# Concatenate train datasets for two languages

train_dfs = []

for train_file in train_files:

    train_dfs.append(pd.read_csv(train_file))

train_df = pd.concat(train_dfs, ignore_index=True)



# Concatenate test datasets for two languages

test_dfs = []

for test_file in test_files:

    test_dfs.append(pd.read_csv(test_file))

test_df = pd.concat(test_dfs, ignore_index=True)



print("Length of Concatenated Train Dataset:", len(train_df))

print("Length of Concatenated Test Dataset:", len(test_df))

Length of Concatenated Train Dataset: 8000
Length of Concatenated Test Dataset: 2000


## Custom Dataset Class

- This class can be used for all Sentiment Analysis fine-tuning and zero shot tasks

- For multiple languages, combine data into one dataframe as the dataset class takes DATAFRAME as input

In [4]:
class MultilingualSentimentAnalysis_Dataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_length=256):

        self.tokenizer = tokenizer

        self.dataframe = dataframe

        self.max_length = max_length



    def __len__(self):

        return len(self.dataframe)



    def __getitem__(self, idx):

        label = self.dataframe.iloc[idx]["Labels Set"]

        input_text = self.dataframe.iloc[idx]["Processed_Post"]

        if pd.isna(input_text):

            input_text = ""



        # Tokenize

        encoding = self.tokenizer.encode_plus(

            input_text, None,

            add_special_tokens=True,

            max_length=self.max_length,

            padding='max_length',

            truncation=True,

            return_tensors='pt'  # Corrected the argument here

        )



        return {

            'input_ids': encoding['input_ids'].flatten(),

            'attention_mask': encoding['attention_mask'].flatten(),

            'token_type_ids': encoding['token_type_ids'].flatten(),

            'labels': torch.tensor(label, dtype=torch.float)

        }


In [5]:
from transformers import AutoTokenizer, AutoModel
# Get the model name

model_name = "ai4bharat/indic-bert"


# Initialize IndicBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [6]:
# Initialise dataset instances

train_dataset = MultilingualSentimentAnalysis_Dataset(train_df, tokenizer)

dev_dataset = MultilingualSentimentAnalysis_Dataset(test_df, tokenizer)



# Initialise the dataloaders

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

dev_dataloader = DataLoader(dev_dataset, batch_size=32, shuffle=False)

In [7]:
import torch

import torch.nn as nn



class MBertForSentimentAnalysis(nn.Module):

    def __init__(self, freeze_bert=False):

        super(MBertForSentimentAnalysis, self).__init__()



        # Load mBERT model and tokenizer

        self.model_name = "ai4bharat/indic-bert"

        # tokenizer = BertTokenizer.from_pretrained(model_name)

        self.mbert = AutoModel.from_pretrained(self.model_name)



        # Add a batch normalization layer

        self.batch_norm = nn.BatchNorm1d(self.mbert.config.hidden_size)

        

        # Add a linear layer for classification

        self.classification = nn.Linear(self.mbert.config.hidden_size, 2)



        # Option to freeze MBERT layers to prevent them from being updated during training

        if freeze_bert:

            for param in self.mbert.parameters():

                param.requires_grad = False



    def forward(self, input_ids, attention_mask):

        # Get the output from BERT model

        _, pooled_outputs = self.mbert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)



        # Pass output through batch normalization layer

        pooled_outputs = self.batch_norm(pooled_outputs)

        

        # Pass output through linear layer

        out = self.classification(pooled_outputs)

        return out

- Pass freeze as true when doing zero shot to prevent BERT from getting fine-tuned

In [8]:
# Freeze BERT encoder to prevent fine-tuning

model = MBertForSentimentAnalysis()

if torch.cuda.is_available():

    device = torch.device("cuda")

    model.to(device)  # Move model to CUDA device if available

    print("Using CUDA")

else:

    device = torch.device("cpu")

    print("CUDA is not available. Using CPU instead.")

Using CUDA


In [9]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

mse_loss = torch.nn.CrossEntropyLoss()



num_epochs = 4



train_losses = []

val_losses = []



for epoch in range(num_epochs):

    # Training

    model.train()  

    train_loss = 0

    for batch in train_dataloader:

        # Forward pass

        inputs = {k: v.to(device) for k, v in batch.items()}

        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

        loss = mse_loss(outputs, inputs['labels'].long())



        # Backward pass and optimize

        loss.backward()

        optimizer.step()

        optimizer.zero_grad()



        train_loss += loss.item()



    # Validation

    model.eval()  

    val_loss = 0

    with torch.no_grad():  # No need to compute gradients during validation

        for batch in dev_dataloader:

            inputs = {k: v.to(device) for k, v in batch.items()}

            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

            loss = mse_loss(outputs, inputs['labels'].long())

            val_loss += loss.item()



    # Calculate average losses

    avg_train_loss = train_loss / len(train_dataloader)

    avg_val_loss = val_loss / len(dev_dataloader)



    # Append the losses for plotting

    train_losses.append(avg_train_loss)

    val_losses.append(avg_val_loss)



    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

Epoch 1/4, Train Loss: 0.6530, Validation Loss: 0.5979
Epoch 2/4, Train Loss: 0.5363, Validation Loss: 0.5246
Epoch 3/4, Train Loss: 0.4491, Validation Loss: 0.5070
Epoch 4/4, Train Loss: 0.3974, Validation Loss: 0.6499


### Evaluation of Fine-Tuned Model

- Evaluate fine-tuned model on test dataset of each language

In [10]:
# Load datasets for each language

languages = ["Hindi", "Marathi", "Bengali", "Tamil", "Telugu"]



from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.metrics import f1_score



with torch.no_grad():

    for lang in languages:

        test_file = f"/kaggle/working/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset/{lang}_test.csv"

        test_df = pd.read_csv(test_file)

        test_dataloader = DataLoader(MultilingualSentimentAnalysis_Dataset(test_df, tokenizer), batch_size=16, shuffle=True)

        

        # Make list for predicted labels and ground truth labels

        predicted_labels = []

        labels = []



        # Perform inference

        for batch in test_dataloader:

            inputs = {k: v.to(device) for k, v in batch.items()}

            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

            predicted_labels.extend(torch.argmax(outputs, dim=1).tolist())

            labels.extend(inputs['labels'].long().tolist())

        



        # Print results for a particular language

        print(f"RESULTS FOR {lang}")

        print()

        # Calculate accuracy

        accuracy = accuracy_score(labels, predicted_labels) 

        print(f'Accuracy: {accuracy:.4f}')



        # Calculate F1-score

        weighted_f1_score = f1_score(labels, predicted_labels, average='weighted')

        macro_f1_score = f1_score(labels, predicted_labels, average='macro')

        print(f'Weighted F1-score: {weighted_f1_score:.4f}')

        print(f'Macro F1-score: {macro_f1_score:.4f}')

        print()



        # Print classification report

        print("Classification Report")

        print(classification_report(labels, predicted_labels))

        print()

        # Print confusion matrix

        print("Confusion Matrix")

        print(confusion_matrix(labels, predicted_labels))

        print()

        print("----------*******************---------------")

    

RESULTS FOR Hindi

Accuracy: 0.6140
Weighted F1-score: 0.5724
Macro F1-score: 0.5724

Classification Report
              precision    recall  f1-score   support

           0       0.80      0.30      0.44       500
           1       0.57      0.93      0.71       500

    accuracy                           0.61      1000
   macro avg       0.69      0.61      0.57      1000
weighted avg       0.69      0.61      0.57      1000


Confusion Matrix
[[151 349]
 [ 37 463]]

----------*******************---------------
RESULTS FOR Marathi

Accuracy: 0.5350
Weighted F1-score: 0.4470
Macro F1-score: 0.4470

Classification Report
              precision    recall  f1-score   support

           0       0.67      0.14      0.23       500
           1       0.52      0.93      0.67       500

    accuracy                           0.54      1000
   macro avg       0.60      0.54      0.45      1000
weighted avg       0.60      0.54      0.45      1000


Confusion Matrix
[[ 68 432]
 [ 33 467]]


In [11]:
# Save Model and Tokenizer

model_save_path = "IndicBERT_HS_Bn-Hi_FineTune.pth"

torch.save(model.state_dict(), model_save_path)