- Importing Libraries

In [47]:
# !git clone https://github.com/arnav10goel/CSE556-NLP-Project.git
# %cd CSE556-NLP-Project

In [48]:
import torch

from transformers import BertModel

import pandas as pd

from torch.utils.data import Dataset, DataLoader

import pandas as pd

import torch

from transformers import BertTokenizer

import string



### Load Dataset for the Particular Language

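- Set `lang` in the cell below to the language whose train and test CSVs you want to load. The value is substituted verbatim into the file names (`{lang}_train.csv`, `{lang}_test.csv`); note that the cell below uses the full language name (`'Marathi'`) rather than a two-letter code.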

- Language Codes:

    1. Hindi: `hi`

    2. Bengali: `bn`

    3. Marathi: `mr`

    4. Tamil: `ta`

    5. Telugu: `te`

In [49]:
# Language selector; it is substituted verbatim into the CSV file names below.
lang = 'Marathi'

# Locations of the train / test CSVs for the selected language.
train_file = f"/kaggle/working/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset/{lang}_train.csv"
test_file = f"/kaggle/working/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset/{lang}_test.csv"

# Read both splits into DataFrames in one pass (train first, then test).
train_df, test_df = map(pd.read_csv, (train_file, test_file))

In [50]:
# Display the training DataFrame as this cell's output for a quick visual check.
train_df

Unnamed: 0,Post,Labels Set,Dataset,Processed_Post
0,@laxmansingh1663 @SandeepNews_ अतिसुंदर कांड,0,hate_bin_train,अतिसुंदर कांड
1,आज@Dev_Fadnavis ला कोर्टात हजर राहाव लागल. .त्...,1,hate_bin_train,आज ला कोर्टात हजर राहाव लागल त्याची अगदी पेठेत...
2,@cricbuzz अरे नागपूर ला घ्यायची होती न एखाद मॅच 🤬,0,hate_bin_train,अरे नागपूर ला घ्यायची होती न एखाद मॅच
3,@MLARajanSalvi @rautsanjay61 चुतीया... हा शब्द...,1,hate_bin_train,चुतीया हा शब्द शिवसैनिकांसाठी सुसंस्कृत आहे वाटत
4,अबे चुतीया एव्हडा मराठी द्वेष का?? तुमच्या घरी...,1,hate_bin_train,अबे चुतीया एव्हडा मराठी द्वेष का तुमच्या घरी क...
...,...,...,...,...
3995,@marathifeel @Archanagsanap2 छान! गाठभेटी बदल ...,0,hate_bin_train,छान गाठभेटी बदल अभिनंदन फार छान विचार आहेत ताई...
3996,वेड्या मनास आज तू साद उद्याची घाल.. नसेल जरी स...,0,hate_bin_train,वेड्या मनास आज तू साद उद्याची घाल नसेल जरी सोब...
3997,"तो बाईलंपट रामरहीम ,जेलात खडी फोडणारा आसाराम, ...",0,hate_bin_train,तो बाईलंपट रामरहीम जेलात खडी फोडणारा आसाराम नि...
3998,@MChormal @khadaksingh_ त्या छिनाल राऊत च्या ग...,1,hate_bin_train,त्या छिनाल राऊत च्या गांडीत दम आहे का तेवढाबां...


## Custom Dataset Class

- This class can be used for all Sentiment Analysis fine-tuning and zero shot tasks

- For multiple languages, combine data into one dataframe as the dataset class takes DATAFRAME as input

In [51]:
class MultilingualSentimentAnalysis_Dataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every row must provide a ``"Processed_Post"`` column (the text) and a
    ``"Labels Set"`` column (the label).

    Args:
        dataframe: pandas DataFrame holding the two columns above.
        tokenizer: tokenizer object exposing ``encode_plus``.
        max_length: length each sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            (each flattened to 1-D) plus ``labels`` as a float tensor.
        """
        # Fetch the row once instead of running a positional lookup per column.
        row = self.dataframe.iloc[idx]
        label = row["Labels Set"]
        input_text = row["Processed_Post"]

        # Missing posts are read as NaN; tokenize them as an empty string.
        if pd.isna(input_text):
            input_text = ""

        encoding = self.tokenizer.encode_plus(
            input_text, None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            # Ask for segment ids explicitly: some tokenizers omit them by
            # default, which would make the 'token_type_ids' lookup below fail.
            return_token_type_ids=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }


In [52]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier handed to `from_pretrained` below.
model_name = "ai4bharat/indic-bert"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)



In [53]:
# Wrap each DataFrame in the dataset class; the test frame doubles as the dev split here.
train_dataset, dev_dataset = (
    MultilingualSentimentAnalysis_Dataset(frame, tokenizer)
    for frame in (train_df, test_df)
)

# Batch both splits; only the training loader reshuffles its samples.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
dev_dataloader = DataLoader(dataset=dev_dataset, batch_size=64, shuffle=False)

In [54]:
import torch

import torch.nn as nn



class MBertForSentimentAnalysis(nn.Module):
    """Pretrained encoder followed by batch normalisation and a 2-way linear classifier.

    Despite the class name, the default checkpoint is IndicBERT
    (``ai4bharat/indic-bert``), not mBERT.

    Args:
        freeze_bert: when True, ``requires_grad`` is switched off for every
            encoder parameter, so only the batch-norm and linear layers are
            updated during training.
        model_name: checkpoint identifier handed to ``AutoModel.from_pretrained``.
            Defaults to the checkpoint that was previously hard-coded.
    """

    def __init__(self, freeze_bert=False, model_name="ai4bharat/indic-bert"):
        super().__init__()

        self.model_name = model_name

        # Attribute names (`mbert`, `batch_norm`, `classification`) are kept as-is:
        # they are the key prefixes of `state_dict()`, so renaming them would
        # invalidate checkpoints saved from this class.
        self.mbert = AutoModel.from_pretrained(self.model_name)
        hidden_size = self.mbert.config.hidden_size

        # Normalise the pooled representation before classifying it.
        self.batch_norm = nn.BatchNorm1d(hidden_size)

        # Two output logits, one per class.
        self.classification = nn.Linear(hidden_size, 2)

        # Optionally exclude the encoder from gradient updates.
        if freeze_bert:
            for param in self.mbert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            Output of the linear layer applied to the batch-normalised pooled
            encoder output.
        """
        # With return_dict=False the encoder output is unpacked as a 2-tuple;
        # only the second element (the pooled output) is used.
        # NOTE(review): this assumes the encoder returns exactly two values — confirm
        # before swapping in a checkpoint without a pooling head.
        _, pooled_outputs = self.mbert(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=False
        )

        pooled_outputs = self.batch_norm(pooled_outputs)
        return self.classification(pooled_outputs)

- Pass `freeze_bert=True` when running zero-shot experiments so that the BERT encoder weights are not fine-tuned.

In [55]:
# Build the classifier with its encoder frozen, so the encoder is not fine-tuned.
model = MBertForSentimentAnalysis(freeze_bert=True)

# Prefer the GPU when one is visible; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    model.to(device)  # the model only needs moving when CUDA is in use
    print("Using CUDA")
else:
    print("CUDA is not available. Using CPU instead.")

Using CUDA


In [56]:
# Hand the optimizer only the parameters that still require gradients; frozen
# encoder weights never receive one, so listing them serves no purpose.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=2e-5)
# NOTE(review): 2e-5 is passed unchanged although only the head is trainable when the
# encoder is frozen; the logged losses stay close to ln(2) ~= 0.693, so a larger rate
# for the head may be worth trying — confirm experimentally.

# Cross-entropy over the two class logits. `mse_loss` is kept as an alias so any
# code that still refers to the old (misleading) name keeps working.
criterion = torch.nn.CrossEntropyLoss()
mse_loss = criterion

num_epochs = 4

# Per-epoch average losses, kept for plotting.
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    # ---- Training ----
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        inputs = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients first so nothing left over from an interrupted
        # step can leak into this update.
        optimizer.zero_grad()

        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # CrossEntropyLoss expects integer class indices, hence the cast.
        loss = criterion(outputs, inputs['labels'].long())

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- Validation ----
    model.eval()
    val_loss = 0
    with torch.no_grad():  # no gradients are needed for validation
        for batch in dev_dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            loss = criterion(outputs, inputs['labels'].long())
            val_loss += loss.item()

    # Average the summed batch losses over the number of batches.
    avg_train_loss = train_loss / len(train_dataloader)
    avg_val_loss = val_loss / len(dev_dataloader)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

Epoch 1/4, Train Loss: 0.7464, Validation Loss: 0.6947
Epoch 2/4, Train Loss: 0.7085, Validation Loss: 0.7053
Epoch 3/4, Train Loss: 0.6915, Validation Loss: 0.6958
Epoch 4/4, Train Loss: 0.6818, Validation Loss: 0.6865


### Evaluation of Fine-Tuned Model

- Evaluate fine-tuned model on test dataset of each language

In [57]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Languages whose test CSVs are evaluated.
languages = ["Marathi"]

# Switch to inference mode explicitly instead of relying on whichever mode an
# earlier cell left the model in: in training mode BatchNorm1d would normalise
# with per-batch statistics and keep updating its running averages.
model.eval()

with torch.no_grad():
    # NOTE: the loop reuses the notebook-level names `lang`, `test_file` and `test_df`.
    for lang in languages:
        test_file = f"/kaggle/working/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset/{lang}_test.csv"
        test_df = pd.read_csv(test_file)

        # Shuffling is pointless for evaluation; a fixed order keeps runs repeatable.
        test_dataloader = DataLoader(MultilingualSentimentAnalysis_Dataset(test_df, tokenizer), batch_size=16, shuffle=False)

        # Predicted and ground-truth labels, accumulated batch by batch.
        predicted_labels = []
        labels = []

        # Perform inference
        for batch in test_dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            # The predicted class is the index of the larger logit.
            predicted_labels.extend(torch.argmax(outputs, dim=1).tolist())
            labels.extend(inputs['labels'].long().tolist())

        # Print results for a particular language
        print(f"RESULTS FOR {lang}")
        print()

        # Accuracy
        accuracy = accuracy_score(labels, predicted_labels)
        print(f'Accuracy: {accuracy:.4f}')

        # F1-scores (weighted and macro averages)
        weighted_f1_score = f1_score(labels, predicted_labels, average='weighted')
        macro_f1_score = f1_score(labels, predicted_labels, average='macro')
        print(f'Weighted F1-score: {weighted_f1_score:.4f}')
        print(f'Macro F1-score: {macro_f1_score:.4f}')
        print()

        # Per-class precision / recall / F1
        print("Classification Report")
        print(classification_report(labels, predicted_labels))
        print()

        # Confusion matrix
        print("Confusion Matrix")
        print(confusion_matrix(labels, predicted_labels))
        print()
        print("----------*******************---------------")

    

RESULTS FOR Marathi

Accuracy: 0.5710
Weighted F1-score: 0.5710
Macro F1-score: 0.5710

Classification Report
              precision    recall  f1-score   support

           0       0.57      0.58      0.57       500
           1       0.57      0.57      0.57       500

    accuracy                           0.57      1000
   macro avg       0.57      0.57      0.57      1000
weighted avg       0.57      0.57      0.57      1000


Confusion Matrix
[[288 212]
 [217 283]]

----------*******************---------------


In [58]:
# Persist the model weights (state_dict only; the tokenizer is not written by this cell).
model_save_path = "IndicBERT_HS_Mr_ZeroShot.pth"
torch.save(obj=model.state_dict(), f=model_save_path)