- Importing Libraries

In [1]:
import torch
from transformers import BertModel
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
from transformers import BertTokenizer
import string

  from .autonotebook import tqdm as notebook_tqdm


### Load Dataset for the Particular Language
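- Set `lang` in the cell below to the language whose train and test CSVs you want to load. The files are read as `{lang}_train.csv` and `{lang}_test.csv`, and the cells in this notebook use the full language name (e.g. `Bengali`), not the two-letter code.
- Languages (two-letter code listed for reference):
    1. Hindi: `hi`
    2. Bengali: `bn`
    3. Marathi: `mr`
    4. Tamil: `ta`
    5. Telugu: `te`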

In [4]:
# Language whose CSVs are loaded; it is the file-name prefix, e.g. "Bengali_train.csv".
lang = 'Bengali'

# Single place to change the dataset location (previously repeated in both paths).
DATA_DIR = "/home/medha/BTP/Arnav_Medha/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset"
# Number of leading training rows to keep; None keeps every row.
TRAIN_SAMPLE_LIMIT = 2000

train_file = f"{DATA_DIR}/{lang}_train.csv"
test_file = f"{DATA_DIR}/{lang}_test.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# Only the first TRAIN_SAMPLE_LIMIT samples are used for now.
# .iloc makes the slice explicitly positional, independent of the index labels.
train_df = train_df.iloc[:TRAIN_SAMPLE_LIMIT]

In [5]:
# Preview the training frame; the rendered output elides the middle rows.
train_df

Unnamed: 0,Post,Labels Set,Category,Dataset,Processed_Post
0,শুধু সাব্বির বেয়াদপ না পুরো ক্রিকেট টিমটাই বেয়...,1,sports,Bengali hate speech,শুধু সাব্বির বেয়াদপ না পুরো ক্রিকেট টিমটাই বেয়...
1,বাংলার মাটির মানুষ মিজানুর রহমান,0,religion,Bengali hate speech,বাংলার মাটির মানুষ মিজানুর রহমান
2,"টিনের চালে কাওয়া, তুমি আমার শাওয়া",1,celebrity,Bengali hate speech,"টিনের চালে কাওয়া, তুমি আমার শাওয়া"
3,জাজদের কে কে ঘন্টা হিসেবে কানে গলায় থাপরাতে চা...,0,entertainment,Bengali hate speech,জাজদের কে কে ঘন্টা হিসেবে কানে গলায় থাপরাতে চা...
4,বেয়াদব,0,"Meme, TikTok and others",Bengali hate speech,বেয়াদব
...,...,...,...,...,...
1995,এরাত হারামির ছাওয়াল,1,religion,Bengali hate speech,এরাত হারামির ছাওয়াল
1996,এ অভাব হয়ে যায় সালা বাটপার তরে কত বার দুলাই করলো,1,religion,Bengali hate speech,এ অভাব হয়ে যায় সালা বাটপার তরে কত বার দুলাই করলো
1997,দাড়িয়ালা এই মাদারচোদ বিচারকটা কে হালা কোথা থে...,1,entertainment,Bengali hate speech,দাড়িয়ালা এই মাদারচোদ বিচারকটা কে হালা কোথা থে...
1998,।চুপ কর শালার পুতেরা।। এই মাগির বাচ্চা গুলাকে ...,1,entertainment,Bengali hate speech,।চুপ কর শালার পুতেরা।। এই মাগির বাচ্চা গুলাকে ...


## Custom Dataset Class
- This class can be used for all Sentiment Analysis fine-tuning and zero shot tasks
- To use several languages at once, combine their data into a single DataFrame first, because the dataset class takes one DataFrame as input

In [6]:
class MultilingualSentimentAnalysis_Dataset(Dataset):
    """Map-style dataset that tokenizes one dataframe row per requested item.

    The text comes from the ``Processed_Post`` column and the target from the
    ``Labels Set`` column of the row at the requested position.

    Args:
        dataframe: Frame holding the ``Processed_Post`` and ``Labels Set`` columns.
        tokenizer: Object exposing ``encode_plus``; it is called once per item.
        max_length: Length each sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the flattened encoded tensors and the float label for row ``idx``."""
        row = self.dataframe.iloc[idx]
        target = row["Labels Set"]
        text = row["Processed_Post"]
        # A missing post is tokenized as an empty string instead of NaN.
        text = "" if pd.isna(text) else text

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The second positional argument (the text pair) is None: single-sentence input.
        encoded = self.tokenizer.encode_plus(text, None, **tokenizer_options)

        # 'pt' tensors carry a leading batch axis of one; flatten drops it.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(target, dtype=torch.float)
        return item


In [7]:
# Checkpoint name of the pretrained multilingual BERT; the model class defined below loads the same checkpoint.
model_name = "bert-base-multilingual-cased"

# Load the tokenizer for that checkpoint; it is shared by every dataset instance in this notebook.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [8]:
# Wrap both frames in the tokenizing dataset; the held-out frame acts as the dev set here.
train_dataset = MultilingualSentimentAnalysis_Dataset(dataframe=train_df, tokenizer=tokenizer)
dev_dataset = MultilingualSentimentAnalysis_Dataset(dataframe=test_df, tokenizer=tokenizer)

# Batch both datasets; only the training batches are reshuffled each epoch.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
dev_dataloader = DataLoader(dataset=dev_dataset, batch_size=32, shuffle=False)

In [9]:
import torch
import torch.nn as nn

class MBertForSentimentAnalysis(nn.Module):
    """mBERT encoder followed by batch normalisation and a linear classification head.

    Args:
        freeze_bert: If True, every encoder parameter gets ``requires_grad = False``
            so the encoder weights are not updated during training.
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of logits produced by the linear head.

    NOTE(review): ``BatchNorm1d`` uses batch statistics in training mode, so a
    training batch holding a single sample is expected to fail; confirm the
    last training batch is never of size one.
    """

    def __init__(self, freeze_bert=False, model_name="bert-base-multilingual-cased", num_labels=2):
        super().__init__()

        # Load the pretrained encoder; the name is kept for later inspection.
        self.model_name = model_name
        self.mbert = BertModel.from_pretrained(self.model_name)

        hidden_size = self.mbert.config.hidden_size

        # Attribute names are unchanged so previously saved state_dicts still load.
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        self.classification = nn.Linear(hidden_size, num_labels)

        # Optionally freeze the encoder so it is not updated during training.
        if freeze_bert:
            for param in self.mbert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Padding mask forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder; the
                default ``None`` keeps the previous two-argument behaviour.

        Returns:
            Output of the linear head applied to the batch-normalised pooled output.
        """
        # With return_dict=False the encoder returns a tuple; the second entry is the pooled output.
        _, pooled_outputs = self.mbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )

        pooled_outputs = self.batch_norm(pooled_outputs)
        return self.classification(pooled_outputs)

- Pass `freeze_bert=True` when running zero-shot experiments so that the BERT encoder weights are not updated (fine-tuned) during training

In [10]:
# Build the classifier with its default freeze_bert=False, i.e. the encoder stays trainable.
model = MBertForSentimentAnalysis()

# Pick the device once, then branch on the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    model.to(device)  # Move the model to the GPU
    print("Using CUDA")
else:
    print("CUDA is not available. Using CPU instead.")

Using CUDA


In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# Clear gradients left on the model by an earlier, interrupted run of this cell.
optimizer.zero_grad()

# The head emits one logit per class and the targets are class ids, so the loss
# is cross-entropy; `criterion` replaces the misleading name `mse_loss`.
criterion = torch.nn.CrossEntropyLoss()
mse_loss = criterion  # old name kept as an alias for any cell that still uses it

num_epochs = 4

# Per-epoch average losses, kept for plotting.
train_losses = []
val_losses = []


def _batch_loss(batch):
    """Return the cross-entropy loss of `model` on one dataloader batch.

    Only the tensors the model consumes are moved to `device`. Labels are cast
    to integer class ids because the dataset emits them as float tensors.
    """
    logits = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
    )
    return criterion(logits, batch['labels'].to(device).long())


for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        loss = _batch_loss(batch)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():  # No gradients are needed during validation
        for batch in dev_dataloader:
            val_loss += _batch_loss(batch).item()

    # Average of the per-batch mean losses
    avg_train_loss = train_loss / len(train_dataloader)
    avg_val_loss = val_loss / len(dev_dataloader)

    # Append the losses for plotting
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

Epoch 1/4, Train Loss: 0.5179, Validation Loss: 0.6756
Epoch 2/4, Train Loss: 0.3039, Validation Loss: 0.3941
Epoch 3/4, Train Loss: 0.2093, Validation Loss: 0.3965
Epoch 4/4, Train Loss: 0.1246, Validation Loss: 0.4191


### Evaluation of Fine-Tuned Model
- Evaluate fine-tuned model on test dataset of each language

In [12]:
### Evaluate the model on the test CSV of each language
languages = ["Hindi", "Marathi", "Bengali", "Tamil", "Telugu"]

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)

# Switch batch norm (and any dropout inside the encoder) to inference mode
# explicitly, instead of relying on the training cell having left the model
# in eval mode.
model.eval()

with torch.no_grad():
    # Loop names carry an `eval_` prefix so the `lang`, `test_file` and `test_df`
    # set by the data-loading cell are not overwritten by this cell.
    for eval_lang in languages:
        eval_file = f"/home/medha/BTP/Arnav_Medha/CSE556-NLP-Project/Hate-Speech-Detection-Experiments/Dataset/{eval_lang}_test.csv"
        eval_df = pd.read_csv(eval_file)
        # Sample order does not change the metrics, so keep it deterministic.
        test_dataloader = DataLoader(
            MultilingualSentimentAnalysis_Dataset(eval_df, tokenizer),
            batch_size=16,
            shuffle=False,
        )

        # Predicted labels and ground-truth labels, accumulated batch by batch
        predicted_labels = []
        labels = []

        # Perform inference
        for batch in test_dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            # The predicted class is the index of the largest logit.
            predicted_labels.extend(torch.argmax(outputs, dim=1).tolist())
            labels.extend(inputs['labels'].long().tolist())

        # Print results for a particular language
        print(f"RESULTS FOR {eval_lang}")
        print()
        # Calculate accuracy
        accuracy = accuracy_score(labels, predicted_labels)
        print(f'Accuracy: {accuracy:.4f}')

        # Calculate F1-score
        weighted_f1_score = f1_score(labels, predicted_labels, average='weighted')
        macro_f1_score = f1_score(labels, predicted_labels, average='macro')
        print(f'Weighted F1-score: {weighted_f1_score:.4f}')
        print(f'Macro F1-score: {macro_f1_score:.4f}')
        print()

        # Print classification report
        print("Classification Report")
        print(classification_report(labels, predicted_labels))
        print()
        # Print confusion matrix
        print("Confusion Matrix")
        print(confusion_matrix(labels, predicted_labels))
        print()
        print("----------*******************---------------")
    

RESULTS FOR Hindi

Accuracy: 0.5290
Weighted F1-score: 0.4252
Macro F1-score: 0.4252

Classification Report
              precision    recall  f1-score   support

           0       0.52      0.95      0.67       500
           1       0.69      0.10      0.18       500

    accuracy                           0.53      1000
   macro avg       0.60      0.53      0.43      1000
weighted avg       0.60      0.53      0.43      1000


Confusion Matrix
[[477  23]
 [448  52]]

----------*******************---------------
RESULTS FOR Marathi

Accuracy: 0.5640
Weighted F1-score: 0.4829
Macro F1-score: 0.4829

Classification Report
              precision    recall  f1-score   support

           0       0.54      0.96      0.69       500
           1       0.81      0.17      0.28       500

    accuracy                           0.56      1000
   macro avg       0.67      0.56      0.48      1000
weighted avg       0.67      0.56      0.48      1000


Confusion Matrix
[[480  20]
 [416  84]]


In [13]:
# Save the model weights only (its state_dict); this cell does not write the tokenizer.
model_save_path = "MBERT_HS2000_Bn_FineTune.pth"
torch.save(model.state_dict(), model_save_path)

: 