- Importing Libraries

In [93]:
import torch
from transformers import BertModel
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
from transformers import BertTokenizer
import string

### Load Dataset for the Particular Language
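- Set `lang` in the cell below to the language whose train and test CSVs you want to load. The CSV file names use the full language name (e.g. `Telugu_train.csv`), so assign the full name rather than the two-letter code.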
- Language Codes:
    1. Hindi: `hi`
    2. Bengali: `bn`
    3. Marathi: `mr`
    4. Tamil: `ta`
    5. Telugu: `te`

In [94]:
# Language to load; the value is substituted verbatim into the CSV file names.
lang = 'Telugu'
train_file = f"/kaggle/input/hatespeech/Dataset/{lang}_train.csv"
test_file = f"/kaggle/input/hatespeech/Dataset/{lang}_test.csv"

# Read each split and keep only its first 1000 samples for now.
train_df = pd.read_csv(train_file).iloc[:1000]
test_df = pd.read_csv(test_file).iloc[:1000]

In [95]:
# Show the training DataFrame as the cell output (pandas abbreviates long frames).
train_df

Unnamed: 0,transliterated_text,Label,Labels Set,Processed_Post
0,అది సిగ్రేట్ తాగుతుంది అబ్జర్వ్ చేసారా,hate,1,అది సిగ్రేట్ తాగుతుంది అబ్జర్వ్ చేసారా
1,ని న్యూస్ ని ఓవర్ యాక్షన్ కి దండం రా బాబు,non-hate,0,ని న్యూస్ ని ఓవర్ యాక్షన్ కి దండం రా బాబు
2,తమ పార్టీ గుర్తుపై గెలిచిన 23 మందిని గత ప్రభుత...,yes,1,తమ పార్టీ గుర్తుపై గెలిచిన 23 మందిని గత ప్రభుత...
3,ఎదుటి మనిషికి చెప్పేటందుకే నీతులు ఉన్నాయి.,non-hate,0,ఎదుటి మనిషికి చెప్పేటందుకే నీతులు ఉన్నాయి.
4,సూపర్ సార్ మీరు చంద్రబాబు నాయుడుకి కావాల్సింది...,non-hate,0,సూపర్ సార్ మీరు చంద్రబాబు నాయుడుకి కావాల్సింది...
...,...,...,...,...
995,దీంతో షఉటింగ్‌ ప్రపంచకప్‌ టోర్నీలో భారత్‌ ఖాతా...,no,0,దీంతో షఉటింగ్‌ ప్రపంచకప్‌ టోర్నీలో భారత్‌ ఖాతా...
996,జై కుమార్ నెక్స్ట్ నీ అమ్మ నీ సిస్టర్స్ నీ అంద...,hate,1,జై కుమార్ నెక్స్ట్ నీ అమ్మ నీ సిస్టర్స్ నీ అంద...
997,"న్యూఢిల్లీ : మాన్కడింగ్‌, నోబాల్స్‌, ఫీల్డ్‌లో...",yes,1,"న్యూఢిల్లీ : మాన్కడింగ్‌, నోబాల్స్‌, ఫీల్డ్‌లో..."
998,"అలా జరగకపోతే ఒక్కడు, పోకిరి, శ్రీమంతుడు లాంటి ...",yes,1,"అలా జరగకపోతే ఒక్కడు, పోకిరి, శ్రీమంతుడు లాంటి ..."


## Custom Dataset Class
- This class can be used for all sentiment-analysis fine-tuning and zero-shot tasks
- For multiple languages, combine the data into a single DataFrame, since the dataset class takes a DataFrame as input

In [96]:
class MultilingualSentimentAnalysis_Dataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Every row is expected to carry a "Processed_Post" column (the text to
    tokenize) and a "Labels Set" column (the label, returned as a float tensor).
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        """Keep references to the source frame, the tokenizer and the padded length."""
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the tokenized text and label of the row at position ``idx``.

        The result is a dict with the keys 'input_ids', 'attention_mask',
        'token_type_ids' (each flattened to 1-D) and 'labels'.
        """
        row = self.dataframe.iloc[idx]
        label = row["Labels Set"]
        text = row["Processed_Post"]
        # A missing post is tokenized as the empty string.
        text = "" if pd.isna(text) else text

        # Pad / truncate to max_length and ask for PyTorch tensors.
        encoded = self.tokenizer.encode_plus(
            text, None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Flatten each encoded tensor to one dimension.
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.float)
        return sample


In [97]:
# Name of the pretrained checkpoint the tokenizer is loaded from
model_name = "bert-base-multilingual-cased"

# Load the pretrained tokenizer for that checkpoint
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [98]:
# Wrap the train and test frames in the custom dataset class
train_dataset = MultilingualSentimentAnalysis_Dataset(dataframe=train_df, tokenizer=tokenizer)
dev_dataset = MultilingualSentimentAnalysis_Dataset(dataframe=test_df, tokenizer=tokenizer)

# Batches of 32: training batches are reshuffled, dev batches keep row order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
dev_dataloader = DataLoader(dataset=dev_dataset, batch_size=32, shuffle=False)

In [99]:
import torch
import torch.nn as nn

class MBertForSentimentAnalysis(nn.Module):
    """Pretrained mBERT encoder followed by batch normalisation and a 2-output linear layer."""

    def __init__(self, freeze_bert=False):
        """Load the encoder and create the layers stacked on top of it.

        Args:
            freeze_bert: when True, ``requires_grad`` is switched off for every
                parameter of the mBERT encoder, so training does not update them.
        """
        super().__init__()

        # Pretrained multilingual BERT encoder
        self.model_name = "bert-base-multilingual-cased"
        self.mbert = BertModel.from_pretrained(self.model_name)

        # Both added layers take inputs of the encoder's hidden size
        hidden_size = self.mbert.config.hidden_size
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        self.classification = nn.Linear(hidden_size, 2)

        # Optionally exclude the encoder parameters from gradient updates
        if freeze_bert:
            self.mbert.requires_grad_(False)

    def forward(self, input_ids, attention_mask):
        """Run the encoder, then batch-norm and the linear layer on its second output.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            The output of the final linear layer.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used.
        _sequence_output, pooled = self.mbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return self.classification(self.batch_norm(pooled))

- Pass `freeze_bert=True` when doing zero-shot evaluation to keep the BERT encoder from being fine-tuned

In [100]:
# Build the model with its defaults (freeze_bert=False), i.e. the encoder is NOT frozen here
model = MBertForSentimentAnalysis()

# Prefer CUDA when present, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Using CUDA" if device.type == "cuda" else "CUDA is not available. Using CPU instead.")

Using CUDA


In [101]:
# Adam over every model parameter. Despite its name, `mse_loss` is a
# cross-entropy criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
mse_loss = torch.nn.CrossEntropyLoss()

num_epochs = 4

# One averaged loss value per epoch
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch onto the model's device
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # The dataset yields float labels; the criterion is given them as long
        targets = inputs['labels'].long()
        loss = mse_loss(outputs, targets)

        # Back-propagate, update the weights, then clear the gradients
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss = train_loss + loss.item()

    # ---- Validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in dev_dataloader:
            inputs = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            targets = inputs['labels'].long()
            loss = mse_loss(outputs, targets)
            val_loss = val_loss + loss.item()

    # Mean of the per-batch losses for this epoch
    avg_train_loss = train_loss / len(train_dataloader)
    avg_val_loss = val_loss / len(dev_dataloader)

    # Record them for plotting
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

Epoch 1/4, Train Loss: 0.6855, Validation Loss: 0.6342
Epoch 2/4, Train Loss: 0.5529, Validation Loss: 0.5974
Epoch 3/4, Train Loss: 0.4315, Validation Loss: 0.6392
Epoch 4/4, Train Loss: 0.2803, Validation Loss: 0.8407


### Evaluation of Fine-Tuned Model
- Evaluate fine-tuned model on test dataset of each language

In [102]:
### Evaluate the model on the test CSV of each language
languages = ["Hindi", "Marathi", "Bengali", "Tamil", "Telugu"]

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Switch to inference mode explicitly. torch.no_grad() only disables gradient
# tracking; it does not change how BatchNorm / dropout layers behave, so this
# cell must not rely on the mode a previously executed cell left the model in.
model.eval()

with torch.no_grad():
    for lang in languages:
        test_file = f"/kaggle/input/hatespeech/Dataset/{lang}_test.csv"
        test_df = pd.read_csv(test_file)
        # No shuffling for evaluation: predictions then follow the row order of
        # test_df and the run is reproducible.
        test_dataloader = DataLoader(
            MultilingualSentimentAnalysis_Dataset(test_df, tokenizer),
            batch_size=16,
            shuffle=False,
        )

        # Predicted labels and ground-truth labels, collected batch by batch
        predicted_labels = []
        labels = []

        # Perform inference
        for batch in test_dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            # Predicted class = index of the largest of the two outputs
            predicted_labels.extend(torch.argmax(outputs, dim=1).tolist())
            labels.extend(inputs['labels'].long().tolist())


        # Print results for a particular language
        print(f"RESULTS FOR {lang}")
        print()
        # Calculate accuracy
        accuracy = accuracy_score(labels, predicted_labels)
        print(f'Accuracy: {accuracy:.4f}')

        # Calculate F1-score
        weighted_f1_score = f1_score(labels, predicted_labels, average='weighted')
        macro_f1_score = f1_score(labels, predicted_labels, average='macro')
        print(f'Weighted F1-score: {weighted_f1_score:.4f}')
        print(f'Macro F1-score: {macro_f1_score:.4f}')
        print()

        # Print classification report
        print("Classification Report")
        print(classification_report(labels, predicted_labels))
        print()
        # Print confusion matrix
        print("Confusion Matrix")
        print(confusion_matrix(labels, predicted_labels))
        print()
        print("----------*******************---------------")
    

RESULTS FOR Hindi

Accuracy: 0.5790
Weighted F1-score: 0.5451
Macro F1-score: 0.5451

Classification Report
              precision    recall  f1-score   support

           0       0.67      0.31      0.42       500
           1       0.55      0.85      0.67       500

    accuracy                           0.58      1000
   macro avg       0.61      0.58      0.55      1000
weighted avg       0.61      0.58      0.55      1000


Confusion Matrix
[[153 347]
 [ 74 426]]

----------*******************---------------
RESULTS FOR Marathi

Accuracy: 0.6820
Weighted F1-score: 0.6777
Macro F1-score: 0.6777

Classification Report
              precision    recall  f1-score   support

           0       0.74      0.57      0.64       500
           1       0.65      0.80      0.72       500

    accuracy                           0.68      1000
   macro avg       0.69      0.68      0.68      1000
weighted avg       0.69      0.68      0.68      1000


Confusion Matrix
[[283 217]
 [101 399]]


In [103]:
# Write the model's state dict to disk (only the model weights are saved here)
model_save_path = "MBERT_HS1000_Te_FineTune.pth"
torch.save(model.state_dict(), f=model_save_path)