In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset

In [3]:
dataset = load_dataset("ai4bharat/IndicSentiment", name="translation-bn")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/980 [00:00<?, ?B/s]

IndicSentiment.py:   0%|          | 0.00/2.51k [00:00<?, ?B/s]

The repository for ai4bharat/IndicSentiment contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ai4bharat/IndicSentiment.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


bn.json:   0%|          | 0.00/178k [00:00<?, ?B/s]

bn.json:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [1]:
dataset['test']['INDIC REVIEW'][0]

NameError: name 'dataset' is not defined

In [None]:
training_texts=dataset['test']['INDIC REVIEW']
validation_texts=dataset['validation']['INDIC REVIEW']
training_labels=dataset['test']['LABEL']
validation_labels=dataset['validation']['LABEL']

In [None]:
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
class SentimentDataset(Dataset):
  def __init__(self,texts,labels,tokenizer):
    super(SentimentDataset,self).__init__()
    self.encodings=tokenizer(texts,padding=True,truncation=True,return_tensors='pt')
    self.labels=torch.tensor([1 if label=='positive' else 0 for label in labels])
  def __getitem__(self,index):
    item={key:value[index] for key,value in self.encodings.items()}
    item['labels']=self.labels[index]
    return item
  def __len__(self):
    return len(self.labels)

In [None]:
training_data=SentimentDataset(training_texts,training_labels,tokenizer)
validation_data=SentimentDataset(validation_texts,validation_labels,tokenizer)
train_loader=DataLoader(training_data,batch_size=4,shuffle=True)
val_loader=DataLoader(validation_data,batch_size=4,shuffle=False)

**CustomModel by adding a Dense layer over BERT output**

In [None]:
import torch.nn as nn
from transformers import BertModel

class CustomBertModel(nn.Module):
    def __init__(self, hidden_size=768, num_labels=2):
        super(CustomBertModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased',num_labels=num_labels)
        self.classification = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_labels)
        )
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        logits = self.classification(cls_output)

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {'loss': loss, 'logits': logits}
        else:
            return {'logits': logits}


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=CustomBertModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [None]:
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, labels)
        loss = outputs['loss']
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} - Training Loss: {total_loss / len(train_loader):.4f}")

Epoch 1 - Training Loss: 0.0193
Epoch 2 - Training Loss: 0.0005
Epoch 3 - Training Loss: 0.0002


In [None]:
from sklearn.metrics import accuracy_score
# Evaluation
model.eval()
all_preds = []
all_labels = []
total_loss=0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        total_loss += loss.item()
        logits = outputs['logits']
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f},Validation Loss:{total_loss/len(val_loader)}")


Validation Accuracy: 1.0000,Validation Loss:6.86484371470168e-05


**CustomModel by adding LSTM layer over BERT output**

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertWithLSTM(nn.Module):
    def __init__(self, hidden_size=768, lstm_hidden=256, num_labels=3):
        super(BertWithLSTM, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.lstm = nn.LSTM(input_size=hidden_size,hidden_size=lstm_hidden,num_layers=1,bidirectional=True,batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(lstm_hidden * 2, num_labels)  # *2 for bidirectional
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state  # shape: (batch_size, seq_len, hidden_size)

        # LSTM
        lstm_output, _ = self.lstm(sequence_output)  # shape: (batch_size, seq_len, lstm_hidden*2)

        # Use the hidden state from the first token or pool
        pooled_output = torch.mean(lstm_output,dim=1)  # Use mean LSTM output

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}


In [None]:
model1=BertWithLSTM().to(device)
optimizer=torch.optim.AdamW(model1.parameters(),lr=2e-5)

In [None]:
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model1(input_ids, attention_mask, labels)
        loss = outputs['loss']
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} - Training Loss: {total_loss / len(train_loader):.4f}")

Epoch 1 - Training Loss: 0.0342
Epoch 2 - Training Loss: 0.0004
Epoch 3 - Training Loss: 0.0002


In [None]:
from sklearn.metrics import accuracy_score
# Evaluation
model.eval()
all_preds = []
all_labels = []
total_loss=0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model1(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        total_loss += loss.item()
        logits = outputs['logits']
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f},Validation Loss:{total_loss/len(val_loader)}")


Validation Accuracy: 1.0000,Validation Loss:0.00016231458385636171


**Customizing BERT Parameters**

In [None]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from transformers import BertConfig, BertForSequenceClassification

# Step 1: Define custom BERT configuration
custom_config = BertConfig(
    hidden_size=512,               # default is 768
    num_attention_heads=8,         # default is 12
    num_hidden_layers=6,           # default is 12
    intermediate_size=1024,        # default is 3072 (feedforward layer)
    num_labels=2,
)

# Step 2: Initialize model from scratch with this config
model2 = BertForSequenceClassification(config=custom_config).to(device)
optimizer=torch.optim.AdamW(model2.parameters(),lr=2e-5)

In [None]:
# Training loop
model2.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        batch['input_ids'] = batch['input_ids'].clamp(0, model2.config.vocab_size - 1)
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model2(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} - Training Loss: {total_loss / len(train_loader):.4f}")

Epoch 1 - Training Loss: 0.0159
Epoch 2 - Training Loss: 0.0008
Epoch 3 - Training Loss: 0.0004
