In [1]:
from transformers import AutoModel,AutoTokenizer,AutoModelForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import  AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from torch import  nn
import torch.nn.functional as F
import re

In [2]:
df = pd.read_csv("PS_train.csv")

le = LabelEncoder()
df['encoded_labels'] = le.fit_transform(df['labels'])
num_classes = len(le.classes_)

In [3]:
df

Unnamed: 0,content,labels,encoded_labels
0,தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் ...,Neutral,1
1,அண்ணன் இதனை சூசகமாக 11 மாதங்கள் முன்பே பேட்டிய...,Substantiated,6
2,ஒரு வருடம் ஆகி விட்டது இந்த துயரம் நேர்ந்து......,Opinionated,3
3,"எடப்பாடியை கண்டுகொள்ளாத ""எடப்பாடி""🫢\n ---\nஆதர...",Positive,4
4,எங்களின் அரசியல் அடுத்த தலைமுறைக்குமானது \n#மக...,Opinionated,3
...,...,...,...
4347,"பள்ளி , கல்லூரி படிப்பை இலவசமாக தாருங்கள் பேரு...",Substantiated,6
4348,இஸ்லாமியர்களின் பாதுகாவலர்கள் திமுகவினரா? உண்ம...,Opinionated,3
4349,டெல்டா மாவட்ட மக்களோடு... மக்களாக 😍\n\n#மக்களி...,Positive,4
4350,நன்றி,Opinionated,3


In [7]:
def remove_hashtags_and_emojis(text):
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove emojis
    emoji_pattern = re.compile(
        "[\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF" 
        "\U0001F680-\U0001F6FF" 
        "\U0001F1E0-\U0001F1FF" 
        "\U00002700-\U000027BF" 
        "\U000024C2-\U0001F251" 
        "\U0001F900-\U0001F9FF" 
        "\U0001FA70-\U0001FAFF" 
        "\U00002500-\U00002BEF" 
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub('', text)
    return text

df['content'] = df['content'].apply(remove_hashtags_and_emojis)

In [8]:
df

Unnamed: 0,content,labels,encoded_labels
0,தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் ...,Neutral,1
1,அண்ணன் இதனை சூசகமாக 11 மாதங்கள் முன்பே பேட்டிய...,Substantiated,6
2,ஒரு வருடம் ஆகி விட்டது இந்த துயரம் நேர்ந்து......,Opinionated,3
3,"எடப்பாடியை கண்டுகொள்ளாத ""எடப்பாடி""\n ---\nஆதரி...",Positive,4
4,எங்களின் அரசியல் அடுத்த தலைமுறைக்குமானது \n,Opinionated,3
...,...,...,...
4347,"பள்ளி , கல்லூரி படிப்பை இலவசமாக தாருங்கள் பேரு...",Substantiated,6
4348,இஸ்லாமியர்களின் பாதுகாவலர்கள் திமுகவினரா? உண்ம...,Opinionated,3
4349,டெல்டா மாவட்ட மக்களோடு... மக்களாக \n\n \n,Positive,4
4350,நன்றி,Opinionated,3


In [9]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1.0, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        p_t = torch.exp(-ce_loss)  # Probabilities for the correct class
        focal_loss = self.alpha * (1 - p_t) ** self.gamma * ce_loss
        return focal_loss.mean()

In [10]:
class TamilTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def compute_metrics(predictions, labels):
    # Convert predictions and labels to PyTorch tensors if they aren't already
    if not isinstance(predictions, torch.Tensor):
        predictions = torch.tensor(predictions)
    if not isinstance(labels, torch.Tensor):
        labels = torch.tensor(labels)
    
    correct = (predictions == labels).sum().item()
    total = len(labels)
    accuracy = correct / total
    return accuracy


In [11]:
class TamilTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [13]:
model = AutoModelForSequenceClassification.from_pretrained("Vidharshana/tamil-bert4MLM",cache_dir='models/tamil-bert4MLM',num_labels=num_classes).to('cuda')

tokenizer = AutoTokenizer.from_pretrained('vishnun/bert-base-cased-tamil-mix-sentiment',cache_dir='models/bert-base-cased-tamil-mix-sentiment')

text = "தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் டாக்டர்"
tokens = tokenizer(text, return_tensors="pt").to('cuda')
print(tokens)

output = model(**tokens)
print(output)
print(type(output))
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Vidharshana/tamil-bert4MLM and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

{'input_ids': tensor([[  101,   100,   100,   673, 28584, 28575, 28583, 28578,   100,   100,
           100,   100,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [10]:
def train_model(model, train_loader, val_loader, device, num_epochs):
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    focal_loss = FocalLoss(alpha=1.0, gamma=2.0).to(device)

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch + 1}/{num_epochs}')

        # Training
        model.train()
        total_train_loss = 0
        train_predictions = []
        train_true_labels = []

        for batch in tqdm(train_loader, desc='Training'):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=None
            )

            loss = focal_loss(outputs.logits, labels)
            total_train_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=1)
            train_predictions.extend(predictions.cpu())
            train_true_labels.extend(labels.cpu())

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = compute_metrics(
            torch.stack(train_predictions),
            torch.stack(train_true_labels)
        )

        # Validation
        model.eval()
        total_val_loss = 0
        val_predictions = []
        val_true_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=None
                )

                loss = focal_loss(outputs.logits, labels)
                total_val_loss += loss.item()

                predictions = torch.argmax(outputs.logits, dim=1)
                val_predictions.extend(predictions.cpu())
                val_true_labels.extend(labels.cpu())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = compute_metrics(
            torch.stack(val_predictions),
            torch.stack(val_true_labels)
        )

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'bestmodels/indic_bert_nohashtags_best_model.pt')

        print(f'Average training loss: {avg_train_loss:.4f}, Training accuracy: {train_accuracy:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}, Validation accuracy: {val_accuracy:.4f}')


In [11]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['encoded_labels'])

train_dataset = TamilTextDataset(
        texts=train_df['content'].values,
        labels=train_df['encoded_labels'].values,
        tokenizer=tokenizer
    )
    
val_dataset = TamilTextDataset(
    texts=val_df['content'].values,
    labels=val_df['encoded_labels'].values,
    tokenizer=tokenizer
)

# Create dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0
)
val_loader = DataLoader(
        val_dataset,
        batch_size=8,
        shuffle=False,
        num_workers=0  # Changed from 2 to 0 for better compatibility
    )
    
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features

In [12]:
print(tokenizer.decode(train_dataset[81]['input_ids']))

[CLS] அத எனன, உன உதயநதய வசச சயயம பணகள.. <unk> [SEP]<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [15]:
train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device,
        num_epochs=30
    )




Epoch 1/30


Training: 100%|██████████| 436/436 [02:19<00:00,  3.14it/s]
Validation: 100%|██████████| 109/109 [00:12<00:00,  9.08it/s]


Average training loss: 1.2468, Training accuracy: 0.3082
Average validation loss: 1.1914, Validation accuracy: 0.3352

Epoch 2/30


Training: 100%|██████████| 436/436 [02:30<00:00,  2.89it/s]
Validation: 100%|██████████| 109/109 [00:12<00:00,  9.07it/s]


Average training loss: 1.1619, Training accuracy: 0.3390
Average validation loss: 1.1787, Validation accuracy: 0.3318

Epoch 3/30


Training: 100%|██████████| 436/436 [02:30<00:00,  2.90it/s]
Validation: 100%|██████████| 109/109 [00:12<00:00,  8.64it/s]


Average training loss: 1.1337, Training accuracy: 0.3444
Average validation loss: 1.1339, Validation accuracy: 0.3467

Epoch 4/30


Training: 100%|██████████| 436/436 [02:32<00:00,  2.87it/s]
Validation: 100%|██████████| 109/109 [00:13<00:00,  8.37it/s]


Average training loss: 1.0880, Training accuracy: 0.3634
Average validation loss: 1.1121, Validation accuracy: 0.3467

Epoch 5/30


Training: 100%|██████████| 436/436 [02:28<00:00,  2.93it/s]
Validation: 100%|██████████| 109/109 [00:12<00:00,  9.06it/s]


Average training loss: 1.0053, Training accuracy: 0.4036
Average validation loss: 1.1343, Validation accuracy: 0.3410

Epoch 6/30


Training: 100%|██████████| 436/436 [02:28<00:00,  2.94it/s]
Validation: 100%|██████████| 109/109 [00:12<00:00,  8.56it/s]


Average training loss: 0.8713, Training accuracy: 0.4766
Average validation loss: 1.2188, Validation accuracy: 0.3490

Epoch 7/30


Training: 100%|██████████| 436/436 [02:31<00:00,  2.88it/s]
Validation: 100%|██████████| 109/109 [00:12<00:00,  8.59it/s]


Average training loss: 0.7199, Training accuracy: 0.5668
Average validation loss: 1.3151, Validation accuracy: 0.3042

Epoch 8/30


Training:  53%|█████▎    | 233/436 [01:19<01:09,  2.93it/s]


KeyboardInterrupt: 

In [10]:
df

Unnamed: 0,content,labels,encoded_labels
0,தென்காசி தொகுதி புதிய தமிழகம் கட்சி வேட்பாளர் ...,Neutral,1
1,அண்ணன் இதனை சூசகமாக 11 மாதங்கள் முன்பே பேட்டிய...,Substantiated,6
2,ஒரு வருடம் ஆகி விட்டது இந்த துயரம் நேர்ந்து......,Opinionated,3
3,"எடப்பாடியை கண்டுகொள்ளாத ""எடப்பாடி""🫢\n ---\nஆதர...",Positive,4
4,எங்களின் அரசியல் அடுத்த தலைமுறைக்குமானது \n#மக...,Opinionated,3
...,...,...,...
4347,"பள்ளி , கல்லூரி படிப்பை இலவசமாக தாருங்கள் பேரு...",Substantiated,6
4348,இஸ்லாமியர்களின் பாதுகாவலர்கள் திமுகவினரா? உண்ம...,Opinionated,3
4349,டெல்டா மாவட்ட மக்களோடு... மக்களாக 😍\n\n#மக்களி...,Positive,4
4350,நன்றி,Opinionated,3
