In [None]:
!pip install -q transformers datasets scikit-learn accelerate


In [None]:
import numpy as np
# Report the NumPy version in use; the install cell does not pin NumPy, so no particular release is guaranteed.
print(f"NumPy version: {np.__version__}")

NumPy version: 2.0.2


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score
import torch
import numpy as np
import pandas as pd

In [None]:
from transformers import set_seed

model_name = "aryanx16/mahabert-v2-daft"

# Any weights missing from the checkpoint (e.g. a fresh classification head) are
# randomly initialized by from_pretrained, so seed the RNGs *before* loading to
# make that initialization, and therefore the whole run, repeatable.
set_seed(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence classifier with 3 output classes (pos/neg/neutral).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aryanx16/mahabert-v2-daft and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Mount Google Drive at /content/drive so the dataset CSVs stored there can be read (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory on the mounted Drive that holds the MahaSent_ST train/val/test CSV files.
DATA_DIR = "/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD/MahaSent_ST"

train_df = pd.read_csv(f"{DATA_DIR}/MahaSent_ST_Train.csv")
val_df = pd.read_csv(f"{DATA_DIR}/MahaSent_ST_Val.csv")
test_df = pd.read_csv(f"{DATA_DIR}/MahaSent_ST_Test.csv")

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0.1,Unnamed: 0,marathi_text,label
0,0,‡§Æ‡•Ä ‡§§‡•ã ‡§Æ‡•Ç‡§∞‡•ç‡§ñ ‡§Ü‡§π‡•á.,-1
1,1,‡§Ü‡§à‡§®‡•á ‡§Ü‡§§‡•ç‡§Æ‡§π‡§§‡•ç‡§Ø‡§æ ‡§ï‡•á‡§≤‡•Ä ‡§Ü‡§£‡§ø ‡§Æ‡§æ‡§ù‡•á ‡§µ‡§°‡•Ä‡§≤ ‡§™‡§≥‡•Ç‡§® ‡§ó‡•á‡§≤‡•á.,-1
2,2,- ‡§§‡•ç‡§Ø‡§æ‡§Ç‡§®‡•Ä ‡§´‡§ï‡•ç‡§§ ‡§ú‡•â‡§Ø‡§≤‡§æ ‡§Æ‡§æ‡§∞‡§≤‡•á?,-1
3,3,‡§Æ‡§≤‡§æ ‡§Æ‡§æ‡§´ ‡§ï‡§∞‡§æ ‡§Æ‡•Ä ‡§Ü‡§ú ‡§§‡§ø‡§•‡•á‡§ö ‡§ï‡•á‡§≤‡•á ‡§®‡§æ‡§π‡•Ä.,-1
4,4,"‡§®‡§æ‡§π‡•Ä, ‡§®‡§ï‡•ç‡§ï‡•Ä‡§ö ‡§®‡§æ‡§π‡•Ä.",-1


In [None]:
def _shift_and_clean(df):
    """Return a cleaned copy of *df* with class ids suitable for the 3-label model.

    Rows missing ``marathi_text`` or ``label`` are dropped, and ``label`` is
    set to the raw label plus one (raw labels are assumed to be -1/0/1, giving
    0/1/2 — TODO confirm against the dataset card).

    The untouched value is kept in a ``label_raw`` column, so re-running this
    cell re-derives ``label`` from the original value instead of shifting an
    already shifted label a second time.

    Raises:
        ValueError: if any resulting label falls outside {0, 1, 2}.
    """
    cleaned = df.dropna(subset=["marathi_text", "label"]).copy()
    if "label_raw" not in cleaned.columns:
        cleaned["label_raw"] = cleaned["label"]
    # Cast to int: a column that ever held NaN is float, and the model needs integer class ids.
    cleaned["label"] = (cleaned["label_raw"] + 1).astype(int)

    unexpected = set(cleaned["label"].unique()) - {0, 1, 2}
    if unexpected:
        raise ValueError(f"Unexpected label values after shifting: {sorted(unexpected)}")
    return cleaned


train_df = _shift_and_clean(train_df)
val_df = _shift_and_clean(val_df)
test_df = _shift_and_clean(test_df)

# Sanity check: every count printed here should be zero.
print(train_df.isnull().sum())

Unnamed: 0      0
marathi_text    0
label           0
dtype: int64


In [None]:
# Step 3: Convert to Hugging Face Dataset format
from datasets import Dataset

# preserve_index=False: rows may have been dropped from the frames, and the
# leftover pandas index would otherwise be carried into each dataset as a
# spurious "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
#  Step 4: Tokenize the Marathi text
def tokenize_function(batch):
    """Tokenize the ``marathi_text`` entries of a batch.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    examples come out with the same length.
    """
    texts = batch["marathi_text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Apply the tokenizer to every split in batched mode (train, then val, then test).
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Format for PyTorch: expose only the model inputs and the label, as torch tensors.
columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type='torch', columns=columns)

In [None]:

# Training configuration consumed by the Trainer.
# (A commented-out duplicate of this configuration with batch size 64 was removed;
#  raise the two batch sizes below if memory allows, falling back to 16 or 32 on OOM.)
training_args = TrainingArguments(
    output_dir="./results",               # where checkpoints are written
    eval_strategy="epoch",                # evaluate at the end of every epoch
    save_strategy="epoch",                # checkpoint at the end of every epoch (same cadence as evaluation)
    per_device_train_batch_size=8,        # small batch to limit memory usage
    per_device_eval_batch_size=8,         # same batch size for evaluation
    num_train_epochs=2,                   # try 1 or 2 for speed
    learning_rate=2e-5,
    warmup_steps=100,                     # linear warmup over the first 100 steps
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,                     # log every 50 steps
    load_best_model_at_end=True,          # reload the best checkpoint once training finishes
    metric_for_best_model="accuracy",     # "best" = highest validation accuracy
    save_total_limit=1,                   # prune old checkpoints; the best one is still retained alongside the latest
    fp16=torch.cuda.is_available(),       # mixed precision needs a GPU; fall back to fp32 on CPU instead of failing
    report_to="none",                     # disable external loggers such as W&B
)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for
    each example is the argmax over the last axis of the logits.

    Returns a dict with a single ``'accuracy'`` entry.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    return {'accuracy': accuracy}

# Build the Trainer from the model, the training configuration, the train and
# validation splits, and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in recent transformers releases and triggers a
    # FutureWarning; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Step 5: Train! Runs fine-tuning with the Trainer built above and returns a TrainOutput summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.594,0.517167,0.81
2,0.5237,0.494119,0.826


TrainOutput(global_step=3000, training_loss=0.5874691721598307, metrics={'train_runtime': 446.2594, 'train_samples_per_second': 53.78, 'train_steps_per_second': 6.723, 'total_flos': 1578680506368000.0, 'train_loss': 0.5874691721598307, 'epoch': 2.0})

In [None]:
# After training: evaluate on the validation split (the Trainer's default eval_dataset).
val_metrics = trainer.evaluate()
print("Validation metrics:", val_metrics)

# Evaluate on the held-out test split. The "test" prefix makes the keys read
# test_loss / test_accuracy so they are not mistaken for validation numbers.
# NOTE(review): the original comment said this is the figure that matches the paper — confirm.
test_metrics = trainer.evaluate(test_dataset, metric_key_prefix="test")
print("Test metrics:", test_metrics)


Validation metrics: {'eval_loss': 0.49411913752555847, 'eval_accuracy': 0.826, 'eval_runtime': 3.8552, 'eval_samples_per_second': 389.085, 'eval_steps_per_second': 48.765, 'epoch': 2.0}
Test metrics: {'eval_loss': 0.5530756711959839, 'eval_accuracy': 0.7933333333333333, 'eval_runtime': 3.5757, 'eval_samples_per_second': 419.5, 'eval_steps_per_second': 52.577, 'epoch': 2.0}
