In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Mount Google Drive; the dataset CSVs read later live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:

# Root of the L3Cube-MahaSent-MD data on the mounted Drive. Every split below is
# resolved relative to it, so pointing the notebook at another copy of the data
# means changing this one constant.
MAHASENT_ROOT = "/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD"

# MahaSent_MR splits (text column: "marathi_sentence").
mr_train_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_MR/MahaSent_MR_Train.csv")
mr_val_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_MR/MahaSent_MR_Val.csv")
mr_test_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_MR/MahaSent_MR_Test.csv")

# MahaSent_PT splits (text column: "tweet").
pt_train_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_PT/tweets-train.csv")
pt_val_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_PT/tweets-valid.csv")
pt_test_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_PT/tweets-test.csv")

# MahaSent_ST splits (text column: "marathi_text").
st_train_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_ST/MahaSent_ST_Train.csv")
st_val_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_ST/MahaSent_ST_Val.csv")
st_test_df = pd.read_csv(f"{MAHASENT_ROOT}/MahaSent_ST/MahaSent_ST_Test.csv")

In [None]:
# Print the first rows of each raw training split. Headings after the first
# start with a newline so the three previews are separated by a blank line.
split_previews = [
    ("MR Train:", mr_train_df),
    ("\nPT Train:", pt_train_df),
    ("\nST Train:", st_train_df),
]
for heading, frame in split_previews:
    print(heading)
    print(frame.head())

MR Train:
   Unnamed: 0                                   marathi_sentence  label
0           0  माने यांचा घटस्फोट झाला आहे तर मोहितेने नुकतेच...     -1
1           1  एका रात्रीत घडणारी किंबहुना बिघडणारी ही गोष्ट आहे     -1
2           2  जरी आघात समजण्यायोग्य आहे जरी चित्रपटाला खराब ...     -1
3           3  पण तो असा आघातही अनुभवत आहे की तो कोणाशीही शेअ...     -1
4           4               छोटे-छोटे गैरसमज मोठ्या अडचणीत येतात     -1

PT Train:
                                               tweet  label
0  ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे व...     -1
1  सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोलिस...     -1
2  उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिकांच...     -1
3  आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्भू...      1
4  बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...     -1

ST Train:
   Unnamed: 0                                  marathi_text  label
0           0                              मी तो मूर्ख आहे.     -1
1           1  आईने आत्महत्या केली आणि माझ

In [None]:
# Bring every training split to the same two-column schema ("text", "label"):
# rename the split-specific text column, then drop everything else.
mr_train_df = mr_train_df.rename(columns={"marathi_sentence": "text"})[["text", "label"]]
st_train_df = st_train_df.rename(columns={"marathi_text": "text"})[["text", "label"]]
pt_train_df = pt_train_df.rename(columns={"tweet": "text"})[["text", "label"]]

In [None]:
# Combined training set: MR + PT + ST, stacked in that order and then shuffled
# with a fixed random_state so the row order is repeatable.
train_parts = [mr_train_df, pt_train_df, st_train_df]
train_df = pd.concat(train_parts, ignore_index=True).sample(frac=1, random_state=42)

In [None]:
# Combined validation set (MR + PT + ST).
# The per-split frames are rebound to their renamed versions so that each one
# exposes its text under the common "text" column.
mr_val_df = mr_val_df.rename(columns={"marathi_sentence": "text"})
st_val_df = st_val_df.rename(columns={"marathi_text": "text"})
pt_val_df = pt_val_df.rename(columns={"tweet": "text"})

# Stack only the shared columns, then shuffle with a fixed random_state.
val_columns = ["text", "label"]
val_df = pd.concat(
    [frame[val_columns] for frame in (mr_val_df, pt_val_df, st_val_df)],
    ignore_index=True,
).sample(frac=1, random_state=42)

In [None]:
# Combined test set (MR + PT + ST).
# The per-split test frames are rebound to their renamed versions; the
# per-dataset evaluation cells further down read their "text" column.
mr_test_df = mr_test_df.rename(columns={"marathi_sentence": "text"})
st_test_df = st_test_df.rename(columns={"marathi_text": "text"})
pt_test_df = pt_test_df.rename(columns={"tweet": "text"})

# Stack only the shared columns, then shuffle with a fixed random_state.
test_columns = ["text", "label"]
test_df = pd.concat(
    [frame[test_columns] for frame in (mr_test_df, pt_test_df, st_test_df)],
    ignore_index=True,
).sample(frac=1, random_state=42)

In [None]:
# CLEAN LABELS + TEXT
# Shifts the raw labels into the 0..2 range expected by a 3-class head and makes
# sure every text entry is a string. The three frames are modified in place.
for df in [train_df, val_df, test_df]:
    raw_labels = df["label"].astype(int)
    # Guard against running this cell twice: a second run would see 0..2 and
    # silently shift the labels to 1..3, which is out of range for num_labels=3.
    if not raw_labels.isin([-1, 0, 1]).all():
        raise ValueError(
            "Expected raw labels in {-1, 0, 1}; the label shift may already "
            "have been applied to this frame."
        )
    df["label"] = raw_labels + 1   # -1→0, 0→1, 1→2
    # Fill missing text BEFORE casting: astype(str) turns NaN into the literal
    # string "nan", after which fillna("") has nothing left to replace.
    df["text"] = df["text"].fillna("").astype(str)

In [None]:
# Convert to HF Datasets
# The frames were shuffled with DataFrame.sample, so their index is no longer a
# plain 0..n-1 range. preserve_index=False keeps that leftover index from being
# stored as an extra "__index_level_0__" column in each dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Hub checkpoint id; the tokenizer is loaded from it here.
model_name = "l3cube-pune/marathi-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def tokenize_fn(batch):
    """Tokenize the ``"text"`` entries of a batch with the notebook's tokenizer.

    Every entry is passed through ``str()`` first. Truncation is enabled and
    padding is set to ``"max_length"`` with ``max_length=128``.

    Args:
        batch: Mapping whose ``"text"`` key holds the entries to tokenize.

    Returns:
        The object returned by calling ``tokenizer`` on the batch.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(list(map(str, batch["text"])), **encode_options)

# Run the same batched tokenization over the train, validation and test
# datasets, in that order.
train_encoded, val_encoded, test_encoded = (
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/36114 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5250 [00:00<?, ? examples/s]

In [None]:
# Seed the RNGs before the model is built. The checkpoint ships without a
# classification head (see the "newly initialized" warning printed on load), so
# those weights are drawn randomly here; without a seed each run would start
# from a different head. 42 matches the seed passed to TrainingArguments.
set_seed(42)

# Three output classes: the shifted labels 0, 1 and 2.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the argmax over axis 1 is taken
            as the predicted class).

    Returns:
        Dict with keys ``accuracy``, ``precision_macro``, ``recall_macro`` and
        ``f1_macro``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    # Shared settings for the macro-averaged scores; zero_division=0 is passed
    # through to each of the three scorers.
    macro = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision_score(y_true, y_pred, **macro),
        "recall_macro": recall_score(y_true, y_pred, **macro),
        "f1_macro": f1_score(y_true, y_pred, **macro),
    }

In [None]:
# Training configuration.
# An earlier variant (output_dir "./mr_gt_st_marathi_bert_v2", batch size 16,
# 3 epochs, no explicit seed) was replaced by the settings below.
training_args = TrainingArguments(
    # --- where things are written ---
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",                     # no external experiment tracker
    logging_steps=50,

    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=True,                            # mixed precision
    # No warmup_steps is passed (a warmup_steps=100 setting is disabled).

    # --- evaluation and checkpointing (both once per epoch) ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                   # cap on checkpoints kept on disk
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",     # key produced by compute_metrics

    # --- reproducibility ---
    seed=42,
    data_seed=42,
)

In [None]:
import inspect

# Trainer's `tokenizer` argument has been renamed to `processing_class` in newer
# transformers releases, and the old name triggers a deprecation warning there.
# Pick whichever keyword the installed Trainer accepts so this cell works on
# both older and newer versions.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Wire the model, training configuration, encoded train/validation sets and the
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

  trainer = Trainer(


In [None]:
# Fine-tune the model; the returned training summary is shown as the cell result.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.6206,0.567858,0.813333,0.814053,0.813333,0.813616
2,0.4955,0.50379,0.817333,0.816264,0.817333,0.816144


TrainOutput(global_step=1130, training_loss=0.6471414127181062, metrics={'train_runtime': 495.1725, 'train_samples_per_second': 145.864, 'train_steps_per_second': 2.282, 'total_flos': 4751038983914496.0, 'train_loss': 0.6471414127181062, 'epoch': 2.0})

In [None]:
# Evaluate the trainer's model on the combined (MR + PT + ST) test set.
test_metrics = trainer.evaluate(eval_dataset=test_encoded)
print(f"Combined MR + PT + ST Test metrics: {test_metrics}")

Combined MR + PT + ST Test metrics: {'eval_loss': 0.5116568207740784, 'eval_accuracy': 0.812952380952381, 'eval_precision_macro': 0.8118579041064122, 'eval_recall_macro': 0.8129523809523809, 'eval_f1_macro': 0.8116953271410098, 'eval_runtime': 13.4308, 'eval_samples_per_second': 390.893, 'eval_steps_per_second': 6.18, 'epoch': 2.0}


In [None]:
# ----- MR -----
# Evaluate on the MahaSent_MR test split alone. Relies on the test-merge cell
# having already renamed this frame's text column to "text".
mr_test_df_processed = mr_test_df.copy()
mr_test_df_processed['label'] = mr_test_df_processed['label'].astype(int) + 1   # -1/0/1 → 0/1/2
# Fill missing text BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which fillna('') has nothing left to replace.
mr_test_df_processed['text'] = mr_test_df_processed['text'].fillna('').astype(str)
mr_test_dataset = Dataset.from_pandas(mr_test_df_processed)
mr_test_encoded = mr_test_dataset.map(tokenize_fn, batched=True)

mr_test_metrics = trainer.evaluate(mr_test_encoded)
print("MahaSent_MR Test metrics:", mr_test_metrics)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_MR Test metrics: {'eval_loss': 0.5501018762588501, 'eval_accuracy': 0.7993333333333333, 'eval_precision_macro': 0.7983201926305247, 'eval_recall_macro': 0.7993333333333333, 'eval_f1_macro': 0.7975985198546724, 'eval_runtime': 3.9001, 'eval_samples_per_second': 384.609, 'eval_steps_per_second': 6.154, 'epoch': 2.0}


In [None]:
# ----- ST -----
# Evaluate on the MahaSent_ST test split alone. Relies on the test-merge cell
# having already renamed this frame's text column to "text".
st_test_df_processed = st_test_df.copy()
st_test_df_processed['label'] = st_test_df_processed['label'].astype(int) + 1   # -1/0/1 → 0/1/2
# Fill missing text BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which fillna('') has nothing left to replace.
st_test_df_processed['text'] = st_test_df_processed['text'].fillna('').astype(str)
st_test_dataset = Dataset.from_pandas(st_test_df_processed)
st_test_encoded = st_test_dataset.map(tokenize_fn, batched=True)

st_test_metrics = trainer.evaluate(st_test_encoded)
print("MahaSent_ST Test metrics:", st_test_metrics)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_ST Test metrics: {'eval_loss': 0.5616782903671265, 'eval_accuracy': 0.78, 'eval_precision_macro': 0.7780777870400574, 'eval_recall_macro': 0.7799999999999999, 'eval_f1_macro': 0.7781939754059909, 'eval_runtime': 3.3465, 'eval_samples_per_second': 448.226, 'eval_steps_per_second': 7.172, 'epoch': 2.0}


In [None]:
# ----- PT -----
# Evaluate on the MahaSent_PT test split alone. Relies on the test-merge cell
# having already renamed this frame's text column to "text".
# NOTE(review): the saved output under this cell is headed "MahaSent_GT" while
# the code prints "MahaSent_PT" — the stored output looks stale; re-run to refresh.
pt_test_df_processed = pt_test_df.copy()
pt_test_df_processed['label'] = pt_test_df_processed['label'].astype(int) + 1   # -1/0/1 → 0/1/2
# Fill missing text BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which fillna('') has nothing left to replace.
pt_test_df_processed['text'] = pt_test_df_processed['text'].fillna('').astype(str)
pt_test_dataset = Dataset.from_pandas(pt_test_df_processed)
pt_test_encoded = pt_test_dataset.map(tokenize_fn, batched=True)

pt_test_metrics = trainer.evaluate(pt_test_encoded)
print("MahaSent_PT Test metrics:", pt_test_metrics)

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

MahaSent_GT Test metrics: {'eval_loss': 0.452679842710495, 'eval_accuracy': 0.844, 'eval_precision_macro': 0.8441497559795949, 'eval_recall_macro': 0.844, 'eval_f1_macro': 0.8435038045784022, 'eval_runtime': 4.8786, 'eval_samples_per_second': 461.195, 'eval_steps_per_second': 7.379, 'epoch': 2.0}


In [None]:
# `files` is not imported by the import cell at the top of the notebook, so
# import it here; otherwise this cell raises NameError on a fresh kernel.
from google.colab import files

# Opens Colab's upload dialog and keeps the result in `uploaded`.
uploaded = files.upload()

Saving tweets-test_PT.csv to tweets-test_PT.csv


In [None]:
# ----- GT -----
# Load the MahaSent_GT splits and evaluate the model on the GT test split. No GT
# data is part of the combined training set built earlier in this notebook.
# NOTE(review): the saved output under this cell is headed "MahaSent_PT" while
# the code prints "MahaSent_GT" — the stored output looks stale; re-run to refresh.
# pt_test_df = pd.read_csv("tweets-test_PT.csv")
gt_train_df = pd.read_csv("/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD/MahaSent_GT/tweets-train.csv")
gt_val_df = pd.read_csv("/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD/MahaSent_GT/tweets-valid.csv")
gt_test_df = pd.read_csv('/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD/MahaSent_GT/tweets-test.csv')

# Same preparation as the other test splits: common "text" column, labels
# shifted from -1/0/1 to 0/1/2, and text guaranteed to be a string.
gt_test_df_processed = gt_test_df.copy()
gt_test_df_processed = gt_test_df_processed.rename(columns={"tweet": "text"})
gt_test_df_processed["label"] = gt_test_df_processed["label"].astype(int) + 1
# Fill missing text BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which fillna("") has nothing left to replace.
gt_test_df_processed["text"] = gt_test_df_processed["text"].fillna("").astype(str)

gt_test_dataset = Dataset.from_pandas(gt_test_df_processed)
gt_test_encoded = gt_test_dataset.map(tokenize_fn, batched=True)

gt_test_metrics = trainer.evaluate(gt_test_encoded)
print("MahaSent_GT Test metrics:", gt_test_metrics)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_PT Test metrics: {'eval_loss': 0.6398805379867554, 'eval_accuracy': 0.7446666666666667, 'eval_precision_macro': 0.7496973381045183, 'eval_recall_macro': 0.7446666666666667, 'eval_f1_macro': 0.7466731772664028, 'eval_runtime': 3.119, 'eval_samples_per_second': 480.922, 'eval_steps_per_second': 7.695, 'epoch': 2.0}
