In [None]:
!pip install -q transformers datasets scikit-learn accelerate


In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)

In [None]:
# Colab only: attach Google Drive at /content/drive so that the dataset CSVs
# read from MyDrive in the next cell are reachable as ordinary file paths.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Root folder of the L3Cube-MahaSent-MD corpus on the mounted Drive.
# Change this single constant if the dataset is stored somewhere else.
DATA_DIR = "/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD"

# MahaSent_MR split files (text column: "marathi_sentence").
mr_train_df = pd.read_csv(f"{DATA_DIR}/MahaSent_MR/MahaSent_MR_Train.csv")
mr_val_df = pd.read_csv(f"{DATA_DIR}/MahaSent_MR/MahaSent_MR_Val.csv")
mr_test_df = pd.read_csv(f"{DATA_DIR}/MahaSent_MR/MahaSent_MR_Test.csv")

# MahaSent_PT split files (text column: "tweet").
pt_train_df = pd.read_csv(f"{DATA_DIR}/MahaSent_PT/tweets-train.csv")
pt_val_df = pd.read_csv(f"{DATA_DIR}/MahaSent_PT/tweets-valid.csv")
pt_test_df = pd.read_csv(f"{DATA_DIR}/MahaSent_PT/tweets-test.csv")

In [None]:
# Preview the first rows of both raw training frames. The three-newline
# separator reproduces the blank lines printed between the two tables.
print(mr_train_df.head(), pt_train_df.head(), sep="\n\n\n")

   Unnamed: 0                                   marathi_sentence  label
0           0  माने यांचा घटस्फोट झाला आहे तर मोहितेने नुकतेच...     -1
1           1  एका रात्रीत घडणारी किंबहुना बिघडणारी ही गोष्ट आहे     -1
2           2  जरी आघात समजण्यायोग्य आहे जरी चित्रपटाला खराब ...     -1
3           3  पण तो असा आघातही अनुभवत आहे की तो कोणाशीही शेअ...     -1
4           4               छोटे-छोटे गैरसमज मोठ्या अडचणीत येतात     -1


                                               tweet  label
0  ज्येष्ठ पत्रकार अनंत दीक्षित यांच्या निधनाचे व...     -1
1  सर्वोच्च न्यायालयाचे निर्देश डावलून पुणे पोलिस...     -1
2  उद्धव ठाकरेंनी भाजपासोबत युती करून शिवसैनिकांच...     -1
3  आपला समाज खूप मोठा आहे. त्यात अनेक घटक अंतर्भू...      1
4  बलात्काराचा बदला बलात्काराने घेतला पाहिजे हे म...     -1


In [None]:
# Training split: bring both sources to the same two-column schema
# ("text", "label") by renaming each source's text column and dropping the rest.
mr_train_df = mr_train_df.rename(columns={"marathi_sentence": "text"})[["text", "label"]]
pt_train_df = pt_train_df.rename(columns={"tweet": "text"})[["text", "label"]]

# Stack the MR and PT rows into one frame, then shuffle with a fixed seed so
# the two sources are interleaved the same way on every run.
train_df = pd.concat([mr_train_df, pt_train_df], ignore_index=True).sample(
    frac=1, random_state=42
)

In [None]:
# Same preparation for the validation and test splits: rename the text column,
# keep only ("text", "label") when combining, and shuffle with a fixed seed.

# Validation
mr_val_df = mr_val_df.rename(columns={"marathi_sentence": "text"})
pt_val_df = pt_val_df.rename(columns={"tweet": "text"})
val_df = pd.concat(
    [mr_val_df.loc[:, ["text", "label"]], pt_val_df.loc[:, ["text", "label"]]],
    ignore_index=True,
).sample(frac=1, random_state=42)

# Test
mr_test_df = mr_test_df.rename(columns={"marathi_sentence": "text"})
pt_test_df = pt_test_df.rename(columns={"tweet": "text"})
test_df = pd.concat(
    [mr_test_df.loc[:, ["text", "label"]], pt_test_df.loc[:, ["text", "label"]]],
    ignore_index=True,
).sample(frac=1, random_state=42)


In [None]:
# Convert the original sentiment labels (-1 / 0 / 1) to class IDs (0 / 1 / 2).
#
# The shift is guarded so that re-running this cell is safe: an unconditional
# "+ 1" would move already-converted labels to 1..3 on a second execution,
# which is out of range for a 3-class classification head.

for df in (train_df, val_df, test_df):
    labels = df["label"].astype(int)
    # A split is treated as still being in the raw encoding when it contains
    # at least one negative label (assumes every split has some -1 rows).
    if labels.min() < 0:
        labels = labels + 1  # -1->0, 0->1, 1->2
    # Fail loudly here rather than deep inside the loss computation.
    assert labels.isin([0, 1, 2]).all(), "label values outside the expected 3 classes"
    df["label"] = labels


In [None]:
# Convert pandas dataframes into HuggingFace Dataset objects.
#
# preserve_index=False: after .sample() the frames carry a shuffled, non-range
# index, which from_pandas would otherwise store as an extra
# "__index_level_0__" column in every dataset.

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)


In [None]:
# Tokenize text: pad, truncate, and encode inputs for BERT

def tokenize_fn(batch):
    """Tokenize the "text" field of a batch with the notebook-level `tokenizer`.

    Every sequence is truncated and padded to exactly 128 tokens. The return
    value is whatever the tokenizer call produces for the batch.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)


In [None]:
# Load the Marathi-BERT-v2 tokenizer.
#
# The classification model is built once, in the dedicated model cell further
# down. Loading it here as well only repeated the checkpoint load: that first
# instance was never used and was overwritten before training.

model_name = "l3cube-pune/marathi-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Apply the batched tokenizer to each split, in train / val / test order.
train_encoded, val_encoded, test_encoded = (
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


Map:   0%|          | 0/24114 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

In [None]:
# Seed the RNGs before building the model. The classifier head is newly
# initialised rather than loaded from the checkpoint (see the warning this cell
# prints), so without a seed its starting weights differ on every run, and
# the reported metrics with them.
set_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # three sentiment classes (IDs 0, 1, 2)
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define accuracy, precision, recall, and F1 score metrics

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    `pred` must expose `label_ids` (reference labels) and `predictions`
    (per-class scores); the predicted class is the argmax along axis 1.
    Returns a dict keyed "accuracy", "precision_macro", "recall_macro" and
    "f1_macro", in that order.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    # Options shared by the three macro-averaged scores; zero_division=0 makes
    # a class with no predicted/true samples score 0.
    macro_options = {"average": "macro", "zero_division": 0}

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    for key, scorer in (
        ("precision_macro", precision_score),
        ("recall_macro", recall_score),
        ("f1_macro", f1_score),
    ):
        scores[key] = scorer(y_true, y_pred, **macro_options)
    return scores

In [None]:
# Configure training parameters such as epochs, batch size, and learning rate

training_args = TrainingArguments(
    output_dir="./mr_gt_marathi_bert_v2",
    eval_strategy="epoch",            # evaluate on the validation set after every epoch
    save_strategy="epoch",            # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,      # reload the best checkpoint once training finishes
    metric_for_best_model="accuracy",
    logging_steps=50,
    # Mixed precision only when a CUDA GPU is present; a hard-coded True makes
    # this cell fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [None]:
# Initialize Trainer and begin fine-tuning Marathi-BERT-v2

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning;
    # `processing_class=` is its replacement and takes the same object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the configuration held by `trainer`. The call is left as
# the cell's last expression so the returned TrainOutput is displayed.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.4937,0.465212,0.821667,0.820513,0.821667,0.82052
2,0.4198,0.460249,0.831333,0.830323,0.831333,0.829666
3,0.3368,0.499824,0.829667,0.828813,0.829667,0.828794


TrainOutput(global_step=4524, training_loss=0.4439069094856053, metrics={'train_runtime': 847.7223, 'train_samples_per_second': 85.337, 'train_steps_per_second': 5.337, 'total_flos': 4758537716319744.0, 'train_loss': 0.4439069094856053, 'epoch': 3.0})

In [None]:
# Score the fine-tuned model on the combined test split and print the
# resulting metrics dict.

test_metrics = trainer.evaluate(eval_dataset=test_encoded)
print("Test metrics:", test_metrics)


Test metrics: {'eval_loss': 0.451792448759079, 'eval_accuracy': 0.8370666666666666, 'eval_precision_macro': 0.8375709503775498, 'eval_recall_macro': 0.8370666666666665, 'eval_f1_macro': 0.8355093491891755, 'eval_runtime': 7.5784, 'eval_samples_per_second': 494.827, 'eval_steps_per_second': 31.009, 'epoch': 3.0}
