In [None]:
!pip install -q transformers datasets scikit-learn accelerate


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Mount Google Drive so the CSV files under /content/drive are readable
# from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Root folder of the L3Cube-MahaSent-MD corpus on the mounted Drive.
# Kept in one place so the notebook can be pointed at another copy of the data
# by editing a single line.
DATA_DIR = "/content/drive/MyDrive/L3Cube/L3Cube-MahaSent-MD"

# MahaSent_MR train / validation / test splits.
mr_train_df = pd.read_csv(f"{DATA_DIR}/MahaSent_MR/MahaSent_MR_Train.csv")
mr_val_df = pd.read_csv(f"{DATA_DIR}/MahaSent_MR/MahaSent_MR_Val.csv")
mr_test_df = pd.read_csv(f"{DATA_DIR}/MahaSent_MR/MahaSent_MR_Test.csv")

# MahaSent_ST train / validation / test splits.
st_train_df = pd.read_csv(f"{DATA_DIR}/MahaSent_ST/MahaSent_ST_Train.csv")
st_val_df = pd.read_csv(f"{DATA_DIR}/MahaSent_ST/MahaSent_ST_Val.csv")
st_test_df = pd.read_csv(f"{DATA_DIR}/MahaSent_ST/MahaSent_ST_Test.csv")

In [None]:
# Peek at the first rows of both raw training frames; the two sources use
# different names for the text column (see the printed headers).
print(mr_train_df.head(), "\n", st_train_df.head(), sep="\n")

   Unnamed: 0                                   marathi_sentence  label
0           0  माने यांचा घटस्फोट झाला आहे तर मोहितेने नुकतेच...     -1
1           1  एका रात्रीत घडणारी किंबहुना बिघडणारी ही गोष्ट आहे     -1
2           2  जरी आघात समजण्यायोग्य आहे जरी चित्रपटाला खराब ...     -1
3           3  पण तो असा आघातही अनुभवत आहे की तो कोणाशीही शेअ...     -1
4           4               छोटे-छोटे गैरसमज मोठ्या अडचणीत येतात     -1


   Unnamed: 0                                  marathi_text  label
0           0                              मी तो मूर्ख आहे.     -1
1           1  आईने आत्महत्या केली आणि माझे वडील पळून गेले.     -1
2           2                   - त्यांनी फक्त जॉयला मारले?     -1
3           3            मला माफ करा मी आज तिथेच केले नाही.     -1
4           4                            नाही, नक्कीच नाही.     -1


In [None]:
# Training split: give both sources the same schema, then merge and shuffle.

# Each source names its text column differently; rename it to 'text' and keep
# only the two columns used downstream.
mr_train_df = mr_train_df.rename(columns={"marathi_sentence": "text"})[["text", "label"]]
st_train_df = st_train_df.rename(columns={"marathi_text": "text"})[["text", "label"]]

# Stack the MR and ST training rows into one frame, then shuffle with a fixed
# seed so the two sources are interleaved instead of appearing back to back.
train_df = pd.concat(
    [mr_train_df, st_train_df],
    ignore_index=True,
).sample(frac=1, random_state=42)

In [None]:
# Validation and test splits: same preparation as the training split.

# Rename the source-specific text column to 'text'. The renamed per-source
# frames keep all of their columns and stay bound to their original names.
mr_val_df, mr_test_df = (
    frame.rename(columns={"marathi_sentence": "text"})
    for frame in (mr_val_df, mr_test_df)
)
st_val_df, st_test_df = (
    frame.rename(columns={"marathi_text": "text"})
    for frame in (st_val_df, st_test_df)
)

# Validation: merge MR + ST on the shared columns, then shuffle with a fixed seed.
val_df = pd.concat(
    [mr_val_df[["text", "label"]], st_val_df[["text", "label"]]],
    ignore_index=True,
).sample(frac=1, random_state=42)

# Test: identical treatment.
test_df = pd.concat(
    [mr_test_df[["text", "label"]], st_test_df[["text", "label"]]],
    ignore_index=True,
).sample(frac=1, random_state=42)

In [None]:
# Shift the sentiment labels from {-1, 0, 1} to the class ids {0, 1, 2} and make
# sure the 'text' column holds plain strings with no missing values.
# NOTE: this cell mutates the frames in place, so running it twice shifts the
# labels twice; the assertion below makes that mistake fail loudly.

for df in [train_df, val_df, test_df]:
    df["label"] = df["label"].astype(int) + 1  # -1->0, 0->1, 1->2
    # Fill missing values *before* casting: astype(str) turns NaN into the
    # literal string "nan", after which fillna("") has nothing left to replace.
    df["text"] = df["text"].fillna("").astype(str)
    assert df["label"].isin([0, 1, 2]).all(), "labels must be in {0, 1, 2} after shifting"

In [None]:
# Convert the pandas dataframes into HuggingFace Dataset objects.
# The frames were shuffled without resetting their index, so by default
# from_pandas would carry that index along as an extra '__index_level_0__'
# column; preserve_index=False keeps only 'text' and 'label'.

train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset  = Dataset.from_pandas(test_df, preserve_index=False)


In [None]:
# Batch tokenization helper used with Dataset.map(..., batched=True)

def tokenize_fn(batch):
    """Tokenize the 'text' entries of a batch with the module-level `tokenizer`.

    Every entry is coerced with ``str()`` first so the tokenizer always receives
    a list of strings. Sequences are truncated and padded to exactly 128 tokens.

    Args:
        batch: mapping with a "text" key holding an iterable of values.

    Returns:
        Whatever `tokenizer` returns for the list of strings.
    """
    as_strings = list(map(str, batch["text"]))
    encoding_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(as_strings, **encoding_options)

In [None]:
# Load the Marathi-BERT-v2 tokenizer.
# The classification model itself is instantiated in its own cell further down,
# right before training; loading it here as well only duplicated that work and
# the resulting object was overwritten before it was ever used.

model_name = "l3cube-pune/marathi-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the three splits in batches; the original columns are kept and the
# tokenizer's output columns are added alongside them.
train_encoded, val_encoded, test_encoded = (
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Instantiate the model that will be fine-tuned: Marathi-BERT-v2 with a
# 3-way classification head (class ids 0, 1, 2).
# The pooler and classifier weights are not in the checkpoint and are randomly
# initialised (see the warning below), so seed the RNGs first to make that
# initialisation repeatable from run to run.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Metric callback handed to the Trainer: accuracy plus macro-averaged P / R / F1

def compute_metrics(pred):
    """Turn a prediction object into a dict of classification metrics.

    Args:
        pred: object exposing `label_ids` (gold class ids) and `predictions`
            (per-class scores; the arg-max along axis 1 is the predicted class).

    Returns:
        dict with the keys "accuracy", "precision_macro", "recall_macro" and
        "f1_macro", in that order.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    scores = {"accuracy": accuracy_score(y_true, y_pred)}

    # The three macro-averaged metrics share the same call signature;
    # zero_division=0 scores a class with no predictions as 0 instead of warning.
    macro_scorers = (
        ("precision_macro", precision_score),
        ("recall_macro", recall_score),
        ("f1_macro", f1_score),
    )
    for key, scorer in macro_scorers:
        scores[key] = scorer(y_true, y_pred, average="macro", zero_division=0)
    return scores

In [None]:
# Fine-tuning configuration: run bookkeeping, optimisation settings, and
# per-epoch evaluation / checkpointing.

training_args = TrainingArguments(
    # --- bookkeeping ---
    # NOTE(review): the directory name says "mr_gt", but the training frame is
    # built from the MR and ST splits — consider renaming for clarity.
    output_dir="./mr_gt_marathi_bert_v2",
    logging_steps=50,
    report_to="none",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,             # if your GPU supports it
    # --- evaluation / checkpoint selection ---
    # Evaluate and save once per epoch, then reload the checkpoint with the
    # best validation "accuracy" when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


In [None]:
# Build the Trainer that fine-tunes Marathi-BERT-v2 on the combined training
# split and evaluates on the combined validation split.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and triggers a FutureWarning.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_result = trainer.train()
train_result


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.5149,0.551623,0.789,0.789229,0.789,0.787611
2,0.4887,0.500861,0.809333,0.808019,0.809333,0.808238
3,0.4158,0.535234,0.809,0.80795,0.809,0.808074


TrainOutput(global_step=4500, training_loss=0.5135474544101292, metrics={'train_runtime': 971.5491, 'train_samples_per_second': 74.108, 'train_steps_per_second': 4.632, 'total_flos': 4736041519104000.0, 'train_loss': 0.5135474544101292, 'epoch': 3.0})

In [None]:
# Score the fine-tuned model on the combined (MR + ST) held-out test split.
# Metric names come back prefixed with "eval_".

test_metrics = trainer.evaluate(eval_dataset=test_encoded)
print("Test metrics:", test_metrics)


Test metrics: {'eval_loss': 0.5146094560623169, 'eval_accuracy': 0.802, 'eval_precision_macro': 0.8008330249570936, 'eval_recall_macro': 0.8019999999999999, 'eval_f1_macro': 0.8010959441302865, 'eval_runtime': 6.1798, 'eval_samples_per_second': 485.455, 'eval_steps_per_second': 30.422, 'epoch': 3.0}


### **Evaluate on MahaSent_MR Test Dataset**

In [None]:
# Evaluate on the MahaSent_MR test split alone, applying the same preprocessing
# as the training data.
# The rename is a no-op when an earlier cell has already renamed the column; it
# is kept so this cell does not depend on that having happened.
mr_test_df_processed = mr_test_df.rename(columns={"marathi_sentence": "text"})
mr_test_df_processed['label'] = mr_test_df_processed['label'].astype(int) + 1 # -1->0, 0->1, 1->2
# Fill missing values before casting: astype(str) would turn NaN into the
# literal string "nan" and leave fillna('') with nothing to replace.
mr_test_df_processed['text'] = mr_test_df_processed['text'].fillna('').astype(str)
mr_test_dataset = Dataset.from_pandas(mr_test_df_processed)
mr_test_encoded = mr_test_dataset.map(tokenize_fn, batched=True)

mr_test_metrics = trainer.evaluate(mr_test_encoded)
print("MahaSent_MR Test metrics:", mr_test_metrics)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_MR Test metrics: {'eval_loss': 0.5167713165283203, 'eval_accuracy': 0.8093333333333333, 'eval_precision_macro': 0.8085383002310143, 'eval_recall_macro': 0.8093333333333333, 'eval_f1_macro': 0.8080480989794324, 'eval_runtime': 3.9819, 'eval_samples_per_second': 376.702, 'eval_steps_per_second': 23.607, 'epoch': 3.0}


### **Evaluate on MahaSent_GT Test Dataset**

In [None]:
# Evaluate on the MahaSent_GT test split. GT is not part of the training mix
# (the training frame is MR + ST), so this measures out-of-domain performance.
gt_test_df = pd.read_csv('/content/drive/MyDrive/L3Cube/L3Cube-MahaSent-MD/MahaSent_GT/tweets-test.csv')
print("MahaSent_GT Test DataFrame loaded successfully.")
gt_test_df_processed = gt_test_df.copy()
# This file stores its text in a 'tweet' column.
gt_test_df_processed = gt_test_df_processed.rename(columns={"tweet": "text"})
gt_test_df_processed['label'] = gt_test_df_processed['label'].astype(int) + 1 # -1->0, 0->1, 1->2
# Fill missing values before casting: astype(str) would turn NaN into the
# literal string "nan" and leave fillna('') with nothing to replace.
gt_test_df_processed['text'] = gt_test_df_processed['text'].fillna('').astype(str)
gt_test_dataset = Dataset.from_pandas(gt_test_df_processed)
gt_test_encoded = gt_test_dataset.map(tokenize_fn, batched=True)

gt_test_metrics = trainer.evaluate(gt_test_encoded)
print("MahaSent_GT Test metrics:", gt_test_metrics)

MahaSent_GT Test DataFrame loaded successfully.


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_GT Test metrics: {'eval_loss': 0.6722768545150757, 'eval_accuracy': 0.73, 'eval_precision_macro': 0.73667910323389, 'eval_recall_macro': 0.73, 'eval_f1_macro': 0.7305521041355373, 'eval_runtime': 3.089, 'eval_samples_per_second': 485.589, 'eval_steps_per_second': 30.43, 'epoch': 3.0}


### **Evaluate on MahaSent_ST Test Dataset**

In [None]:
# Evaluate on the MahaSent_ST test split alone, applying the same preprocessing
# as the training data.
st_test_df_processed = st_test_df.copy()
# No-op when an earlier cell has already renamed the column; kept so this cell
# does not depend on that having happened.
st_test_df_processed = st_test_df_processed.rename(columns={"marathi_text": "text"})
st_test_df_processed['label'] = st_test_df_processed['label'].astype(int) + 1 # -1->0, 0->1, 1->2
# Fill missing values before casting: astype(str) would turn NaN into the
# literal string "nan" and leave fillna('') with nothing to replace.
st_test_df_processed['text'] = st_test_df_processed['text'].fillna('').astype(str)
st_test_dataset = Dataset.from_pandas(st_test_df_processed)
st_test_encoded = st_test_dataset.map(tokenize_fn, batched=True)
print("MahaSent_ST Test DataFrame processed and encoded successfully.")

st_test_metrics = trainer.evaluate(st_test_encoded)
print("MahaSent_ST Test metrics:", st_test_metrics)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_ST Test DataFrame processed and encoded successfully.


MahaSent_ST Test metrics: {'eval_loss': 0.5124478340148926, 'eval_accuracy': 0.7946666666666666, 'eval_precision_macro': 0.7943437304298531, 'eval_recall_macro': 0.7946666666666666, 'eval_f1_macro': 0.794427145362472, 'eval_runtime': 4.81, 'eval_samples_per_second': 311.849, 'eval_steps_per_second': 19.543, 'epoch': 3.0}


### **Evaluate on MahaSent_PT Test Dataset**

In [None]:
# Evaluate on the MahaSent_PT test split. PT is not part of the training mix
# (the training frame is MR + ST), so this measures out-of-domain performance.
pt_test_df = pd.read_csv('/content/drive/MyDrive/L3Cube/L3Cube-MahaSent-MD/MahaSent_PT/tweets-test.csv')
print("MahaSent_PT Test DataFrame loaded successfully.")

pt_test_df_processed = pt_test_df.copy()
# This file stores its text in a 'tweet' column.
pt_test_df_processed = pt_test_df_processed.rename(columns={"tweet": "text"})
pt_test_df_processed['label'] = pt_test_df_processed['label'].astype(int) + 1 # -1->0, 0->1, 1->2
# Fill missing values before casting: astype(str) would turn NaN into the
# literal string "nan" and leave fillna('') with nothing to replace.
pt_test_df_processed['text'] = pt_test_df_processed['text'].fillna('').astype(str)
pt_test_dataset = Dataset.from_pandas(pt_test_df_processed)
pt_test_encoded = pt_test_dataset.map(tokenize_fn, batched=True)

print("MahaSent_PT Test DataFrame processed and encoded successfully.")

pt_test_metrics = trainer.evaluate(pt_test_encoded)
print("MahaSent_PT Test metrics:", pt_test_metrics)

MahaSent_PT Test DataFrame loaded successfully.


Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

MahaSent_PT Test DataFrame processed and encoded successfully.


MahaSent_PT Test metrics: {'eval_loss': 0.5635440349578857, 'eval_accuracy': 0.7768888888888889, 'eval_precision_macro': 0.7845720442013681, 'eval_recall_macro': 0.7768888888888889, 'eval_f1_macro': 0.776997891312036, 'eval_runtime': 4.5336, 'eval_samples_per_second': 496.291, 'eval_steps_per_second': 31.101, 'epoch': 3.0}


In [None]:
# Side-by-side accuracy summary for the four per-domain test sets.
accuracy_report = [
    f"MahaSent_MR Test Accuracy: {mr_test_metrics['eval_accuracy']:.4f}",
    f"MahaSent_PT Test Accuracy: {pt_test_metrics['eval_accuracy']:.4f}",
    f"MahaSent_GT Test Accuracy: {gt_test_metrics['eval_accuracy']:.4f}",
    f"MahaSent_ST Test Accuracy: {st_test_metrics['eval_accuracy']:.4f}",
]
print(*accuracy_report, sep="\n")

In [None]:
# Rebuild the Dataset objects from the prepared frames and print their schema
# and size as a sanity check. This rebinds train_dataset / val_dataset /
# test_dataset.
# preserve_index=False stops the shuffled pandas index from showing up as a
# spurious '__index_level_0__' feature.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

print("Train Dataset:", train_dataset)
print("Validation Dataset:", val_dataset)
print("Test Dataset:", test_dataset)

Train Dataset: Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 24000
})
Validation Dataset: Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 3000
})
Test Dataset: Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 3000
})
