In [None]:
#imports
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)

In [None]:
# Colab-only step: open the browser upload dialog so the CSV files read by the
# following cells are available in the working directory (the cell output lists
# the files that were saved). `uploaded` holds whatever files.upload() returns.
from google.colab import files
uploaded = files.upload()

Saving MahaSent_ST_Test.csv to MahaSent_ST_Test.csv
Saving MahaSent_ST_Train.csv to MahaSent_ST_Train.csv
Saving MahaSent_ST_Val.csv to MahaSent_ST_Val.csv
Saving tweets-test_GT.csv to tweets-test_GT.csv
Saving tweets-train_GT.csv to tweets-train_GT.csv
Saving tweets-valid_GT.csv to tweets-valid_GT.csv


In [None]:
# Read the train / validation / test CSVs of each corpus, in the same order
# as before: first the GT tweet files, then the MahaSent_ST files.
gt_train_df, gt_val_df, gt_test_df = map(
    pd.read_csv,
    ("tweets-train_GT.csv", "tweets-valid_GT.csv", "tweets-test_GT.csv"),
)
st_train_df, st_val_df, st_test_df = map(
    pd.read_csv,
    ("MahaSent_ST_Train.csv", "MahaSent_ST_Val.csv", "MahaSent_ST_Test.csv"),
)

In [None]:
# Show the first rows of both training frames, separated by blank lines,
# to check which column holds the text in each corpus.
print(gt_train_df.head(), "\n", st_train_df.head(), sep="\n")

                                               tweet  label  political
0          होता होता राहीलेला  निवडणूक मारो मर्ज़ीभई     -1          0
1                         खरा लखोबा तर हा बोबडाच आहे     -1         -1
2  मुंबईतील घाटकोपरमध्ये धुळवड खेळून घरी परतलेलं ...     -1          0
3      वेडाबाई भूतकाळ बघ लोक शेन घालतात आणी दांडा ही     -1         -1
4  कुर्ला वाहतुक विभागाला फक्त हे पाठवले जाते पण ...     -1         -1


   Unnamed: 0                                  marathi_text  label
0           0                              मी तो मूर्ख आहे.     -1
1           1  आईने आत्महत्या केली आणि माझे वडील पळून गेले.     -1
2           2                   - त्यांनी फक्त जॉयला मारले?     -1
3           3            मला माफ करा मी आज तिथेच केले नाही.     -1
4           4                            नाही, नक्कीच नाही.     -1


In [None]:
# Standardize column names by renaming text fields to 'text'

# Training: rename the corpus-specific text column and keep only the two
# columns used downstream.
gt_train_df = gt_train_df.rename(columns={"tweet": "text"})[["text", "label"]]
st_train_df = st_train_df.rename(columns={"marathi_text": "text"})[["text", "label"]]

# Combine the GT and ST training sets into a single train dataframe, then
# shuffle it (fixed seed) to avoid ordering bias during training.
# The index is reset after shuffling so the frame has a clean 0..n-1 index;
# otherwise Dataset.from_pandas would carry the scrambled index along as an
# extra `__index_level_0__` column.
train_df = (
    pd.concat([gt_train_df, st_train_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Same preparation for validation and test: rename the text column, keep
# ("text", "label"), concatenate GT + ST, shuffle with a fixed seed.
# The index is reset after shuffling so Dataset.from_pandas does not add the
# scrambled index as an extra `__index_level_0__` column.

# Validation
gt_val_df = gt_val_df.rename(columns={"tweet": "text"})
st_val_df = st_val_df.rename(columns={"marathi_text": "text"})
val_df = (
    pd.concat(
        [gt_val_df[["text", "label"]], st_val_df[["text", "label"]]],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Test (gt_test_df / st_test_df stay renamed; the per-corpus evaluation cells
# further down read their "text" column)
gt_test_df = gt_test_df.rename(columns={"tweet": "text"})
st_test_df = st_test_df.rename(columns={"marathi_text": "text"})
test_df = (
    pd.concat(
        [gt_test_df[["text", "label"]], st_test_df[["text", "label"]]],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Convert original sentiment labels to numeric IDs for model training
# (-1->0, 0->1, 1->2) and make sure 'text' contains only strings.

for df in (train_df, val_df, test_df):
    shifted_labels = df["label"].astype(int) + 1
    # Check BEFORE assigning: if this cell is accidentally run twice the labels
    # would be shifted again, so fail loudly instead of corrupting the frame.
    assert shifted_labels.isin([0, 1, 2]).all(), (
        "labels must map to {0, 1, 2}; was this cell run more than once?"
    )
    df["label"] = shifted_labels
    # Fill missing text first, then cast. With the cast first, NaN would
    # already have become the literal string "nan" and fillna would do nothing.
    df["text"] = df["text"].fillna("").astype(str)

In [None]:
# Wrap each pandas split in a HuggingFace Dataset (train, validation, test).
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_df, val_df, test_df)
)

In [None]:
# Tokenize text: pad, truncate, and encode inputs for BERT

def tokenize_fn(batch):
    """Tokenize one batch of examples with the module-level `tokenizer`.

    Every entry of ``batch["text"]`` is passed through ``str`` first, and the
    resulting list is encoded with truncation and padding to a fixed length
    of 128. Returns whatever the tokenizer call returns.
    """
    as_strings = list(map(str, batch["text"]))
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(as_strings, **encode_kwargs)

In [None]:
# Load the Marathi-BERT-v2 tokenizer.
# The classification model itself is instantiated in the cell just before the
# training configuration; loading it here as well only downloaded and built
# the same ~951 MB checkpoint twice, with nothing using the first copy.

model_name = "l3cube-pune/marathi-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the three splits in batches (train, then validation, then test).
train_encoded, val_encoded, test_encoded = (
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Seed all RNGs before building the model: the checkpoint has no classifier
# head (see the "newly initialized" warning), so those weights are drawn
# randomly here and would otherwise differ on every run.
set_seed(42)

# 3 output classes, matching the label ids 0 / 1 / 2 produced from -1 / 0 / 1.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define accuracy, precision, recall, and F1 score metrics

def compute_metrics(pred):
    """Compute accuracy and macro precision / recall / F1 for one evaluation.

    `pred` must expose `label_ids` (true class ids) and `predictions`
    (per-class scores); the predicted class is the argmax along axis 1.
    Returns a dict keyed by "accuracy", "precision_macro", "recall_macro"
    and "f1_macro".
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    # Shared settings for the macro-averaged scores; zero_division=0 is passed
    # to each of the three scorers.
    macro_kwargs = {"average": "macro", "zero_division": 0}

    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    for key, scorer in (
        ("precision_macro", precision_score),
        ("recall_macro", recall_score),
        ("f1_macro", f1_score),
    ):
        scores[key] = scorer(y_true, y_pred, **macro_kwargs)
    return scores

In [None]:
# Configure training parameters such as epochs, batch size, and learning rate.
# Evaluation and checkpointing both happen once per epoch so that the best
# checkpoint (by validation accuracy) can be reloaded at the end of training.

training_args = TrainingArguments(
    output_dir="./mr_gt_marathi_bert_v2",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=50,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [None]:
# Initialize the Trainer for fine-tuning Marathi-BERT-v2 (training itself is
# started in the next cell).
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer,
# which emits a FutureWarning and is slated for removal in transformers 5.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer configured above; the per-epoch table and
# the returned TrainOutput are shown as this cell's output.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.5295,0.518233,0.793667,0.802323,0.793667,0.796228
2,0.4599,0.4838,0.807,0.807377,0.807,0.807128
3,0.3937,0.520829,0.806,0.804783,0.806,0.805268


TrainOutput(global_step=4500, training_loss=0.5045495851304796, metrics={'train_runtime': 1143.1092, 'train_samples_per_second': 62.986, 'train_steps_per_second': 3.937, 'total_flos': 4736041519104000.0, 'train_loss': 0.5045495851304796, 'epoch': 3.0})

In [None]:
# Evaluate the fine-tuned model on the GT test split on its own.
# gt_test_df already had "tweet" renamed to "text" in the validation/test
# preparation cell, so no rename is needed here.
gt_test_df_processed = gt_test_df.copy()
gt_test_df_processed['label'] = gt_test_df_processed['label'].astype(int) + 1  # -1->0, 0->1, 1->2
# Fill missing text first, then cast. With the cast first, NaN would already
# have become the literal string "nan" and fillna would do nothing.
gt_test_df_processed['text'] = gt_test_df_processed['text'].fillna('').astype(str)
gt_test_dataset = Dataset.from_pandas(gt_test_df_processed)
gt_test_encoded = gt_test_dataset.map(tokenize_fn, batched=True)

gt_test_metrics = trainer.evaluate(gt_test_encoded)
print("MahaSent_GT Test metrics:", gt_test_metrics)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_GT Test metrics: {'eval_loss': 0.5047086477279663, 'eval_accuracy': 0.7973333333333333, 'eval_precision_macro': 0.7982170249950084, 'eval_recall_macro': 0.7973333333333333, 'eval_f1_macro': 0.7975865447719787, 'eval_runtime': 4.9032, 'eval_samples_per_second': 305.925, 'eval_steps_per_second': 19.171, 'epoch': 3.0}


In [None]:
# Evaluate the fine-tuned model on the ST test split on its own.
st_test_df_processed = st_test_df.copy()
# No-op if the column was already renamed by the earlier preparation cell;
# kept so this cell also works on the raw frame.
st_test_df_processed = st_test_df_processed.rename(columns={"marathi_text": "text"})
st_test_df_processed['label'] = st_test_df_processed['label'].astype(int) + 1  # -1->0, 0->1, 1->2
# Fill missing text first, then cast. With the cast first, NaN would already
# have become the literal string "nan" and fillna would do nothing.
st_test_df_processed['text'] = st_test_df_processed['text'].fillna('').astype(str)
st_test_dataset = Dataset.from_pandas(st_test_df_processed)
st_test_encoded = st_test_dataset.map(tokenize_fn, batched=True)

print("MahaSent_ST Test DataFrame processed and encoded successfully.")
st_test_metrics = trainer.evaluate(st_test_encoded)
print("MahaSent_ST Test metrics:", st_test_metrics)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_ST Test DataFrame processed and encoded successfully.


MahaSent_ST Test metrics: {'eval_loss': 0.5055975914001465, 'eval_accuracy': 0.8026666666666666, 'eval_precision_macro': 0.8039079831299292, 'eval_recall_macro': 0.8026666666666666, 'eval_f1_macro': 0.8031978757743392, 'eval_runtime': 5.2139, 'eval_samples_per_second': 287.695, 'eval_steps_per_second': 18.029, 'epoch': 3.0}


In [None]:
# Second upload dialog, for the additional test files read by the next cells
# (the cell output lists the files that were saved). Relies on `files` having
# been imported from google.colab in the first upload cell; overwrites `uploaded`.
uploaded = files.upload()

Saving MahaSent_MR_Test.csv to MahaSent_MR_Test.csv
Saving tweets-test_PT.csv to tweets-test_PT.csv


In [None]:
# Evaluate the fine-tuned model on the MahaSent_MR test set.
mr_test_df = pd.read_csv('MahaSent_MR_Test.csv')
# The status message now names the file that is actually loaded (MR, not PT).
print("MahaSent_MR Test DataFrame loaded successfully.")
print(mr_test_df.head())

mr_test_df_processed = mr_test_df.copy()
mr_test_df_processed = mr_test_df_processed.rename(columns={"marathi_sentence": "text"})
mr_test_df_processed['label'] = mr_test_df_processed['label'].astype(int) + 1  # -1->0, 0->1, 1->2
# Fill missing text first, then cast. With the cast first, NaN would already
# have become the literal string "nan" and fillna would do nothing.
mr_test_df_processed['text'] = mr_test_df_processed['text'].fillna('').astype(str)
mr_test_dataset = Dataset.from_pandas(mr_test_df_processed)
mr_test_encoded = mr_test_dataset.map(tokenize_fn, batched=True)
print("MahaSent_MR Test DataFrame processed and encoded successfully.")

mr_test_metrics = trainer.evaluate(mr_test_encoded)
print("MahaSent_MR Test metrics:", mr_test_metrics)

MahaSent_PT Test DataFrame loaded successfully.
   Unnamed: 0                                   marathi_sentence  label
0           0  जयदीप त्याच्या आई संहिता हिच्या घटस्फोटाचा भार...     -1
1           1                  मात्र तो त्यात यशस्वी झालेला नाही     -1
2           2  जेव्हा लोकांचा समूह त्यांच्या ड्रॉईंग रूमच्या ...     -1
3           3  दिग्दर्शकाची चित्रपटावरची पकडच पक्की नसल्यानं ...     -1
4           4  तो झिपऱ्याचे जग उत्तम प्रकारे निर्माण करत असता...     -1


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

MahaSent_MR Test DataFrame processed and encoded successfully.


MahaSent_MR Test metrics: {'eval_loss': 0.6709393262863159, 'eval_accuracy': 0.7393333333333333, 'eval_precision_macro': 0.7385103596551782, 'eval_recall_macro': 0.7393333333333333, 'eval_f1_macro': 0.737932975988278, 'eval_runtime': 4.5311, 'eval_samples_per_second': 331.047, 'eval_steps_per_second': 20.746, 'epoch': 3.0}


In [None]:
# Evaluate the fine-tuned model on the PT tweets test set.
pt_test_df = pd.read_csv('tweets-test_PT.csv')
pt_test_df_processed = pt_test_df.copy()
pt_test_df_processed = pt_test_df_processed.rename(columns={"tweet": "text"})
pt_test_df_processed['label'] = pt_test_df_processed['label'].astype(int) + 1  # -1->0, 0->1, 1->2
# Fill missing text first, then cast. With the cast first, NaN would already
# have become the literal string "nan" and fillna would do nothing.
pt_test_df_processed['text'] = pt_test_df_processed['text'].fillna('').astype(str)
pt_test_dataset = Dataset.from_pandas(pt_test_df_processed)
pt_test_encoded = pt_test_dataset.map(tokenize_fn, batched=True)

pt_test_metrics = trainer.evaluate(pt_test_encoded)
print("MahaSent_PT Test metrics:", pt_test_metrics)


Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

MahaSent_PT Test metrics: {'eval_loss': 0.5806778073310852, 'eval_accuracy': 0.7693333333333333, 'eval_precision_macro': 0.780444632821244, 'eval_recall_macro': 0.7693333333333333, 'eval_f1_macro': 0.7660493241670844, 'eval_runtime': 8.2972, 'eval_samples_per_second': 271.177, 'eval_steps_per_second': 16.994, 'epoch': 3.0}
