In [36]:
!pip install -q transformers datasets scikit-learn accelerate


In [37]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [38]:
# model_name = "aryanx16/mahabert-v2-daft"
model_name = "l3cube-pune/marathi-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 labels: pos/neg/neutral


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# from google.colab import drive
# drive.mount('/content/drive')

In [40]:
train_df = pd.read_csv("/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD/MahaSent_PT/tweets-train.csv")
val_df = pd.read_csv("/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD/MahaSent_PT/tweets-valid.csv")
test_df = pd.read_csv('/content/drive/MyDrive/L3Cube/datasets/MarathiNLP-main/L3Cube-MahaSent-MD/MahaSent_PT/tweets-test.csv')
train_df.head()


Unnamed: 0,tweet,label
0,‡§ú‡•ç‡§Ø‡•á‡§∑‡•ç‡§† ‡§™‡§§‡•ç‡§∞‡§ï‡§æ‡§∞ ‡§Ö‡§®‡§Ç‡§§ ‡§¶‡•Ä‡§ï‡•ç‡§∑‡§ø‡§§ ‡§Ø‡§æ‡§Ç‡§ö‡•ç‡§Ø‡§æ ‡§®‡§ø‡§ß‡§®‡§æ‡§ö‡•á ‡§µ...,-1
1,‡§∏‡§∞‡•ç‡§µ‡•ã‡§ö‡•ç‡§ö ‡§®‡•ç‡§Ø‡§æ‡§Ø‡§æ‡§≤‡§Ø‡§æ‡§ö‡•á ‡§®‡§ø‡§∞‡•ç‡§¶‡•á‡§∂ ‡§°‡§æ‡§µ‡§≤‡•Ç‡§® ‡§™‡•Å‡§£‡•á ‡§™‡•ã‡§≤‡§ø‡§∏...,-1
2,‡§â‡§¶‡•ç‡§ß‡§µ ‡§†‡§æ‡§ï‡§∞‡•á‡§Ç‡§®‡•Ä ‡§≠‡§æ‡§ú‡§™‡§æ‡§∏‡•ã‡§¨‡§§ ‡§Ø‡•Å‡§§‡•Ä ‡§ï‡§∞‡•Ç‡§® ‡§∂‡§ø‡§µ‡§∏‡•à‡§®‡§ø‡§ï‡§æ‡§Ç‡§ö...,-1
3,‡§Ü‡§™‡§≤‡§æ ‡§∏‡§Æ‡§æ‡§ú ‡§ñ‡•Ç‡§™ ‡§Æ‡•ã‡§†‡§æ ‡§Ü‡§π‡•á. ‡§§‡•ç‡§Ø‡§æ‡§§ ‡§Ö‡§®‡•á‡§ï ‡§ò‡§ü‡§ï ‡§Ö‡§Ç‡§§‡§∞‡•ç‡§≠‡•Ç...,1
4,‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞‡§æ‡§ö‡§æ ‡§¨‡§¶‡§≤‡§æ ‡§¨‡§≤‡§æ‡§§‡•ç‡§ï‡§æ‡§∞‡§æ‡§®‡•á ‡§ò‡•á‡§§‡§≤‡§æ ‡§™‡§æ‡§π‡§ø‡§ú‡•á ‡§π‡•á ‡§Æ...,-1


In [41]:
# Drop rows with missing tweet or label
train_df = train_df.dropna(subset=["tweet", "label"])
val_df   = val_df.dropna(subset=["tweet", "label"])
test_df  = test_df.dropna(subset=["tweet", "label"])

# Convert labels from (-1, 0, 1) to (0, 1, 2)
for df in [train_df, val_df, test_df]:
    df["label"] = df["label"].astype(int)
    df["label"] = df["label"] + 1  # -1->0, 0->1, 1->2

train_df["label"].value_counts(), val_df["label"].value_counts(), test_df["label"].value_counts()


(label
 0    4038
 2    4038
 1    4038
 Name: count, dtype: int64,
 label
 2    500
 1    500
 0    500
 Name: count, dtype: int64,
 label
 2    750
 0    750
 1    750
 Name: count, dtype: int64)

In [42]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)
test_dataset  = Dataset.from_pandas(test_df)


In [43]:
model_name = "l3cube-pune/marathi-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [44]:
def tokenize_fn(batch):
    return tokenizer(
        batch["tweet"],
        truncation=True,
        padding="max_length",
        max_length=128
    )


In [45]:
train_encoded = train_dataset.map(tokenize_fn, batched=True)
val_encoded   = val_dataset.map(tokenize_fn, batched=True)
test_encoded  = test_dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/12114 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

In [46]:
def cleanup_columns(ds):
    cols_to_keep = ["input_ids", "attention_mask", "label"]
    cols_to_remove = [c for c in ds.column_names if c not in cols_to_keep]
    return ds.remove_columns(cols_to_remove)

train_encoded = cleanup_columns(train_encoded)
val_encoded   = cleanup_columns(val_encoded)
test_encoded  = cleanup_columns(test_encoded)

print(train_encoded)


Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 12114
})


In [47]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/marathi-bert-v2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision_macro": precision_score(labels, preds, average="macro", zero_division=0),
        "recall_macro": recall_score(labels, preds, average="macro", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
    }

In [49]:
# training_args = TrainingArguments(
#     output_dir="./gt_marathi_bert_v2",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=64,
#     per_device_eval_batch_size=64,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
#     logging_steps=50,
#     fp16=True,             # if your GPU supports it
#     report_to="none"
# )


training_args = TrainingArguments(
    output_dir="./results",                   # üìÅ Where to save checkpoints
    eval_strategy="epoch",              # üß™ Evaluate after each epoch
    save_strategy="epoch",                    # üíæ Save model after each epoch
    per_device_train_batch_size=64,            # üß† Reduce memory usage
    per_device_eval_batch_size=64,             # ‚öñÔ∏è Matching batch size for eval
    num_train_epochs=2,                       # üîÅ You can try 1 or 2 for speed
    learning_rate=2e-5,                       # üöÄ Good default for BERT
    # warmup_steps=100,                         # üî• Warmup helps training stability
    weight_decay=0.01,                        # üìâ Helps avoid overfitting
    logging_dir="./logs",                     # üìÇ Logs directory
    logging_steps=50,                         # üìù Log every 50 steps
    load_best_model_at_end=True,              # üèÜ Automatically load best model
    metric_for_best_model="accuracy",         # üéØ Use accuracy to pick best
    save_total_limit=1,                       # üíæ Keep only last best model
    fp16=True,                                # ‚ö° Mixed precision for faster training
    report_to="none",                         # üìä Turn off W&B if you‚Äôre not using it
    seed=42,         # Standard seed, might be what they used
data_seed=42,
)

In [50]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [51]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8708,0.776619,0.846667,0.84684,0.846667,0.845961
2,0.714,0.689097,0.858667,0.858208,0.858667,0.858389


TrainOutput(global_step=380, training_loss=0.8345106074684545, metrics={'train_runtime': 212.2472, 'train_samples_per_second': 114.15, 'train_steps_per_second': 1.79, 'total_flos': 1593677971178496.0, 'train_loss': 0.8345106074684545, 'epoch': 2.0})

In [52]:
test_metrics = trainer.evaluate(test_encoded)
print("Test metrics:", test_metrics)


Test metrics: {'eval_loss': 0.7016482353210449, 'eval_accuracy': 0.844, 'eval_precision_macro': 0.8435901061599816, 'eval_recall_macro': 0.844, 'eval_f1_macro': 0.8436648056041323, 'eval_runtime': 4.0012, 'eval_samples_per_second': 562.328, 'eval_steps_per_second': 8.997, 'epoch': 2.0}
