In [None]:
# 1. Install necessary libraries
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
# NOTE(review): versions are not pinned; pin them once a known-good set is identified.
%pip install -q transformers datasets mlflow evaluate scikit-learn numpy pandas
# Note: For GPU support, ensure torch is installed with CUDA capability

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.8/76.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import mlflow
from transformers.integrations import MLflowCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,f1_score


In [None]:
# Read the three pre-split CSV files from the working directory,
# one DataFrame per split, in train / validation / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

In [None]:
# Display the training split; the rendered table elides the middle rows.
train_df


Unnamed: 0,sentences,sentiment
0,पछिल्लो घण्टामा उपत्यकामा थपिए भन्दा बढि कोभिड...,0
1,कोटी देवीदेउता भएको हिन्दुस्तानमा कोभिड संक्रम...,0
2,बेलायतमा कोरोना भाइरस कोभिड बाट मृत्यु हुनेको ...,0
3,कोभिड बाट बच्न विश्व बैंकले नेपाललाई साढे अर्ब...,2
4,कम्युनिष्ट आन्दोलनका संस्थापक सदस्य नेकपा का क...,0
...,...,...
22899,मस्कोमा बस्ने डा महेश सिंह श्रेष्ठले कोभिड बिर...,1
22900,कोभिड तरकारी पसल बन्द,0
22901,चीनको वुहानबाट फैलिन शुरू गरेको कोभिड विश्वव्य...,0
22902,एनआरएनए कोभिड तथ्याङ्क विदेशमा हजार जना नेपाली...,2


In [None]:
# Per-class row counts of the 'sentiment' column, printed in
# train / validation / test order.
print(*(
    split_df['sentiment'].value_counts()
    for split_df in (train_df, val_df, test_df)
))

sentiment
2    10134
0     9182
1     3588
Name: count, dtype: int64 sentiment
2    2541
0    2330
1     856
Name: count, dtype: int64 sentiment
2    3205
0    2896
1    1057
Name: count, dtype: int64


In [None]:

def _with_int_labels(df):
    """Return a frame whose label column is named 'labels' and holds ints.

    Works on a frame that still carries the raw 'sentiment' column and on one
    that has already been converted, so this cell can be executed more than
    once without raising KeyError on the renamed column.
    """
    label_col = 'sentiment' if 'sentiment' in df.columns else 'labels'
    # rename() returns a new frame, so the assignment below does not touch
    # the frame that was passed in.
    df = df.rename(columns={label_col: 'labels'})
    df['labels'] = df['labels'].astype(int)
    return df


# Ensure labels are integers and rename the label column from 'sentiment' to 'labels'
train_df = _with_int_labels(train_df)
val_df = _with_int_labels(val_df)
test_df = _with_int_labels(test_df)

# Convert pandas DataFrames to Hugging Face Dataset objects
# (preserve_index=False keeps the pandas index out of the Dataset columns)
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
from transformers import AutoTokenizer

# Settings shared by the tokenization and model cells.
MODEL_NAME = "Shushant/nepaliBERT"  # checkpoint identifier passed to from_pretrained
NUM_LABELS = 3                      # number of target classes for the classifier head
MAX_LENGTH = 128                    # max_length handed to the tokenizer

# Tokenizer belonging to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the 'sentences' field of a batch of examples.

    The module-level ``tokenizer`` is called with truncation enabled,
    ``padding='max_length'`` and ``max_length=MAX_LENGTH``; whatever the
    tokenizer returns is passed straight back to the caller.
    """
    sentence_batch = examples['sentences']
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': MAX_LENGTH,
    }
    return tokenizer(sentence_batch, **encode_options)

# Run tokenize_function over every split in batched mode
# (train, then validation, then test).
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


Map:   0%|          | 0/22904 [00:00<?, ? examples/s]

Map:   0%|          | 0/5727 [00:00<?, ? examples/s]

Map:   0%|          | 0/7158 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized training split: its feature names and row count.
tokenized_train_dataset

Dataset({
    features: ['sentences', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 22904
})

In [None]:
def _drop_raw_text(dataset, column="sentences"):
    """Return ``dataset`` without ``column``.

    If the column is already gone the dataset is returned unchanged, so
    executing this cell a second time does not fail.
    """
    if column in dataset.column_names:
        return dataset.remove_columns([column])
    return dataset


# Remove the original 'sentences' column since it's no longer needed for training
tokenized_train_dataset = _drop_raw_text(tokenized_train_dataset)
tokenized_val_dataset = _drop_raw_text(tokenized_val_dataset)
tokenized_test_dataset = _drop_raw_text(tokenized_test_dataset)

In [None]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support,f1_score
def compute_metrics(eval_pred):
    """Score one evaluation pass for sequence classification.

    Args:
        eval_pred: a ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` along the last axis.

    Returns:
        dict with two entries: 'macro_f1' (primary metric, macro-averaged
        F1-score) and 'accuracy' (secondary metric).
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    return {
        'macro_f1': f1_score(gold, predicted, average='macro'),
        'accuracy': accuracy_score(gold, predicted),
    }

In [None]:
# Pretrained checkpoint wrapped with a sequence-classification head that
# has NUM_LABELS outputs.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Settings for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the highest macro_f1 when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    # reporting
    logging_steps=50,
    report_to=["mlflow"],
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Shushant/nepaliBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Initialize the Trainer.
# MLflow logging is already enabled through report_to=["mlflow"] in
# training_args, so no MLflowCallback is passed here: adding one by hand
# registers the callback a second time (the Trainer warns that "there is
# already one").
trainer = Trainer(
    model=model,                                     # model to fine-tune
    args=training_args,                              # training arguments
    train_dataset=tokenized_train_dataset,           # training dataset
    eval_dataset=tokenized_val_dataset,              # validation dataset
    tokenizer=tokenizer,                             # the tokenizer, kept for saving
    compute_metrics=compute_metrics,                 # the custom metrics
)

# Class names recorded with the run (logged below as the "classes" param)
classes = ["negative", "neutral", "positive"]

with mlflow.start_run(run_name="BERT_Finetune_Nepali"):
    # Record run-level context as MLflow params
    for param_name, param_value in (
        ("model_name", MODEL_NAME),
        ("num_labels", NUM_LABELS),
        ("classes", classes),
    ):
        mlflow.log_param(param_name, param_value)

    print("Starting fine-tuning...")
    trainer.train()

    # Evaluate on the Trainer's eval dataset and log the resulting metrics
    metric = trainer.evaluate()
    mlflow.log_metrics(metric)

    # Persist the model held by the trainer (the best checkpoint, because
    # load_best_model_at_end=True) together with the tokenizer, then attach
    # the saved directory to the run under the "model" artifact path
    export_dir = "./saved_model"
    trainer.save_model(export_dir)
    tokenizer.save_pretrained(export_dir)
    mlflow.log_artifacts(export_dir, artifact_path="model")

    print("Best fine-tuned model and tokenizer saved to ./saved_model")


  trainer = Trainer(
You are adding a <class 'transformers.integrations.integration_utils.MLflowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
MLflowCallback


Starting fine-tuning...


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy
1,0.6074,0.615309,0.687792,0.746813
2,0.4621,0.660247,0.671359,0.75502
3,0.3187,0.720378,0.687512,0.749782


Best fine-tuned model and tokenizer saved to ./saved_model


In [None]:
# # Load the best model for final evaluation
# final_model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
# final_trainer = Trainer(model=final_model, args=training_args, compute_metrics=compute_metrics)

# # Run evaluation on the unseen Test Set
# test_results = final_trainer.evaluate(tokenized_test_dataset)

# print("\n--- Final Evaluation Results (Unseen Test Set) ---")
# print(f"Macro F1-score (Primary Metric): {test_results['eval_macro_f1']:.4f}")
# print(f"Accuracy (Secondary Metric): {test_results['eval_accuracy']:.4f}")
# print("--------------------------------------------------")