<a href="https://colab.research.google.com/github/alyazone/Quranic-Theme-Extraction-Visualization/blob/main/v2_fine_tuning_transformers_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import transformers
import datasets
import torch
import sklearn
import sentence_transformers

print("Transformers version:", transformers.__version__)
print("Datasets version:", datasets.__version__)
print("Torch version:", torch.__version__)
print("Scikit-learn version:", sklearn.__version__)
print("Sentence-Transformers version:", sentence_transformers.__version__)


Transformers version: 4.46.3
Datasets version: 3.1.0
Torch version: 2.5.1+cpu
Scikit-learn version: 1.5.2
Sentence-Transformers version: 3.3.1


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Load your dataset
df = pd.read_csv("balanced-fine-tuning-dataset.csv")

# Concatenate Translation Verses and Refined Keywords
df["Combined Input"] = df["Translation Verses"] + " " + df["Refined Keywords"].apply(lambda x: " ".join(eval(x)))

# Save the combined dataset for fine-tuning
df[["Combined Input", "Mapped Theme"]].to_csv("fine-tuning-combined-input.csv", index=False)



In [None]:
# Load the prepared dataset
data = pd.read_csv("fine-tuning-combined-input.csv")

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(data)
print(dataset)


Dataset({
    features: ['Combined Input', 'Mapped Theme'],
    num_rows: 544
})


In [None]:
# Fit the LabelEncoder with unique themes
label_encoder = LabelEncoder()
label_encoder.fit(dataset["Mapped Theme"])  # Fit with unique labels in the dataset

# Encode the labels
dataset = dataset.map(lambda x: {"label": label_encoder.transform([x["Mapped Theme"]])[0]})
num_labels = len(label_encoder.classes_)
print(f"Number of labels: {num_labels}")

# Check the label mapping
print(f"Label mapping: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

Map:   0%|          | 0/544 [00:00<?, ? examples/s]

Number of labels: 4
Label mapping: {np.str_('Forgiveness'): np.int64(0), np.str_('Gratitude'): np.int64(1), np.str_('Patience'): np.int64(2), np.str_('Truthfulness'): np.int64(3)}


In [None]:
# Load tokenizer
model_name = "distilbert-base-uncased"  # Or "all-MiniLM-L6-v2" for Sentence Transformers
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["Combined Input"], truncation=True, padding=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/544 [00:00<?, ? examples/s]

In [None]:
# Prepare labels as tensors
def prepare_labels(batch):
    batch["labels"] = torch.tensor(batch["label"], dtype=torch.long)  # Convert 'label' to tensor
    return batch

# Step 1: Split into train+validation and test
train_valid_test = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_valid = train_valid_test["train"]  # 85% of the dataset
test_dataset = train_valid_test["test"]  # 15% of the dataset

# Step 2: Split train+validation into train and validation
train_valid_split = train_valid.train_test_split(test_size=0.1765, seed=42)  # ~15% of total for validation
train_dataset = train_valid_split["train"]  # 70% of the total dataset
val_dataset = train_valid_split["test"]  # 15% of the total dataset

# Train dataset: 70% of the total dataset, used for model training.
# Validation dataset: 15% of the total dataset, used for evaluating the model during training.
# Test dataset: 15% of the total dataset, used for final evaluation after fine-tuning.

# Print the sizes of each dataset
print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}, Test size: {len(test_dataset)}")

print(train_dataset[0])  # Check the structure
print(type(train_dataset[0]["label"]))  # Should print <class 'torch.Tensor'>

Train size: 380, Validation size: 82, Test size: 82
{'Combined Input': 'It is God who made the night for you to rest, the day to make things visible. Indeed God is gracious to men, but most men are not grateful. rest grateful gracious', 'Mapped Theme': 'Gratitude', 'label': 1, 'input_ids': [101, 2009, 2003, 2643, 2040, 2081, 1996, 2305, 2005, 2017, 2000, 2717, 1010, 1996, 2154, 2000, 2191, 2477, 5710, 1012, 5262, 2643, 2003, 24665, 20113, 2000, 2273, 1010, 2021, 2087, 2273, 2024, 2025, 8794, 1012, 2717, 8794, 24665, 20113, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# fine-tuning process begins
# Load pre-trained model for classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.922865
2,No log,0.476091
3,No log,0.384135


TrainOutput(global_step=144, training_loss=0.7711997032165527, metrics={'train_runtime': 790.944, 'train_samples_per_second': 1.441, 'train_steps_per_second': 0.182, 'total_flos': 48078066345120.0, 'train_loss': 0.7711997032165527, 'epoch': 3.0})

In [None]:
# Evaluate the model on the test dataset
test_results = trainer.evaluate(test_dataset)

# Print the evaluation metrics
print("Test Evaluation Results:")
print(test_results)


Test Evaluation Results:
{'eval_loss': 0.35183367133140564, 'eval_runtime': 17.2088, 'eval_samples_per_second': 4.765, 'eval_steps_per_second': 0.639, 'epoch': 3.0}


In [None]:
# verify metrics

from sklearn.metrics import accuracy_score, classification_report
import numpy as np

predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)  # Convert logits to class indices
labels = predictions.label_ids

accuracy = accuracy_score(labels, preds)
print(f"Test Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(classification_report(labels, preds, target_names=label_encoder.classes_))

Test Accuracy: 0.9390
Classification Report:
              precision    recall  f1-score   support

 Forgiveness       1.00      0.86      0.93        22
   Gratitude       0.88      0.94      0.91        16
    Patience       0.95      1.00      0.98        20
Truthfulness       0.92      0.96      0.94        24

    accuracy                           0.94        82
   macro avg       0.94      0.94      0.94        82
weighted avg       0.94      0.94      0.94        82



In [None]:
# Save the fine-tuned model and tokenizer
trainer.save_model("./v2_fine_tuned_theme_classifier")
tokenizer.save_pretrained("./v2_fine_tuned_theme_classifier")

('./v2_fine_tuned_theme_classifier\\tokenizer_config.json',
 './v2_fine_tuned_theme_classifier\\special_tokens_map.json',
 './v2_fine_tuned_theme_classifier\\vocab.txt',
 './v2_fine_tuned_theme_classifier\\added_tokens.json',
 './v2_fine_tuned_theme_classifier\\tokenizer.json')