## 📦 Step 1: Setup & Installation
Install required dependencies.

In [None]:
# >>> Step 1: Install dependencies
!pip install -q sentence-transformers datasets pyarrow huggingface_hub

## 🔧 Step 2: Import Libraries & Load Pretrained Model

In [None]:
# >>> Step 2: Import libraries & load pretrained model
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader

# Base checkpoint that will be fine-tuned in the following steps.
model_name = "BAAI/bge-base-en-v1.5"

# Instantiate the sentence-embedding model from the checkpoint name above.
model = SentenceTransformer(model_name_or_path=model_name)

print("Model loaded:", model_name)

## 📂 Step 3: Load & Preprocess Dataset

In [None]:
# >>> Step 3: Load and preprocess dataset
# Read the local parquet shard and expose it as the "train" split.
dataset = load_dataset("parquet", data_files={"train": "train-00000-of-00001.parquet"}, split="train")

# Accept a "query" column as an alias for "question" so both layouts work below.
if "query" in dataset.column_names and "question" not in dataset.column_names:
    dataset = dataset.rename_column("query", "question")

# Fail early with a readable message instead of a KeyError raised from inside the
# list comprehension when an expected column is absent.
missing_columns = {"question", "answer"} - set(dataset.column_names)
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Prefix prepended to every query (but not to the answers) before encoding.
instruction = "Represent this sentence for searching relevant passages: "

# Build (query, positive passage) pairs. Rows whose question or answer is null or
# empty are skipped: a null question would raise TypeError on concatenation, and an
# empty text is not a usable training pair.
train_examples = [
    InputExample(texts=[instruction + r["question"], r["answer"]])
    for r in dataset
    if r["question"] and r["answer"]
]

skipped_rows = len(dataset) - len(train_examples)
if skipped_rows:
    print("Skipped", skipped_rows, "rows with an empty question or answer")

print("Dataset loaded:", len(train_examples), "examples")

## ⚙️ Step 4: Setup Training

In [None]:
# >>> Step 4: Setup training components
batch_size = 32

# Ranking loss bound to the model being fine-tuned.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Shuffled mini-batches over the (query, answer) examples built in Step 3.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=batch_size,
    shuffle=True,
)

## 🏋️ Step 5: Fine-Tune the Model

In [None]:
# >>> Step 5: Fine-tune the model
output_path = "../saved_models/fine-tuned-bge-qna"
num_epochs = 1

# Warm up the learning rate over the first 10% of all optimisation steps.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_training_steps * 0.1)

# Single training objective: the Step 4 dataloader paired with its loss.
# The fine-tuned model is written to `output_path`.
training_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[training_objective],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=output_path,
    show_progress_bar=True,
)

## 📊 Step 6: Evaluate the Fine-Tuned Model

In [None]:
# >>> Step 6: Evaluate the fine-tuned model
# Sanity check: one physics question scored against one on-topic passage and
# two unrelated ones.
test_query = "What is the SI unit for the electric field?"
passages = [
    "The units of the electric field in the SI system are newtons per coulomb (N/C), or volts per meter (V/m).",
    "The primary colors of light are red, green, and blue.",
    "A CPU is the electronic circuitry that executes instructions comprising a computer program."
]

# Load the model from the directory used as the training output path.
fine_tuned_model = SentenceTransformer(output_path)

# The query gets the same instruction prefix used when building the training
# examples; the passages are encoded as-is.
instructed_query = instruction + test_query
query_embedding = fine_tuned_model.encode(instructed_query)
passage_embeddings = fine_tuned_model.encode(passages)

# Row 0 of the similarity matrix holds the scores for the single query.
similarities = util.cos_sim(query_embedding, passage_embeddings)
for passage, score in zip(passages, similarities[0]):
    print(f"Similarity: {score:.4f} | Passage: {passage}")

## ☁️ Step 7: Push Model to Hugging Face Hub

In [None]:
# >>> Step 7: Push model to Hugging Face Hub
from huggingface_hub import notebook_login

# Interactive login prompt for the Hugging Face access token.
notebook_login()

# Placeholder: replace with your own Hugging Face username (or organisation)
# before running, otherwise the upload targets a namespace you do not own.
hf_username = "your-hf-username"
model_id = f"{hf_username}/bge-base-my-qna-model"

# `save_to_hub` is deprecated in favour of `push_to_hub`; fall back to it only on
# older sentence-transformers releases that do not provide `push_to_hub` yet.
upload_to_hub = getattr(fine_tuned_model, "push_to_hub", None) or fine_tuned_model.save_to_hub

# exist_ok=True keeps the cell re-runnable: without it, a second run fails at
# repository creation because the repository already exists.
upload_to_hub(
    model_id,
    commit_message="Fine-tuned BGE-base on Q&A dataset",
    exist_ok=True,
)

## 🔍 Step 8: Inference from Hugging Face Hub

In [None]:
# >>> Step 8: Inference from Hugging Face Hub
# Load the model by its Hub repository id rather than from the local directory.
hub_model = SentenceTransformer(model_id)

# Candidate passages: one on-topic answer and two distractors.
passages = [
    "Mitochondria are organelles often called the powerhouse of the cell.",
    "The cell wall provides structural support to plant cells.",
    "DNA contains genetic instructions."
]

# Only the query carries the instruction prefix; passages are encoded unchanged.
query = instruction + "What is the powerhouse of the cell?"
query_emb = hub_model.encode(query)
pass_emb = hub_model.encode(passages)

# Row 0 of the similarity matrix holds the scores for the single query.
similarities = util.cos_sim(query_emb, pass_emb)
for passage, score in zip(passages, similarities[0]):
    print(f"Similarity: {score:.4f} | Passage: {passage}")