Connect to Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and all outputs below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# SECURITY: a Hugging Face access token was previously hardcoded on this line as a
# fallback value. Secrets must never live in notebook source: revoke that token at
# https://huggingface.co/settings/tokens and provide a new one through the HF_TOKEN
# environment variable (or a Colab secret) instead.
# Authentication is optional for public models, so a missing token is not fatal here.
if not os.getenv('HF_TOKEN'):
    print("HF_TOKEN is not set; continuing unauthenticated (sufficient for public Hugging Face models).")

In [None]:
import ast
import pandas as pd
import numpy as np
from collections import Counter
import time
import json
from tqdm.auto import tqdm
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

In [None]:
# Location of the lemmatized chunk dataset on the mounted Drive.
CSV_PATH = "/content/drive/MyDrive/HNDResearch/lemmatized_dataset.csv"
print("Loading:", CSV_PATH)
# Load the full CSV; the 'tokenized' / 'lemmatized' columns are converted to lists by safe_parse_list in later cells.
df = pd.read_csv(CSV_PATH)

Loading: /content/drive/MyDrive/HNDResearch/lemmatized_dataset.csv


In [None]:
def safe_parse_list(x):
    """
    Convert one CSV cell into a list of tokens.

    Handled inputs, in order:
      * an actual ``list``              -> returned unchanged
      * a missing value (NaN/None/NA)   -> ``[]``
      * a string holding a Python literal:
          list/tuple                    -> converted to a list
          quoted string                 -> split on whitespace
          any other literal             -> ``[]``
      * any other string                -> surrounding brackets stripped, then
                                           split on commas (if any) or on
                                           whitespace, quotes trimmed per item
    Any other non-string input yields ``[]``.
    """
    # The list check must come first: pd.isna() on a list returns an
    # element-wise array, and using that array in an ``if`` raises ValueError.
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid Python literal: fall back to a lenient manual split.
        if not isinstance(x, str):
            return []
        body = x.strip().strip("[]")
        separator = "," if "," in body else None  # None => split on whitespace
        return [part.strip().strip("'\"") for part in body.split(separator) if part.strip()]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, str):
        return parsed.split()
    return []

In [None]:
# Parse the raw 'tokenized' cells into token lists; if the column is absent,
# every row gets its own empty list instead.
df['tokenized_parsed'] = (
    df['tokenized'].apply(safe_parse_list)
    if 'tokenized' in df.columns
    else [[] for _ in range(len(df))]
)

In [None]:
# Parse the raw 'lemmatized' cells into lemma lists; if the column is absent,
# every row gets its own empty list instead.
df['lemmatized_parsed'] = (
    df['lemmatized'].apply(safe_parse_list)
    if 'lemmatized' in df.columns
    else [[] for _ in range(len(df))]
)

In [None]:
# Coerce the numeric bookkeeping columns; values that cannot be parsed become NaN.
if 'chunk_length' in df.columns:
    # Missing/invalid lengths are replaced by 0 so the column can be a plain int.
    df['chunk_length'] = pd.to_numeric(df['chunk_length'], errors='coerce').fillna(0).astype(int)
if 'chunk_id' in df.columns:
    try:
        df['chunk_id'] = pd.to_numeric(df['chunk_id'], errors='coerce')
    except (TypeError, ValueError) as exc:
        # Best effort: keep the original column, but report the failure instead of
        # hiding it (a bare `except: pass` would even swallow KeyboardInterrupt).
        print("Warning: could not convert 'chunk_id' to numeric; leaving it unchanged:", exc)

In [None]:
# Derived columns: per-row list lengths and the lemmas joined into one
# space-separated string.
df['token_count'] = df['tokenized_parsed'].map(len)
df['lemma_count'] = df['lemmatized_parsed'].map(len)
df['lemmatized_text'] = df['lemmatized_parsed'].map(" ".join)

In [None]:
# Quick inspection: overall shape, the column names, and the bookkeeping
# columns of the first three rows.
print("\nDataframe shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst 3 rows (selected columns):")
# display() is provided by the IPython/Colab runtime and renders the frame as a rich table.
display(df.loc[:, ['file_name', 'chunk_id', 'chunk_length', 'token_count', 'lemma_count']].head(3))


Dataframe shape: (6100, 12)

Columns: ['file_name', 'chunk_id', 'chunk_text', 'chunk_length', 'chunk_text_no_punct', 'tokenized', 'lemmatized', 'tokenized_parsed', 'lemmatized_parsed', 'token_count', 'lemma_count', 'lemmatized_text']

First 3 rows (selected columns):


Unnamed: 0,file_name,chunk_id,chunk_length,token_count,lemma_count
0,akira_article_1001_Affordable_Lungi_Tops_and_D...,1,500,80,79
1,akira_article_1001_Affordable_Lungi_Tops_and_D...,2,500,83,83
2,akira_article_1001_Affordable_Lungi_Tops_and_D...,3,500,92,90


In [None]:
# Summary statistics (count/mean/std/quantiles) for the chunk length and the two token counts.
print("\nBasic stats:")
print(df[['chunk_length', 'token_count', 'lemma_count']].describe())


Basic stats:
       chunk_length  token_count  lemma_count
count   6100.000000  6100.000000  6100.000000
mean     461.721803    78.994426    76.817541
std      105.827164    20.084074    19.665701
min        1.000000     0.000000     0.000000
25%      500.000000    78.000000    76.000000
50%      500.000000    84.000000    82.000000
75%      500.000000    90.000000    87.000000
max      500.000000   114.000000   107.000000


In [None]:
# Columns the retrieval steps below read from the dataframe.
required_cols = ['file_name', 'chunk_id', 'lemmatized_text']
# Keep the order of required_cols so the warning lists columns predictably.
missing = list(filter(lambda col: col not in df.columns, required_cols))
if not missing:
    print("\nAll required columns present for next steps.")
else:
    print("\nWarning: missing required columns:", missing)


All required columns present for next steps.


In [None]:
# Write the dataframe, including the parsed and derived columns added above, back to Drive.
# The row index is not written (index=False).
OUT_PROCESSED = "/content/drive/MyDrive/HNDResearch/lemmatized_dataset_processed_step1.csv"
df.to_csv(OUT_PROCESSED, index=False)
print("\nSaved processed snapshot to:", OUT_PROCESSED)


Saved processed snapshot to: /content/drive/MyDrive/HNDResearch/lemmatized_dataset_processed_step1.csv


Create embeddings, Chroma DB, BM25, and retrieval functions

In [None]:
!pip install -q chromadb sentence-transformers rank_bm25 faiss-cpu transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import time
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from rank_bm25 import BM25Okapi
from collections import defaultdict
from typing import List, Tuple

In [None]:
# Directory on Drive handed to the persistent Chroma client.
DRIVE_CHROMA_DIR = "/content/drive/MyDrive/HNDResearch/chroma_db"
# Model name passed to SentenceTransformer for corpus and query embeddings.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
# Model name passed to CrossEncoder for re-ranking.
RERANKER_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Default number of results returned by the retrieval helpers.
TOP_K = 8

In [None]:
# Make sure the Chroma storage directory exists (no error if it is already there).
os.makedirs(DRIVE_CHROMA_DIR, exist_ok=True)

In [None]:
# Only the first N rows of the dataframe are embedded and indexed.
N = 5400

# Document text: missing values become "", everything is forced to str.
texts = df['lemmatized_text'].iloc[:N].fillna("").astype(str).tolist()
# One metadata dict per document, in the same row order as `texts`.
metadatas = df[['file_name', 'chunk_id', 'chunk_length']].iloc[:N].to_dict(orient='records')
# Ids encode the position in `texts`; the retrieval helpers parse it back out.
ids = ["doc_" + str(position) for position in range(len(texts))]

In [None]:
# Sanity check: number of documents that will be embedded and indexed.
print("Documents:", len(texts))

Documents: 5400


In [None]:
# Load the sentence-embedding model; this one instance encodes both the corpus and the queries.
embedder = SentenceTransformer(EMBED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Encode the corpus in slices of BATCH texts and stack the per-slice matrices
# row-wise into one (n_docs, dim) array of L2-normalized embeddings.
BATCH = 256
embeddings = np.vstack([
    embedder.encode(
        texts[start:start + BATCH],
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    for start in range(0, len(texts), BATCH)
])
print("Embeddings shape:", embeddings.shape)

Embeddings shape: (5400, 384)


In [None]:
# Chroma client whose data is persisted under DRIVE_CHROMA_DIR.
client = chromadb.PersistentClient(path=DRIVE_CHROMA_DIR)

In [None]:
# Open the "fashion_lemmata" collection, creating it if it does not exist yet.
# NOTE(review): no "hnsw:space" is set in the metadata, so the collection presumably uses
# Chroma's default distance metric. Confirm that matches how dense_retrieve converts distances to scores.
collection = client.get_or_create_collection(
    name="fashion_lemmata",
    metadata={"source": "lemmatized_dataset"}
)

In [None]:
# Write the documents into the collection in fixed-size batches.
# * upsert (instead of add) keeps the cell idempotent: the collection is persisted on Drive,
#   so on a re-run the ids already exist and must be overwritten rather than added again.
# * batching keeps every call below Chroma's per-call record limit if N is raised later.
ADD_BATCH = 1000
for start in range(0, len(ids), ADD_BATCH):
    end = start + ADD_BATCH
    collection.upsert(
        ids=ids[start:end],
        metadatas=metadatas[start:end],
        documents=texts[start:end],
        embeddings=embeddings[start:end].tolist()
    )

print("Chroma DB built and persisted at:", DRIVE_CHROMA_DIR)

Chroma DB built and persisted at: /content/drive/MyDrive/HNDResearch/chroma_db


In [None]:
# ---------- 4) Build BM25 sparse index ----------
# BM25 works on token lists; the lemmatized texts are simply split on whitespace.
tokenized_for_bm25 = list(map(str.split, texts))
bm25 = BM25Okapi(tokenized_for_bm25)
print("BM25 index built.")

BM25 index built.


In [None]:
from sentence_transformers import CrossEncoder
# Loading the cross-encoder is optional: on any failure `reranker` is set to
# None and the re-ranking step is skipped downstream.
try:
    reranker = CrossEncoder(RERANKER_NAME)
except Exception as e:
    print("Could not load reranker (will skip reranking). Error:", e)
    reranker = None
else:
    print("Reranker loaded:", RERANKER_NAME)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Reranker loaded: cross-encoder/ms-marco-MiniLM-L-6-v2


Retrieval helper functions

In [None]:
import math
from numpy.linalg import norm

In [None]:
def dense_retrieve(query: str, k: int = TOP_K) -> Tuple[List[str], List[float], float]:
    """
    Dense retrieval via Chroma (embedding similarity).

    Args:
        query: free-text query; embedded with the same model as the corpus.
        k: number of nearest neighbours to request from the collection.

    Returns:
        (ids, similarity scores, latency_seconds). Ids and scores are in the
        order Chroma returned them; higher score = more similar. The latency
        covers query embedding plus the Chroma lookup.
    """
    t0 = time.time()
    q_emb = embedder.encode([query], normalize_embeddings=True)[0]
    # Only ids and distances are consumed here, so don't make Chroma ship the
    # documents and metadatas back as well (ids are always part of the result).
    results = collection.query(query_embeddings=[q_emb.tolist()], n_results=k, include=['distances'])
    latency = time.time() - t0
    hits = results['ids'][0]
    distances = results['distances'][0]
    # The collection is created without an "hnsw:space" override, so Chroma
    # uses its default squared-L2 distance. Corpus and query vectors are both
    # unit length, so d = ||a - b||^2 = 2 - 2*cos(a, b), i.e. cos = 1 - d/2.
    # (The previous `1 - d` reported 2*cos - 1 instead of the cosine.)
    scores = [1.0 - d / 2.0 for d in distances]
    return hits, scores, latency



In [None]:
def sparse_retrieve(query: str, k: int = TOP_K) -> Tuple[List[str], List[float], float]:
    """
    BM25 (sparse) retrieval over the whole corpus.

    The query is split on whitespace, every document is scored, and the k
    highest-scoring documents are returned as (ids, BM25 scores, latency_seconds).
    """
    started = time.time()
    all_scores = bm25.get_scores(query.split())  # one score per indexed document
    # Sort ascending, flip to descending, keep the k best positions.
    best_positions = np.argsort(all_scores)[::-1][:k]
    hits = []
    top_scores = []
    for position in best_positions:
        hits.append(ids[position])
        top_scores.append(float(all_scores[position]))
    return hits, top_scores, time.time() - started


In [None]:
# ---- hybrid_retrieve: dense candidates from dense_retrieve, re-scored with BM25 and fused ----
def hybrid_retrieve(query: str, k: int = TOP_K, alpha: float = 0.5, dense_top_m: int = 100):
    """
    Hybrid retrieval:
      1) fetch dense candidates via dense_retrieve,
      2) look up the BM25 score of each candidate,
      3) min-max normalize both score lists and fuse them as
         alpha * dense + (1 - alpha) * bm25,
      4) return the k best fused candidates.

    Args:
        query: free-text query.
        k: number of results to return.
        alpha: weight of the dense score in the fusion (0 = BM25 only, 1 = dense only).
        dense_top_m: size of the dense candidate pool; at least k candidates are
            always requested so that k results can actually be returned.

    Returns:
        (ids, fused scores, latency_seconds), best first. Empty lists when the
        dense stage yields no candidates.
    """
    t0 = time.time()
    # 1) dense candidates (ids look like 'doc_123')
    dense_ids, dense_scores, _ = dense_retrieve(query, k=max(k, dense_top_m))
    if not dense_ids:
        # Nothing to fuse; min()/max() on an empty array would raise.
        return [], [], time.time() - t0

    # 2) BM25 scores the whole corpus in one call, so compute the score array
    #    once and index into it (previously this ran once per candidate).
    scores_all = bm25.get_scores(query.split())
    bm25_scores = [float(scores_all[int(docid.split("_", 1)[1])]) for docid in dense_ids]

    # 3) normalize both score lists to [0, 1]
    def normalize_list(arr):
        arr = np.asarray(arr, dtype=float)
        lo, hi = arr.min(), arr.max()
        if hi == lo:
            # All candidates tie: use a neutral 0.5 instead of dividing by zero.
            return np.full_like(arr, 0.5)
        return (arr - lo) / (hi - lo)

    fused = alpha * normalize_list(dense_scores) + (1.0 - alpha) * normalize_list(bm25_scores)

    # 4) top-k by fused score, best first
    order = np.argsort(fused)[::-1][:k]
    hits = [dense_ids[i] for i in order]
    scores = [float(fused[i]) for i in order]
    latency = time.time() - t0
    return hits, scores, latency


Reranking function

In [None]:
def rerank_query(query: str, candidate_ids: List[str], top_n: int = None) -> Tuple[List[str], List[float], float]:
    """
    Re-rank candidates with the cross-encoder, if one is loaded.

    Args:
        query: free-text query.
        candidate_ids: ids of the form 'doc_<index>' into `texts`.
        top_n: keep only the best top_n candidates; None keeps all of them.

    Returns:
        (ids, scores, latency_seconds), best first. Without a reranker the
        candidates keep their incoming order, the scores are None and the
        latency is 0.0.
    """
    t0 = time.time()
    if top_n is None:
        top_n = len(candidate_ids)
    if reranker is None:
        # Fallback: original order, but still honour top_n so both code paths
        # return the same number of results.
        kept = list(candidate_ids[:top_n])
        return kept, [None] * len(kept), 0.0
    if not candidate_ids:
        # Nothing to score; avoid calling the model with an empty batch.
        return [], [], 0.0
    # One (query, document) pair per candidate; the document position is encoded in the id.
    pairs = [(query, texts[int(docid.split("_", 1)[1])]) for docid in candidate_ids]
    scores = reranker.predict(pairs)  # higher = better
    order = np.argsort(scores)[::-1][:top_n]
    ordered_ids = [candidate_ids[i] for i in order]
    ordered_scores = [float(scores[i]) for i in order]
    latency = time.time() - t0
    return ordered_ids, ordered_scores, latency



Utility: fetch doc text & metadata by ids

In [None]:
def fetch_docs_by_ids(doc_ids: List[str]) -> List[dict]:
    """
    Look up text and metadata for ids of the form 'doc_<index>'.

    The integer after the first underscore is used as the position in the
    module-level `texts` and `metadatas` lists. Returns one dict per id with
    the keys "id", "text" and "metadata", in input order.
    """
    return [
        {"id": doc_id, "text": texts[position], "metadata": metadatas[position]}
        for doc_id, position in (
            (raw_id, int(raw_id.split("_", 1)[1])) for raw_id in doc_ids
        )
    ]

In [None]:
# Demo: cross-encoder re-ranking of the hybrid candidates for the sample query.
# The query and its candidates are computed inside this cell so it no longer depends on
# names that are only assigned in the "test run" cell further down (on a fresh kernel,
# Restart & Run All failed here with NameError).
if reranker is not None:
    sample_query = "How to style a lungi dress for a festival"
    hy_ids, hy_scores, hy_lat = hybrid_retrieve(sample_query, k=5, alpha=0.6)
    reranked_ids, reranked_scores, rerank_lat = rerank_query(sample_query, hy_ids)
    print("\nReranked (cross-encoder) top-5:")
    for rank, (rid, score) in enumerate(zip(reranked_ids, reranked_scores), start=1):
        info = fetch_docs_by_ids([rid])[0]
        print(rank, rid, f"{score:.4f}", info['metadata'], info['text'][:120].replace("\n"," "))


Reranked (cross-encoder) top-5:
1 doc_2 7.6176 {'file_name': 'akira_article_1001_Affordable_Lungi_Tops_and_Dresses_for_the_Avurudu_Season.pdf', 'chunk_id': 3, 'chunk_length': 500} like a floral hairpin or gold comb can also add an extra layer of elegance and sophistication to your look to style your
2 doc_1 6.0886 {'file_name': 'akira_article_1001_Affordable_Lungi_Tops_and_Dresses_for_the_Avurudu_Season.pdf', 'chunk_id': 2, 'chunk_length': 500} s enjoying family gathering or going to an evening celebration a wellstyled lungi dress will make you stand out in the c
3 doc_10 6.0064 {'file_name': 'akira_article_1003_Top_5_Fashion_Essentials_for_Sinhala_and_Tamil_New_Year.pdf', 'chunk_id': 3, 'chunk_length': 420} up or down making it a go to option for a variety of celebration pair a lungi dress with bold statement jewelry to give 
4 doc_12 4.9102 {'file_name': 'akira_article_1004_Traditional_Meets_Modern_Sinhala_and_Tamil_New_Year_Fashion.pdf', 'chunk_id': 2, 'chunk_length': 500} part of 

test run (sample query)

In [None]:
# Smoke test: run one query through each retriever and print rank, id, score,
# metadata and the first 150 characters of every hit.
sample_query = "How to style a lungi dress for a festival"
print("Sample query:", sample_query)

# Dense (embedding) retrieval
h_ids, h_scores, h_lat = dense_retrieve(sample_query, k=5)
print("\nDense top-5 (id, score, metadata):")
for rank, (doc, score) in enumerate(zip(fetch_docs_by_ids(h_ids), h_scores), start=1):
    print(rank, doc['id'], f"{score:.4f}", doc['metadata'], doc['text'][:150].replace("\n"," "))

# Sparse (BM25) retrieval
b_ids, b_scores, b_lat = sparse_retrieve(sample_query, k=5)
print("\nBM25 top-5 (id, score, metadata):")
for rank, (doc, score) in enumerate(zip(fetch_docs_by_ids(b_ids), b_scores), start=1):
    print(rank, doc['id'], f"{score:.4f}", doc['metadata'], doc['text'][:150].replace("\n"," "))

# Hybrid (dense + BM25 fusion) retrieval
hy_ids, hy_scores, hy_lat = hybrid_retrieve(sample_query, k=5, alpha=0.6)
print("\nHybrid top-5 (id, fused_score, metadata):")
for rank, (doc, score) in enumerate(zip(fetch_docs_by_ids(hy_ids), hy_scores), start=1):
    print(rank, doc['id'], f"{score:.4f}", doc['metadata'], doc['text'][:150].replace("\n"," "))

Sample query: How to style a lungi dress for a festival

Dense top-5 (id, score, metadata):
1 doc_1 0.5227 {'file_name': 'akira_article_1001_Affordable_Lungi_Tops_and_Dresses_for_the_Avurudu_Season.pdf', 'chunk_id': 2, 'chunk_length': 500} s enjoying family gathering or going to an evening celebration a wellstyled lungi dress will make you stand out in the crowd for a more traditional av
2 doc_12 0.5001 {'file_name': 'akira_article_1004_Traditional_Meets_Modern_Sinhala_and_Tamil_New_Year_Fashion.pdf', 'chunk_id': 2, 'chunk_length': 500} part of the celebration is often infused with modern touch to reflect the evolving fashion scene this year one of the most popular trend for lady is t
3 doc_10 0.4818 {'file_name': 'akira_article_1003_Top_5_Fashion_Essentials_for_Sinhala_and_Tamil_New_Year.pdf', 'chunk_id': 3, 'chunk_length': 420} up or down making it a go to option for a variety of celebration pair a lungi dress with bold statement jewelry to give it a festive flair or complete
4 doc_2

Save index metadata for evaluation later

In [None]:
import json
# Record how the index was built so a later evaluation run can reopen it.
idx_meta_path = "/content/drive/MyDrive/HNDResearch/retrieval_index_meta.json"
# The reranker entry is null when the cross-encoder could not be loaded.
reranker_name = None if reranker is None else RERANKER_NAME
meta = {
    "chroma_dir": DRIVE_CHROMA_DIR,
    "collection_name": collection.name,  # the name is JSON-serializable, the collection object is not
    "embed_model": EMBED_MODEL_NAME,
    "reranker": reranker_name,
    "top_k": TOP_K
}
serialized = json.dumps(meta, indent=2)
with open(idx_meta_path, "w") as f:
    f.write(serialized)
print("\nSaved retrieval metadata to:", idx_meta_path)



Saved retrieval metadata to: /content/drive/MyDrive/HNDResearch/retrieval_index_meta.json
