# Data Ingestion

## Libraries

In [None]:
import json
import os
import sys
from pathlib import Path
from typing import Protocol

import lancedb
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from lancedb.embeddings import get_registry
from lancedb.embeddings.sentence_transformers import SentenceTransformerEmbeddings
from lancedb.pydantic import LanceModel, Vector
from sentence_transformers import SentenceTransformer

# isort: off
sys.path.append("..")  # include repository-root to load modules from src folder
from src.constants import LANCEDB_URI, post_path_json  # noqa: E402
from src.embeddings import HuggingFaceEmbedder  # noqa: E402

## Typing

In [None]:
class EmbeddingFunction(Protocol):
    """
    Structural type for callables that generate embeddings for texts.

    Any object whose ``__call__`` accepts a list of strings and returns a
    list of embeddings matches this protocol; no inheritance is required.

    Parameters
    ----------
    text : list[str]
        The strings for which embeddings are to be generated.

    Returns
    -------
    list[list[float]]
        A list of embeddings, each embedding being a list of floats.
    """

    def __call__(self, text: list[str]) -> list[list[float]]:
        """Generate the embeddings for the strings in ``text``."""

## Functions

In [None]:
# empty

# Parameters

In [None]:
# paths
# Fail fast when a required directory is missing. `is_dir()` only returns a
# bool, so its result has to be checked explicitly for the cell to fail.
for _required_dir in (post_path_json, LANCEDB_URI):
    if not _required_dir.is_dir():
        raise FileNotFoundError(f"Required directory not found: {_required_dir}")


# Embeddings
# - https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
emb_model_name = (
    "multi-qa-MiniLM-L6-cos-v1"  # a pre-trained model of `sentence-transformers`
)

# secrets
load_dotenv()  # load variables from a `.env` file into the environment
api_key = os.getenv("HF_TOKEN")  # None when HF_TOKEN is not set

# Code

## Text Embedding



### Embedding Models
- Original Models : https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
> The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times faster and still offers good quality

`multi-qa-MiniLM-L6-cos-v1`  (80MB) : "tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs."

In [None]:
# Reuse the model configured in the "Parameters" cell instead of repeating the
# literal, so the exploration cells and the ingestion cells cannot drift apart.
model_name = emb_model_name
# minimal input used to compare the different embedding back-ends
test_docs = ["Hello world"]

### Load Model locally

- Note: `sentence-transformers` is a large package; for ways to slim it down see https://stackoverflow.com/questions/77205123/how-do-i-slim-down-sberts-sentencer-transformer-library

In [None]:
# load the pre-trained sentence-transformers model named by `model_name`
model = SentenceTransformer(model_name)

In [None]:
# embed the test documents with the local model;
# `.tolist()` converts the encoder output into plain Python lists
embeddings01 = model.encode(test_docs).tolist()
# embeddings01

### Load Model from HuggingFace API

In [None]:
# embedder from the project's `src.embeddings` module, configured with the
# same model name and the HF_TOKEN value read from the environment
embedder = HuggingFaceEmbedder(model_name=model_name, api_key=api_key)

In [None]:
# slow since it's an API call
# embed the same test documents through `embedder` for comparison with `embeddings01`
embeddings02 = embedder.embed(test_docs)
# embeddings02

In [None]:
# compare embeddings
# Largest absolute element-wise deviation between the two results. Unlike the
# mean of signed relative errors, positive and negative deviations cannot
# cancel each other out, and no division by zero-valued components occurs.
np.abs(np.array(embeddings01) - np.array(embeddings02)).max()

### LanceDB Embedder

In [None]:
# look up the "sentence-transformers" embedding function in LanceDB's registry
# and create an instance of it for `model_name`
# NOTE(review): this rebinds `model`, which held the local SentenceTransformer
# in the cells above; re-running the `model.encode(...)` cell afterwards will
# presumably fail — consider a distinct name
model_registry = get_registry().get("sentence-transformers")
model = model_registry.create(name=model_name)
# number of dimensions of the vectors produced by this embedder
model.ndims()

## Prepare data for ingestion (ignored)

In [None]:
# Function to process a single JSON file


def process_json_file(file_path: Path, emb_func: EmbeddingFunction) -> pd.DataFrame:
    """
    Load one post JSON file and return its text chunks with embeddings.

    Parameters
    ----------
    file_path : Path
        Path of the JSON file to read. The keys ``url``, ``title``,
        ``paragraphs``, ``key_takeaways`` and ``blog_tags`` are used; a
        missing (or null) key is treated as absent/empty.
    emb_func : EmbeddingFunction
        Callable that maps a list of strings to a list of embeddings.
        It is not called when the file contains no text.

    Returns
    -------
    pd.DataFrame
        One row per text chunk (paragraphs first, then key takeaways) with
        the columns ``url``, ``title``, ``text``, ``embedding`` and
        ``blog_tags``. ``blog_tags`` holds the de-duplicated tags joined by
        a space, in their order of first appearance.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, encoding="utf-8") as f:
        data: dict = json.load(f)

    # Extract the text data; `or []` also covers keys that are set to null
    paragraphs: list[str] = data.get("paragraphs") or []
    key_takeaways: list[str] = data.get("key_takeaways") or []
    combined_text: list[str] = paragraphs + key_takeaways
    n_chunks = len(combined_text)

    # Create embeddings for each text chunk; skip the embedder entirely when
    # there is nothing to embed (some back-ends reject empty input)
    embeddings: list[list[float]] = emb_func(combined_text) if combined_text else []

    # `dict.fromkeys` drops duplicates but, unlike `set`, keeps a stable order
    blog_tags = " ".join(dict.fromkeys(data.get("blog_tags") or []))

    # Prepare a DataFrame; file-level fields are repeated for every chunk
    return pd.DataFrame(
        {
            "url": [data.get("url")] * n_chunks,
            "title": [data.get("title")] * n_chunks,
            "text": combined_text,
            "embedding": embeddings,
            "blog_tags": [blog_tags] * n_chunks,
        }
    )

In [None]:
# using local model
emb_model = SentenceTransformer(model_name)


def emb_func(
    text: list[str], _model: SentenceTransformer = emb_model
) -> list[list[float]]:
    """
    Embed ``text`` with the local sentence-transformers model.

    Parameters
    ----------
    text : list[str]
        The strings to embed.
    _model : SentenceTransformer, optional
        Model used for encoding. It is bound when the function is defined,
        because the global name ``emb_model`` is re-assigned to a LanceDB
        embedding function further down in this notebook; reading the
        global at call time would then break this function.

    Returns
    -------
    list[list[float]]
        The encoder output converted to plain Python lists.
    """
    return _model.encode(text).tolist()

In [None]:
# Process the JSON files (only the first one for this trial run).
# The glob result is sorted because directory listings come back in arbitrary
# order; without it `files[:1]` could select a different file on every run.
files: list[Path] = sorted(post_path_json.glob("*.json"))

all_data = []

for json_file in files[:1]:
    # `df` is kept as a notebook-level name; the next cell inspects it
    df = process_json_file(file_path=json_file, emb_func=emb_func)
    all_data.append(df)

In [None]:
# with pd.option_context("display.max_colwidth", None):
#     display(df.iloc[[0]].style.set_properties(**{"text-align": "left"}))
# show the row at position 2 of the most recently processed file as a one-row DataFrame
# NOTE(review): raises IndexError when that file produced fewer than three rows
df.iloc[[2]]

## Ingestion

### Test data set to ingest

In [None]:
# file list of JSON files
# Sorted, because directory listings come back in arbitrary order and the
# cells below ingest only a slice of this list; sorting keeps that slice
# identical between runs.
files: list[Path] = sorted(post_path_json.glob("*.json"))
print(f"{len(files)} JSON files are in: {post_path_json}")

In [None]:
# one entry per paragraph; title, url and tags of the post are repeated on each entry
test_table_data: list[dict[str, str]] = []

for json_file in files[:2]:
    # read as UTF-8 explicitly instead of relying on the platform default
    with open(json_file, encoding="utf-8") as f:
        doc: dict = json.load(f)
    paragraphs: list[str] = doc["paragraphs"]
    title: str = doc["title"]
    url: str = doc["url"]
    # remove duplicates and join with space; `dict.fromkeys` keeps the tags in
    # their original order, whereas `set` yields a different order per run
    blog_tags: str = " ".join(dict.fromkeys(doc["blog_tags"]))
    test_table_data.extend(
        [
            {"text": para, "title": title, "url": url, "blog_tags": blog_tags}
            for para in paragraphs
        ]
    )

# print number of entries
print(f"{len(test_table_data)} entries")

### Method 01 : Simple: just text + vector

- Following the LanceDB documentation:
    - https://lancedb.github.io/lancedb/embeddings/embedding_functions/
    - https://lancedb.github.io/lancedb/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers/

In [None]:
# Define the embedding function
# (LanceDB's registered "sentence-transformers" embedder for `emb_model_name`)
emb_model: SentenceTransformerEmbeddings = (
    get_registry().get("sentence-transformers").create(name=emb_model_name)
)
# vector dimensionality reported by the embedder; used to size the vector column
n_dim_vec = emb_model.ndims()


# Define the data model or schema
class DataModel01(LanceModel):
    """Table schema: one text chunk, its vector, and the metadata of its post."""

    # vector column of size `n_dim_vec`, declared as the embedder's vector field
    vector: Vector(dim=n_dim_vec) = emb_model.VectorField()
    # text column, declared as the embedder's source field
    text: str = emb_model.SourceField()
    # plain metadata columns
    title: str
    url: str
    blog_tags: str

In [None]:
# create/connect to the database
db: lancedb.db.DBConnection = lancedb.connect(uri=LANCEDB_URI)

# create the table from the schema only (no rows yet); mode="overwrite" is passed so an existing "table01" is replaced
table01: lancedb.table.Table = db.create_table(
    "table01", schema=DataModel01, mode="overwrite"
)

# add data to the table, which creates embeddings for the text column stored in the vector column
table01.add(data=test_table_data)

#### testing table content

In [None]:
# test input
# preview the first two rows of the table
table01.head(2)

In [None]:
# run a simple sanity-check query against the table and keep the top 5 hits
query = "How to reduce Heart Disease Risk"
response: list[DataModel01] = (
    table01.search(query).limit(5).to_pydantic(DataModel01)
)

# how many distinct source URLs are among the hits
urls: set = set(hit.url for hit in response)
print(f"{len(urls)} unique URL(s)")

# how many distinct post titles are among the hits
titles: set = set(hit.title for hit in response)
print(f"{len(titles)} unique Title(s)")

# list the matched text chunks, numbered from 0
print(f"{len(response)} results for: '{query}'")
for i, actual in enumerate(response, start=0):
    print(f"\t{i}. {actual.text}")

In [None]:
# same query as above, this time without an explicit `.limit(...)`
resp = table01.search(query).to_pydantic(DataModel01)
# report the size of *this* result (`resp`), not of the earlier `response`
print(f"{len(resp)} results for: {query}")
resp