# Data Ingestion

## Libraries

In [None]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
from lancedb.embeddings import get_registry
from sentence_transformers import SentenceTransformer

load_dotenv()

## Functions

In [None]:
class HuggingFaceEmbedder:
    """Compute sentence embeddings through the Hugging Face Inference API.

    Args:
        model_name: Name of the sentence-transformers model, without the
            ``sentence-transformers/`` prefix (the prefix is part of the URL).
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    def __init__(self, model_name: str, api_key: str, timeout: float = 120.0):
        self.model_name = model_name
        self.api_key = api_key
        self.timeout = timeout

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return the decoded JSON response of the feature-extraction endpoint.

        Args:
            texts: Strings to embed, sent to the API in a single request.

        Returns:
            The decoded JSON body of the response; an empty list when
            ``texts`` is empty (no request is sent in that case).

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        # Nothing to embed: skip the network round trip entirely.
        if not texts:
            return []

        api_url = (
            "https://api-inference.huggingface.co/pipeline/feature-extraction/"
            f"sentence-transformers/{self.model_name}"
        )
        response = requests.post(
            url=api_url,
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "inputs": texts,
                "options": {"wait_for_model": True, "use_cache": True},
            },
            # Without a timeout a stalled connection would block forever.
            timeout=self.timeout,
        )
        # Fail loudly on HTTP errors instead of handing an error payload back
        # to the caller as if it were a list of embeddings.
        response.raise_for_status()
        return response.json()

# Parameters

In [None]:
# paths
data_path: Path = Path(".").resolve().parent / "data"
blog_posts_root: Path = data_path / "blog_posts"
post_path_json: Path = blog_posts_root / "json"

# Path.is_dir() only returns a bool, so the result has to be checked
# explicitly for a missing directory to actually stop the notebook.
for required_dir in (data_path, post_path_json):
    if not required_dir.is_dir():
        raise FileNotFoundError(f"Required directory not found: {required_dir}")

# secrets
# os.getenv returns None when HF_TOKEN is not set in the environment.
api_key = os.getenv("HF_TOKEN")

# Code

## Text Embedding



### Embedding Models
- Original models: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
> The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times faster and still offers good quality.

`multi-qa-MiniLM-L6-cos-v1`  (80MB) : "tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs."

In [None]:
# Model name shared by the local, API and LanceDB embedders in this notebook.
model_name: str = "multi-qa-MiniLM-L6-cos-v1"
# Minimal input used to compare the local and API embeddings.
test_docs: list[str] = ["Hello world"]

### Load Model locally

- Note: `sentence-transformers` is a large package; for ways to slim down the install, see https://stackoverflow.com/questions/77205123/how-do-i-slim-down-sberts-sentencer-transformer-library

In [None]:
# Instantiate the model in-process with the sentence-transformers library.
model = SentenceTransformer(model_name)

In [None]:
# Embed the test documents locally; .tolist() converts the encoder output
# into plain Python lists.
embeddings01: list[list[float]] = model.encode(test_docs).tolist()
# embeddings01

### Load Model from HuggingFace API

In [None]:
# Remote counterpart of the local model: same model name, queried over HTTP.
embedder: HuggingFaceEmbedder = HuggingFaceEmbedder(model_name=model_name, api_key=api_key)

In [None]:
# Embed the same test documents through the API so the result can be
# compared with embeddings01.
embeddings02: list[list[float]] = embedder.embed(test_docs)
# embeddings02

In [None]:
# compare embeddings: largest absolute element-wise difference
# (0.0 means the local and API vectors are identical).
# A mean of element-wise ratios is avoided here: deviations above and below 1
# cancel in the average, and components close to zero make the ratio blow up.
abs(np.array(embeddings01) - np.array(embeddings02)).max()

### LanceDB Embedder

In [None]:
# Look up the "sentence-transformers" entry in LanceDB's embeddings registry.
model_registry = get_registry().get("sentence-transformers")

In [None]:
# Create a LanceDB embedding model for `model_name` from the registry entry.
# NOTE: this rebinds `model`, which the "Load Model locally" cells assign to a
# SentenceTransformer instance; re-running those cells afterwards without
# re-creating that instance would operate on this object instead.
model = model_registry.create(name=model_name)

In [None]:
# Presumably the dimensionality of the vectors this model produces —
# TODO confirm against the LanceDB documentation.
model.ndims()

## Prepare data for ingestion

In [None]:
# Function to process a single JSON file

emb_model = SentenceTransformer(model_name)


def emb_funct(text: list[str]) -> list[list[float]]:
    """Encode every string in ``text`` with ``emb_model`` and return plain lists."""
    vectors = emb_model.encode(text)
    return vectors.tolist()


def process_json_file(file_path: Path, emb_funct) -> pd.DataFrame:
    """Turn one blog-post JSON file into a DataFrame with one row per text chunk.

    The chunks are the entries of the file's ``"paragraphs"`` list followed by
    those of its ``"key_takeaways"`` list.

    Args:
        file_path: Path of the JSON file to read.
        emb_funct: Callable that takes the list of chunks and returns one
            embedding per chunk, in the same order.

    Returns:
        A DataFrame with the columns ``url``, ``title``, ``text``,
        ``embedding`` and ``blog_tags``. ``url``, ``title`` and the
        space-joined ``blog_tags`` are repeated on every row.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding="utf-8") as f:
        data: dict = json.load(f)

    # `or []` covers both a missing key and an explicit JSON null.
    paragraphs: list[str] = data.get("paragraphs") or []
    key_takeaways: list[str] = data.get("key_takeaways") or []
    combined_text: list[str] = paragraphs + key_takeaways

    # Create embeddings for each text chunk
    embeddings: list[list[float]] = emb_funct(combined_text)

    n_rows = len(combined_text)
    # A post without tags yields an empty string instead of a TypeError.
    blog_tags = " ".join(data.get("blog_tags") or [])

    return pd.DataFrame(
        {
            "url": [data.get("url")] * n_rows,
            "title": [data.get("title")] * n_rows,
            "text": combined_text,
            "embedding": embeddings,
            "blog_tags": [blog_tags] * n_rows,
        }
    )

In [None]:
# Iterate over all JSON files and process them
files: list[Path] = list(post_path_json.glob("*.json"))

all_data = []

for json_file in files[:1]:
    df = process_json_file(file_path=json_file, emb_funct=emb_funct)
    all_data.append(df)

In [None]:
# Alternative rendering, kept for reference: show the first row with
# untruncated, left-aligned text.
# with pd.option_context("display.max_colwidth", None):
#     display(df.iloc[[0]].style.set_properties(**{"text-align": "left"}))
# Show the row at position 2 of the last processed DataFrame as a one-row
# frame; raises IndexError if that frame has fewer than three rows.
df.iloc[[2]]