# Data Ingestion via LanceDB

## Libraries

In [12]:
import json
import os
import sys
import warnings
from pathlib import Path

import lancedb
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from lancedb.embeddings import get_registry
from lancedb.embeddings.sentence_transformers import SentenceTransformerEmbeddings
from lancedb.pydantic import LanceModel, Vector
from sentence_transformers import SentenceTransformer

# isort: off
sys.path.append("..")  # include repository-root to load modules from src folder
from src.constants import (  # noqa: E402
    LANCEDB_URI,
    POST_JSON_PATH,
    REPO_PATH,
    get_rag_config,
)
from src.embeddings import HuggingFaceEmbedder, EmbeddingFunction  # noqa: E402
from src.chunking import recursive_text_splitter  # noqa: E402

# ignore some warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

## Functions

# Parameters

In [14]:
# paths: stop early if a required directory is missing
# (a bare `Path.is_dir()` call only returns False, it never raises)
for required_dir in (POST_JSON_PATH, LANCEDB_URI):
    if not required_dir.is_dir():
        raise FileNotFoundError(f"Required directory does not exist: {required_dir}")


# Embeddings
emb_model_name: str = get_rag_config()["embeddings"]["model_name"]
device: str = get_rag_config()["hardware"]["device"]

# secrets
# NOTE(review): load_dotenv's first argument is the path of the .env file itself;
# confirm REPO_PATH points at that file and not at the repository directory.
load_dotenv(REPO_PATH)
api_key = os.getenv("HF_TOKEN")  # None when HF_TOKEN is not set

# Code

## Text Embedding



### Embedding Models
- Original Models : https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
> The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times faster and still offers good quality

`multi-qa-MiniLM-L6-cos-v1`  (80MB) : "tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs."

In [16]:
# model used for the embedding experiments in this section
model_name: str = "multi-qa-MiniLM-L6-cos-v1"
# minimal input used to compare the different embedding back-ends
test_docs: list[str] = ["Hello world"]

### Load Model locally

- Note: `sentence-transformers` is a big package; for ways to slim it down, see https://stackoverflow.com/questions/77205123/how-do-i-slim-down-sberts-sentencer-transformer-library

In [17]:
# load the model locally on the device taken from the RAG config
model = SentenceTransformer(model_name, device=device)

In [18]:
model.max_seq_length

512

In [None]:
# embed the test docs with the local model; .tolist() converts the encode()
# result into plain Python lists
embeddings01 = model.encode(test_docs).tolist()
# embeddings01

### Load Model from HuggingFace API

In [None]:
# embedder from src.embeddings, given the same model name and the HF_TOKEN value
embedder = HuggingFaceEmbedder(model_name=model_name, api_key=api_key)

In [None]:
# embed the same test docs through the HuggingFaceEmbedder
# slow since it's an API call
embeddings02 = embedder.embed(test_docs)
# embeddings02

In [None]:
# compare embeddings: mean of the element-wise relative deviation of the local
# result from the API result (identical vectors give 0)
# NOTE(review): signed deviations can cancel in the mean, and near-zero components
# of embeddings02 inflate the ratio -- np.allclose would be a stricter check
(1 - np.array(embeddings01) / np.array(embeddings02)).mean()

### LanceDB Embedder

In [None]:
# look up the "sentence-transformers" entry of LanceDB's embedding registry
model_registry = get_registry().get("sentence-transformers")
# NOTE: rebinds `model`, which previously held the locally loaded SentenceTransformer
model = model_registry.create(name=model_name)
# show the vector dimension reported by the embedding function
model.ndims()

## Prepare data for ingestion

### Old: Function to process a single JSON file

In [None]:
# Function to process a single JSON file


def process_json_file(file_path: Path, emb_func: EmbeddingFunction) -> pd.DataFrame:
    """
    Load one blog-post JSON file and return one row per embedded text chunk.

    The chunks are the entries of "paragraphs" followed by those of
    "key_takeaways"; a missing key counts as an empty list. "url", "title" and
    the space-joined, de-duplicated "blog_tags" are repeated on every row.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        emb_func: Callable mapping the list of chunks to one embedding per chunk.

    Returns:
        DataFrame with the columns "url", "title", "text", "embedding" and
        "blog_tags", holding one row per chunk.
    """
    # JSON is UTF-8 by specification; don't rely on the platform default encoding
    with open(file_path, encoding="utf-8") as f:
        data: dict = json.load(f)

    # Extract the text data
    paragraphs: list[str] = data.get("paragraphs", [])
    key_takeaways: list[str] = data.get("key_takeaways", [])
    combined_text: list[str] = paragraphs + key_takeaways

    # Create embeddings for each text chunk
    embeddings: list[list[float]] = emb_func(combined_text)

    # dict.fromkeys drops duplicates but keeps first-seen order (a set would make
    # the tag order vary between runs); `or []` tolerates a missing or null key
    blog_tags: str = " ".join(dict.fromkeys(data.get("blog_tags") or []))

    # Prepare a DataFrame: per-document metadata is repeated for every chunk
    n_chunks: int = len(combined_text)
    df = pd.DataFrame(
        {
            "url": [data.get("url")] * n_chunks,
            "title": [data.get("title")] * n_chunks,
            "text": combined_text,
            "embedding": embeddings,
            "blog_tags": [blog_tags] * n_chunks,
        }
    )

    return df

In [None]:
# using local model, loaded on the configured device like the other local model loads
emb_model = SentenceTransformer(model_name, device=device)


def emb_func(text: list[str]) -> list[list[float]]:
    """Embed every input string with the local model and return plain Python lists."""
    return emb_model.encode(text).tolist()

In [None]:
# Iterate over all JSON files and process them
files: list[Path] = list(POST_JSON_PATH.glob("*.json"))

# one DataFrame per processed file
all_data: list[pd.DataFrame] = []

# only the first file is processed here (files[:1]); `df` stays bound to the
# DataFrame of the last processed file once the loop has finished
for json_file in files[:1]:
    df = process_json_file(file_path=json_file, emb_func=emb_func)
    all_data.append(df)

In [None]:
# with pd.option_context("display.max_colwidth", None):
#     display(df.iloc[[0]].style.set_properties(**{"text-align": "left"}))
df.iloc[[2]]

### Clean paragraphs

In [21]:
# load the embedding model named in the RAG config on the configured device
emb_model = SentenceTransformer(emb_model_name, device=device)

# Get the tokenizer from the model
tokenizer = emb_model.tokenizer

In [4]:
# Iterate over all JSON files and process them
files: list[Path] = list(POST_JSON_PATH.glob("*.json"))

In [5]:
files[0]

PosixPath('/home/alex/repos/rag_nutrition_facts_blog/notebooks/../data/blog_posts/json/eliminate-90-percent-heart-disease-risk.json')

In [25]:
def text_has_only_questions(text: str) -> bool:
    """
    Tell whether the text is made up of questions only.

    True when the string holds at least one "?" and neither a "." nor a "!"
    (i.e. it asks something but states no information).
    """
    if "?" not in text:
        return False
    return not any(mark in text for mark in (".", "!"))

In [8]:
# load the paragraphs of the first blog post
json_file = files[0]
# JSON is UTF-8 by specification; don't rely on the platform default encoding
with open(json_file, encoding="utf-8") as f:
    doc: dict = json.load(f)
paragraphs: list[str] = doc["paragraphs"]

SyntaxError: invalid syntax (3816603086.py, line 6)

In [23]:
[len(tokenizer.tokenize(para)) for para in paragraphs]

[131, 69, 109, 69, 84, 93, 59, 37, 28, 8]

In [11]:
[text_has_only_questions(para) for para in paragraphs]

[False, False, False, False, False, False, False, False, False, False]

In [27]:
[len(para) / len(tokenizer.tokenize(para)) for para in paragraphs]

[4.732824427480916,
 4.579710144927536,
 4.926605504587156,
 5.318840579710145,
 4.785714285714286,
 5.032258064516129,
 5.084745762711864,
 4.162162162162162,
 4.142857142857143,
 5.25]

In [43]:
# token budget per chunk from the RAG config, turned into a character budget
# (assumes roughly 4 characters per token) with a 10 % overlap between chunks
n_token_max: int = get_rag_config()["embeddings"]["n_token_max"]
n_char_max: int = n_token_max * 4
overlap: int = int(n_char_max * 0.1)

paragraphs_new: list[str] = []
for i, para in enumerate(paragraphs):
    # paragraphs made up of questions only are dropped
    if text_has_only_questions(para):
        continue

    n_token: int = len(tokenizer.tokenize(para))

    # short enough: keep the paragraph untouched
    if n_token <= n_token_max:
        paragraphs_new.append(para)
        continue

    # over the token budget: replace the paragraph by its chunks
    para_chunks: list[str] = recursive_text_splitter(para, n_char_max, overlap)
    print(f"{i}: {n_token} tokens: split needed. New chunks: {len(para_chunks)}")
    paragraphs_new.extend(para_chunks)

print(f"Original: {len(paragraphs)} -> New: {len(paragraphs_new)}")

0: 131 tokens: split needed. New chunks: 2
1: 69 tokens: split needed. New chunks: 1
2: 109 tokens: split needed. New chunks: 2
3: 69 tokens: split needed. New chunks: 1
4: 84 tokens: split needed. New chunks: 1
5: 93 tokens: split needed. New chunks: 2
6: 59 tokens: split needed. New chunks: 1
7: 37 tokens: split needed. New chunks: 1
8: 28 tokens: split needed. New chunks: 1
Original: 10 -> New: 13


In [44]:
paragraphs_new[0]

' myths and dogmas die hard. Researchers creating a new body of knowledge for prevention and control of heart disease had to disprove a bunch of doozies. For example, we used to think that heart disease, high cholesterol, and high blood pressure were just inevitable consequences of aging. All these are now bygone notions, refuted by massive data. Other long-standing myths and dogmas about our number one killer epidemic persist, however. For example, many still'

In [45]:
paragraphs_new[1]

' persist, however. For example, many still think that major risk factors, like cholesterol, account for a minority of risk, and that many people have heart attacks with no risk factors at all.'

In [None]:
a = "bla adfasb ? absada ?"
b = "bla adfasb ? absada ."


def check(q: str) -> bool:
    """
    Return True if q has question marks but neither periods nor exclamation marks.

    Args:
        q: Text to inspect.

    Returns:
        True when q contains at least one "?" and no "." and no "!"; False
        otherwise (including for the empty string).
    """
    # membership tests, not counts: any number of "?" is fine, while a single
    # "." or "!" disqualifies the text
    return "?" in q and not any(mark in q for mark in ".!")

## Test Ingestion

### Test data set to ingest

In [None]:
# file list of JSON files ("*.json" matches only entries directly inside POST_JSON_PATH)
files: list[Path] = list(POST_JSON_PATH.glob("*.json"))
# report how many files were found and where
print(f"{len(files)} JSON files are in: {POST_JSON_PATH}")

In [None]:
# rows to ingest: one dict per paragraph, carrying the metadata of its blog post
test_table_data: list[dict[str, str]] = []

# only the first two files are used for the test table
for json_file in files[:2]:
    # JSON is UTF-8 by specification; don't rely on the platform default encoding
    with open(json_file, encoding="utf-8") as f:
        doc: dict = json.load(f)
    paragraphs: list[str] = doc["paragraphs"]
    title: str = doc["title"]
    url: str = doc["url"]
    # remove duplicates and join with space; dict.fromkeys keeps first-seen order,
    # so the stored string is reproducible between runs (set order is not)
    blog_tags: str = " ".join(dict.fromkeys(doc["blog_tags"]))
    test_table_data.extend(
        {"text": para, "title": title, "url": url, "blog_tags": blog_tags}
        for para in paragraphs
    )

# print number of entries
print(f"{len(test_table_data)} entries")

### Method 01 : Simple: just text + vector

- following: 
    - https://lancedb.github.io/lancedb/embeddings/embedding_functions/
    - https://lancedb.github.io/lancedb/embeddings/available_embedding_models/text_embedding_functions/sentence_transformers/

In [None]:
# Define the embedding function
emb_model: SentenceTransformerEmbeddings = (
    get_registry().get("sentence-transformers").create(name=emb_model_name)
)
# dimension reported by the embedding function; sizes the vector column below
n_dim_vec = emb_model.ndims()


# Define the data model or schema
class DataModel(LanceModel):
    # vector column, declared through the embedding function's VectorField()
    vector: Vector(dim=n_dim_vec) = emb_model.VectorField()
    # text column, declared through the embedding function's SourceField()
    text: str = emb_model.SourceField()
    # metadata stored next to every text entry
    title: str
    url: str
    blog_tags: str

In [None]:
# open the LanceDB database at the configured location (created if missing)
db: lancedb.db.DBConnection = lancedb.connect(LANCEDB_URI)

# build the table from the schema; "overwrite" replaces a table of the same name
table: lancedb.table.Table = db.create_table(
    name="table01",
    schema=DataModel,
    mode="overwrite",
)

# insert the rows; the embeddings for the text column are created on insert
# and stored in the vector column
table.add(test_table_data)

#### testing table content

In [None]:
# test input
table.head(2)

In [None]:
# simple sanity query against the test table, limited to the top 5 hits
query = "How to reduce Heart Disease Risk"
response: list[DataModel] = table.search(query).limit(5).to_pydantic(DataModel)

# distinct URLs and titles among the hits
urls: set = set(hit.url for hit in response)
titles: set = set(hit.title for hit in response)

print(f"{len(urls)} unique URL(s)")
print(f"{len(titles)} unique Title(s)")

# list the text of every hit, numbered from 0
print(f"{len(response)} results for: '{query}'")
for i, actual in enumerate(response):
    print(f"\t{i}. {actual.text}")

In [None]:
# same query without an explicit limit
resp: list[DataModel] = table.search(query).to_pydantic(DataModel)
# report the size of this result set (`resp`), not of `response` from the limited query
print(f"{len(resp)} results for: {query}")
resp