In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import lancedb
from sentence_transformers import SentenceTransformer
import pyarrow as pa
import pandas as pd

In [2]:
def load_pdf(data):
    """Load a PDF with ``PyPDFLoader`` and return the loaded documents.

    Args:
        data: Location of the PDF file; passed unchanged to ``PyPDFLoader``.

    Returns:
        The result of calling ``load()`` on the loader.
    """
    return PyPDFLoader(data).load()

In [3]:
# Load the nutrition textbook PDF with load_pdf; the file name is given
# without a directory, so it is resolved relative to the working directory.
extracted_data = load_pdf('Introduction to Nutrition Science, LibreTexts Project.pdf')

In [4]:
def split_pdf(documents, chunk_size: int = 500, chunk_overlap: int = 50):
    """Split the extracted PDF documents into chunks with LangChain's text splitter.

    Args:
        documents: Documents to split (e.g. the output of ``load_pdf``).
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 50, the value
            previously hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [5]:
# Split the extracted documents into chunks and report how many were produced
chunks = split_pdf(extracted_data)
print("length of chunks:", len(chunks))

length of chunks: 3595


In [6]:
# Page content of the first and the last chunk. Index -1 always addresses the
# last chunk, so the cell keeps working if the number of chunks changes.
chunks[0].page_content, chunks[-1].page_content

('INTRODUCTION TO \nNUTRITION SCIENCE',
 '4.0\n16.3: The Food Industry - CC BY-NC-SA 4.0\n16.4: The Politics of Food - CC BY-NC-SA 4.0\n16.5: Food Cost and Inflation - CC BY-NC-SA 4.0\n16.6: The Issue of Food Security - CC BY-NC-SA 4.0\n16.7: Nutrition and Your Health - CC BY-NC-SA 4.0\n16.8: Diets around the World - CC BY-NC-SA 4.0\n16.E: Food Politics and Perspectives (Exercise) - CC\nBY-NC-SA 4.0\nBack Matter - CC BY-NC-SA 4.0\nIndex - CC BY-NC-SA 4.0\nGlossary - CC BY-NC-SA 4.0\nDetailed Licensing - CC BY-NC-SA 4.0')

In [7]:
def generate_embeddings(chunks, model_name: str = "all-MiniLM-L6-v2"):
    """Encode the text of every chunk with a SentenceTransformer model.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` attribute.
        model_name: Model identifier handed to ``SentenceTransformer``.

    Returns:
        The value returned by ``model.encode`` for the list of chunk texts.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode([doc.page_content for doc in chunks])

In [8]:
# Generate embeddings for the chunks with the default model of
# generate_embeddings ("all-MiniLM-L6-v2")
embeddings = generate_embeddings(chunks)

In [9]:
len(embeddings)

3595

In [10]:
#chunks

In [11]:
#embeddings

#### Create a Lance database and insert the text and embeddings into it

In [17]:
import numpy as np

def prepare_embeddings(embeddings, fixed_size):
    """Force every embedding to exactly ``fixed_size`` entries.

    Longer vectors are truncated, shorter ones are right-padded with zeros and
    vectors that already have the requested length are passed through as-is.

    Args:
        embeddings: Iterable of 1-D vectors (NumPy arrays or Python sequences);
            a 2-D NumPy array is iterated row by row.
        fixed_size: Required number of entries per vector.

    Returns:
        A list with one vector per input. NumPy inputs stay NumPy arrays
        (dtype preserved when padding); other padded sequences become lists.
    """
    prepared_embeddings = []
    for embedding in embeddings:
        length = len(embedding)
        if length > fixed_size:
            # Truncate if longer
            prepared_embedding = embedding[:fixed_size]
        elif length < fixed_size:
            missing = fixed_size - length
            if isinstance(embedding, np.ndarray):
                # `array + [0.0] * k` would broadcast (or raise) instead of
                # appending, so arrays must be padded by concatenation.
                prepared_embedding = np.concatenate(
                    [embedding, np.zeros(missing, dtype=embedding.dtype)]
                )
            else:
                # Plain sequences: list concatenation appends the zeros.
                prepared_embedding = list(embedding) + [0.0] * missing
        else:
            prepared_embedding = embedding
        prepared_embeddings.append(prepared_embedding)
    return prepared_embeddings

# Target number of entries per embedding vector.
# NOTE(review): 384 is presumably the output width of the default
# "all-MiniLM-L6-v2" model used by generate_embeddings — confirm before
# switching to another model.
fixed_size = 384

# Run every embedding through prepare_embeddings with the target width
prepared_embeddings = prepare_embeddings(embeddings, fixed_size)

In [18]:
def load_chunks_into_lancedb(chunks, embeddings, db_path: str, table_name: str):
    """Write text chunks and their embeddings into a LanceDB table.

    The table is (re)created with ``mode="overwrite"`` on every call, so
    re-running the cell replaces the previous contents instead of appending
    duplicate rows.

    Args:
        chunks: Objects exposing ``page_content`` (the chunk text).
        embeddings: One vector per chunk, in the same order as ``chunks``.
            All vectors must have the same length.
        db_path: Location handed to ``lancedb.connect``.
        table_name: Name of the table to (re)create.

    Returns:
        ``(df, table)``: a pandas DataFrame with the rows that were written
        (``chunk_id``, ``text``, ``embedding`` as lists of floats) and the
        LanceDB table handle.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    chunks = list(chunks)
    embeddings = list(embeddings)
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
            "they must be paired one-to-one."
        )

    # Connect to LanceDB
    db = lancedb.connect(db_path)

    # Use the `embeddings` argument (not a notebook global) and store plain
    # Python floats so both pandas and pyarrow can consume the vectors.
    data = {
        "chunk_id": list(range(len(chunks))),
        "text": [chunk.page_content for chunk in chunks],
        "embedding": [[float(value) for value in embedding] for embedding in embeddings],
    }
    df = pd.DataFrame(data)

    # A fixed-size list column lets LanceDB treat "embedding" as a vector
    # column; the width is taken from the first vector. With no rows the
    # width is unknown, so fall back to a variable-length list.
    if data["embedding"]:
        vector_type = pa.list_(pa.float32(), len(data["embedding"][0]))
    else:
        vector_type = pa.list_(pa.float32())
    schema = pa.schema([
        ("chunk_id", pa.int32()),
        ("text", pa.string()),
        ("embedding", vector_type),
    ])

    # Create the table and actually insert the rows.
    if data["chunk_id"]:
        arrow_table = pa.Table.from_pydict(data, schema=schema)
        table = db.create_table(table_name, data=arrow_table, mode="overwrite")
    else:
        table = db.create_table(table_name, schema=schema, mode="overwrite")

    print(f"Inserted {len(df)} chunks with embeddings into LanceDB table '{table_name}'.")

    return df, table


In [19]:
# Location of the LanceDB database and the table name; both are module-level
# names that later cells read as well.
db_path = "lancedb"
table_name = "diet_data"

# Load the chunks and embeddings into LanceDB; keeps the returned DataFrame
# and table handle for the cells below.
df, table = load_chunks_into_lancedb(chunks, embeddings, db_path, table_name)

Inserted 3595 chunks with embeddings into LanceDB table 'diet_data'.


In [20]:
df.head()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"(-0.050998229533433914, -0.05659276247024536, ..."
1,1,Introduction to Nutrition Science,"(-0.050998229533433914, -0.05659276247024536, ..."
2,2,This text is disseminated via the Open Educati...,"(-0.01911933906376362, 0.10461527109146118, 0...."
3,3,Instructors can adopt existing LibreTexts text...,"(-0.029113683849573135, 0.010369493626058102, ..."
4,4,"for the construction, customization, and disse...","(-0.01710761897265911, 0.02413620799779892, -0..."


In [28]:
df['embedding']

0       (-0.050998229533433914, -0.05659276247024536, ...
1       (-0.050998229533433914, -0.05659276247024536, ...
2       (-0.01911933906376362, 0.10461527109146118, 0....
3       (-0.029113683849573135, 0.010369493626058102, ...
4       (-0.01710761897265911, 0.02413620799779892, -0...
                              ...                        
3590    (-0.024376945570111275, -0.005427005235105753,...
3591    (-0.02542775683104992, 0.023010224103927612, -...
3592    (-0.08159744739532471, 0.005391538608819246, -...
3593    (-0.023193251341581345, 0.054620761424303055, ...
3594    (0.007158744148910046, 0.029436510056257248, -...
Name: embedding, Length: 3595, dtype: object

#### Create an index and perform semantic search retrieval

In [18]:
import time

In [31]:
# Connect to the LanceDB database at db_path (set in an earlier cell)
db = lancedb.connect(db_path)

# Open the existing table named by table_name
table = db.open_table(table_name)

# Build an IVF_PQ index on the 'embedding' column.
# NOTE(review): indexing presumably requires the table to already contain
# vectors — confirm the load cell has populated it before running this.
table.create_index(vector_column_name='embedding', index_type="IVF_PQ")


In [33]:
# NOTE(review): this cell repeats an earlier cell verbatim (same path, table
# name and call); it re-runs the load with identical arguments.
db_path = "lancedb"
table_name = "diet_data"

# Load the chunks and embeddings into LanceDB
df, table = load_chunks_into_lancedb(chunks, embeddings, db_path, table_name)

Inserted 3595 chunks with embeddings into LanceDB table 'diet_data'.


In [None]:
def load_chunks_into_lancedb(chunks, embeddings, db_path: str, table_name: str):
    """Store chunks and embeddings in LanceDB and read the table back.

    The table is (re)created with ``mode="overwrite"`` on each call so that
    repeated runs do not accumulate duplicate rows.

    Args:
        chunks: Objects exposing ``page_content`` (the chunk text).
        embeddings: One vector per chunk, in the same order as ``chunks``.
            All vectors must have the same length.
        db_path: Location handed to ``lancedb.connect``.
        table_name: Name of the table to (re)create.

    Returns:
        ``(df, db_table, table)`` in this order:
        ``df`` is the pandas DataFrame built from the inputs, ``db_table`` is
        the table content read back from LanceDB as a pandas DataFrame, and
        ``table`` is the LanceDB table handle.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    chunks = list(chunks)
    embeddings = list(embeddings)
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
            "they must be paired one-to-one."
        )

    # Connecting to LanceDB
    db = lancedb.connect(db_path)

    # Create a DataFrame from chunks and embeddings; vectors are stored as
    # plain lists of Python floats (works for NumPy arrays and sequences).
    data = {
        "chunk_id": list(range(len(chunks))),
        "text": [chunk.page_content for chunk in chunks],
        "embedding": [[float(value) for value in embedding] for embedding in embeddings],
    }
    df = pd.DataFrame(data)

    # Define the schema using pyarrow. The vector column is a fixed-size list
    # whose width comes from the first vector; with no rows the width is
    # unknown, so a variable-length list is used instead.
    if data["embedding"]:
        vector_type = pa.list_(pa.float32(), len(data["embedding"][0]))
    else:
        vector_type = pa.list_(pa.float32())
    schema = pa.schema([
        ("chunk_id", pa.int32()),
        ("text", pa.string()),
        ("embedding", vector_type),
    ])

    # Create the table and write the rows (previously nothing was inserted,
    # so the read-back below always came out empty).
    if data["chunk_id"]:
        arrow_table = pa.Table.from_pydict(data, schema=schema)
        table = db.create_table(table_name, data=arrow_table, mode="overwrite")
    else:
        table = db.create_table(table_name, schema=schema, mode="overwrite")

    print(db.table_names())
    db_table = table.to_pandas()

    print(f"Inserted {len(df)} chunks with embeddings into LanceDB table '{table_name}'.")

    return df, db_table, table

In [None]:
db_path = "lancedb"
table_name = "diet_data"

# Load the chunks and embeddings into LanceDB.
# load_chunks_into_lancedb returns (df, db_table, table): the input DataFrame,
# the table contents read back as a DataFrame, and the LanceDB table handle.
# Unpack in that same order so `table` is the handle and `db_table` the
# read-back DataFrame.
df, db_table, table = load_chunks_into_lancedb(chunks, embeddings, db_path, table_name)

In [None]:
# Save the DataFrame df to 'diet-data.csv' without the index column; a
# backslash is used as the escape character.
df.to_csv('diet-data.csv', index=False, escapechar='\\')