# Document Data, Chunks and Embeddings

In [1]:
# Setup cell: imports, module search path, and the OpenAI API key.
import os
import openai
import sys
# Add the relative path '../..' to the module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
# Input locations, given relative to the notebook's working directory.
# Single markdown file loaded in the "Document Loaders" section.
markdown_path_test = "data/recipes_batch_1_test.md"
# Base data directory (presumably holds the full set of markdown files; TODO confirm).
base_dir = "data/md_files"

## Document Loaders

In [3]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

# Parse the sample markdown file into LangChain Document objects.
loader = UnstructuredMarkdownLoader(markdown_path_test)
data = loader.load()

# Sanity checks: fail with a readable message when nothing was loaded (instead of
# an IndexError on data[0]) and verify every loaded item, not only the first one.
assert data, f"No documents were loaded from {markdown_path_test!r}"
assert all(isinstance(doc, Document) for doc in data)

In [4]:
# Preview the first 250 characters of the first loaded document's text.
print(data[0].page_content[:250])

Medal cookies

Link: https://www.bbcgoodfood.com/recipes/medal-cookies

Description: Make these round vanilla biscuits, ice in bright colours, thread on a ribbon and give out as edible prizes

Prep Time: 35 mins

Cook Time: 15 mins

Difficulty: Easy



In [5]:
# Display the metadata dict attached to the first loaded document.
data[0].metadata

{'source': 'data/recipes_batch_1_test.md'}

## Document Splitting with LangChain

In [6]:
# from langchain.text_splitter import MarkdownHeaderTextSplitter

In [7]:
# markdown_text = ' '.join([d.page_content for d in docs])
# print(f"Length of concatenated markdown text: {len(markdown_text)}")

In [8]:
# len(markdown_text)

In [9]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter configuration: break the text on blank lines ("\n\n"); chunk_size and
# chunk_overlap are measured with len, i.e. in characters.
text_splitter = CharacterTextSplitter(
    chunk_size=1100,
    chunk_overlap=150,
    separator="\n\n",
    length_function=len,
)

In [10]:
# Split the loaded documents with text_splitter and report how many chunks resulted.
chunks = text_splitter.split_documents(data)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 23


In [11]:
# Inspect the first five chunks.
chunks[:5]

[Document(metadata={'source': 'data/recipes_batch_1_test.md'}, page_content='Medal cookies\n\nLink: https://www.bbcgoodfood.com/recipes/medal-cookies\n\nDescription: Make these round vanilla biscuits, ice in bright colours, thread on a ribbon and give out as edible prizes\n\nPrep Time: 35 mins\n\nCook Time: 15 mins\n\nDifficulty: Easy\n\nServes: Makes 15 cookies\n\nDiet Type: None\n\nNutrition Information\n\nkcal: 418\n\nfat: 8g\n\nsaturates: 5g\n\ncarbs: 82g\n\nsugars: 66g\n\nfibre: 1g\n\nprotein: 3g\n\nsalt: 0.4g\n\nIngredients\n\n140gbutter\n\n100glight soft brown sugar\n\n3 tbspgolden syrup\n\n½ tspvanilla extract\n\n350gplain flour, plus extra for dusting\n\n1 tspbicarbonate of soda\n\n1large egg\n\n1kgpack ready-to-roll icing\n\nicing sugar\n\nfood colouringpaste, edible glitter and icing pens (optional)\n\n15long colourful ribbons\n\nTo decorate\n\n1kgpack ready-to-roll icing\n\nicing sugar\n\nfood colouringpaste, edible glitter and icing pens (optional)\n\n15long colourful ribb

## Embeddings

In [12]:
from langchain_openai import OpenAIEmbeddings

# Embedding client handed to the vector store below; no arguments are passed,
# so the library defaults apply.
embeddings_model = OpenAIEmbeddings()

In [13]:
# chunks_content = [chunk.page_content for chunk in chunks]

In [14]:
# embeddings = embeddings_model.embed_documents(chunks_content)

In [15]:
import sqlite3

# Import Chroma from langchain_community, the same package the document loader in
# this notebook comes from; the `langchain.vectorstores` path is a deprecated alias.
# Chroma requires sqlite3 >= 3.35.0 at runtime; `sqlite3.sqlite_version` shows the
# SQLite version this kernel is using.
from langchain_community.vectorstores import Chroma

In [16]:
# Directory passed to Chroma as its on-disk persistence location.
persist_directory = 'data/chroma/'

In [17]:
import shutil

# Remove old database files, if any, from the directory Chroma persists into.
# The earlier shell command deleted ./docs/chroma, which is not persist_directory,
# so stale data from previous runs was never cleared.
# ignore_errors=True keeps this a no-op when the directory does not exist yet.
shutil.rmtree(persist_directory, ignore_errors=True)

In [18]:
# Build the Chroma vector store from the chunks, using embeddings_model for the
# embeddings and persist_directory as the on-disk location.
# NOTE: in the recorded run this cell failed with Chroma's "unsupported version of
# sqlite3" RuntimeError (Chroma requires sqlite3 >= 3.35.0).
vectordb = Chroma.from_documents(
    chunks,
    embedding=embeddings_model,
    persist_directory=persist_directory,
)

RuntimeError: [91mYour system has an unsupported version of sqlite3. Chroma                     requires sqlite3 >= 3.35.0.[0m
[94mPlease visit                     https://docs.trychroma.com/troubleshooting#sqlite to learn how                     to upgrade.[0m

In [None]:
# Ask the vector store to write its data to disk.
# NOTE(review): newer Chroma/LangChain releases persist automatically and
# deprecate this call — confirm against the installed version.
vectordb.persist()

In [None]:
# Report how many entries the underlying collection holds.
# `_collection` is a private attribute, so this may break across library versions.
print(vectordb._collection.count())

## Similarity Search

In [None]:
# Natural-language query for the similarity search over the recipe chunks.
# The original text ("I need a with vanilla") was missing its noun.
question = "I need a recipe with vanilla"

In [None]:
# Retrieve the three stored chunks most similar to the question.
docs = vectordb.similarity_search(query=question, k=3)

In [None]:
# Number of documents returned by the search.
len(docs)

In [None]:
# Text of the first returned result.
docs[0].page_content