In [1]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [2]:
from llama_index.core import Document
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
    MarkdownNodeParser,
)
from llama_index.embeddings.openai import OpenAIEmbedding

In [3]:
# configure
base_dir = "data/"
filename = 'all_recipes_one.json'

In [4]:
def print_nodes(nodes):
    for ix, node in enumerate(nodes):
        print(f"\n\n>>> Node {ix} length={len(node.text)}\n")
        print(f">>>{ix} {node.id_}")
        print("Metadata", node.metadata)
        print("Text", node.text[:200])
        print("\n\n")


## Load Document

In [5]:
import json
with open(f'{base_dir}{filename}', 'r', encoding='utf-8') as file:
    document = Document(
        text = file.read(),
        metadata = {"filename": filename},
    )
print(len(document.text))

2495526


In [6]:
cutoff = 2100
print(document.text[:cutoff])

[
  {
    "general_info": {
      "link": "https://www.bbcgoodfood.com/recipes/bloody-mary-mussels",
      "title": "Bloody mary mussels",
      "prep_time": "30 mins",
      "cook_time": "20 mins",
      "difficulty": "Easy",
      "serves": "Serves 4",
      "description": "The mussels get a little kick from the vodka in this bloody mary mussels recipe. Serve with garlic bread or fries to mop up the sauce",
      "diet_type": "Low fat"
    },
    "nutrition_info": {
      "kcal": "228",
      "fat": "6g",
      "saturates": "1g",
      "carbs": "6g",
      "sugars": "5g",
      "fibre": "2g",
      "protein": "26g",
      "salt": "3.2g"
    },
    "ingredients": [
      "1 tbspolive oil",
      "2celery sticks,finely chopped",
      "1red chilli,finely chopped",
      "1onion,finely chopped",
      "1lemon,zest peeled into strips, juiced",
      "75mlvodka",
      "250mltomato juice",
      "5gdashi powder(see tip, below)",
      "½ tspcelery salt",
      "2 tspWorcestershire sauce",

## Sentence Splitter

In [7]:
chunk_size = 2050
chunk_overlap = 50

In [8]:
from llama_index.core.node_parser import JSONNodeParser

parser = JSONNodeParser()

nodes = parser.get_nodes_from_documents([document], show_progress=False)
print(len(nodes))

1118


In [9]:
print_nodes(nodes[:2])



>>> Node 0 length=2032

>>>0 f2ee1305-fe90-47cd-8f55-b1de5401cab1
Metadata {'filename': 'all_recipes_one.json'}
Text general_info link https://www.bbcgoodfood.com/recipes/bloody-mary-mussels
general_info title Bloody mary mussels
general_info prep_time 30 mins
general_info cook_time 20 mins
general_info difficulty E





>>> Node 1 length=2801

>>>1 11c389a1-98b6-4bf9-9a01-08acd3cab484
Metadata {'filename': 'all_recipes_one.json'}
Text general_info link https://www.bbcgoodfood.com/recipes/spelt-pancakes-with-rhubarb
general_info title Spelt pancakes with rhubarb
general_info prep_time 15 mins
general_info cook_time 50 mins
general_i





In [10]:
print(type(nodes))

<class 'list'>


## Embeddings

In [18]:
import re
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

In [20]:
load_dotenv()

api_key = os.getenv("PINECONE_API_KEY")
openai_key = os.getenv("OPENAI_API_KEY")

pc = Pinecone(api_key=api_key)

## Creating a Pinecone Index

In [33]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from IPython.display import Markdown, display
from llama_index.core import StorageContext

In [None]:
pc.delete_index("recipes")

In [None]:
# dimensions are for text-embedding-ada-002

pc.create_index(
    name="recipes",
    dimension=1536,
    metric="euclidean",
    spec=ServerlessSpec(cloud="aws", region="us-west-2"),
)

In [None]:
pinecone_index = pc.Index("recipes")

In [30]:
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    nodes, storage_context=storage_context
)

AttributeError: 'TextNode' object has no attribute 'get_doc_id'

In [None]:
# create client and a new collection
chroma_client = chromadb.PersistentClient(path="./data/chroma")
chroma_collection = chroma_client.create_collection("recipes")

# define embedding function
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(nodes, storage_context=storage_context)

In [None]:
vector_store_info = VectorStoreInfo(
    content_info="How to play the board game Sleeping Gods",
    metadata_info=[
        MetadataInfo(
            name="Header_2",
            type="str",
            description=(
                "Board game manual topic, one of [Basics, Challenges, Combat, Icons, Other Rules, Overview, Player Turn, Setup, Spending Command, Turn Overview]"
            ),
        ),
    ],
)
retriever = VectorIndexAutoRetriever(
    index, vector_store_info=vector_store_info
)

## Pass data to pandas Dataframe

In [None]:
import pandas as pd

In [None]:
file_path = f'{base_dir}{filename}'

with open(file_path, 'r') as f:
    json_data = json.load(f)

recipes_df = pd.json_normalize(json_data)

# Define desired column order (excluding ingredients and instructions)
desired_order = [col for col in recipes_df.columns if col not in ["ingredients", "instructions"]]

# Reorder columns
recipes_df = recipes_df[desired_order + ["ingredients", "instructions"]]

df.head(10)