In [1]:
# Enable the autoreload extension in mode 2 so edited modules are re-imported
# before each cell runs.
%load_ext autoreload
%autoreload 2
# Load the dotenv extension and run it; presumably this reads a local .env file
# that supplies the API keys fetched from the environment further down — confirm.
%load_ext dotenv
%dotenv

In [2]:
import json
import os
import time

import openai
import pandas as pd
from IPython.display import Markdown
from openai import OpenAI
from pinecone import Pinecone
# from langchain.vectorstores import Pinecone
from pinecone import ServerlessSpec

  from tqdm.autonotebook import tqdm


## 01 Get Data

In [3]:
# Where the recipe dump lives: directory prefix (trailing slash included) and file name.
base_dir, filename = "data/", "all_recipes_one.json"

In [10]:
# Build the path from the configured directory and file name.
file_path = os.path.join(base_dir, filename)

# The recipes contain non-ASCII characters (e.g. accented ingredient names), so
# decode the file as UTF-8 explicitly instead of relying on the platform default.
with open(file_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Flatten the JSON records into a table, one row per recipe.
data = pd.json_normalize(json_data)

# Convert lists to strings: ingredients are comma-separated, instruction steps
# are joined into one paragraph. Values that are not lists are left untouched.
data['ingredients'] = data['ingredients'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
data['instructions'] = data['instructions'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Define desired column order (excluding ingredients and instructions)
desired_order = [col for col in data.columns if col not in ["ingredients", "instructions"]]

# Reorder columns so that ingredients and instructions come last
data = data[desired_order + ["ingredients", "instructions"]]

header = [
    "link", "title", "prep_time", "cook_time", "difficulty", "serves",
    "description", "diet_type", "kcal", "fat", "saturates", "carbs",
    "sugars", "fibre", "protein", "salt", "ingredients", "instructions",
]

# The rename below is purely positional, so make a column-count mismatch fail
# with a message that names both sides instead of silently mislabelling data.
# NOTE(review): this still assumes the normalized columns arrive in the same
# order as `header` — confirm against the input file.
if len(data.columns) != len(header):
    raise ValueError(
        f"Expected {len(header)} columns after normalization, "
        f"got {len(data.columns)}: {list(data.columns)}"
    )
data.columns = header

data.head()

Unnamed: 0,link,title,prep_time,cook_time,difficulty,serves,description,diet_type,kcal,fat,saturates,carbs,sugars,fibre,protein,salt,ingredients,instructions
0,https://www.bbcgoodfood.com/recipes/bloody-mar...,Bloody mary mussels,30 mins,20 mins,Easy,Serves 4,The mussels get a little kick from the vodka i...,Low fat,228,6g,1g,6g,5g,2g,26g,3.2g,"1 tbspolive oil, 2celery sticks,finely chopped...","Heat the oil in a large, deepsaucepanover a me..."
1,https://www.bbcgoodfood.com/recipes/spelt-panc...,Spelt pancakes with rhubarb,15 mins,50 mins,Easy,Serves 4,Make some spelt pancakes and finish with a vib...,Vegetarian,826,61g,31g,48g,18g,4g,18g,0.8g,"400grhubarb,cut into 2-3cm pieces, 60ghoney,pl...",Tip the pancake ingredients into ablenderand b...
2,https://www.bbcgoodfood.com/recipes/purple-spr...,Purple sprouting broccoli with vinaigrette,10 mins,5 mins,Easy,Serves 4 as a side,"Make the most of purple sprouting broccoli, wi...",Gluten-free,126,9g,1g,5g,4g,3g,4g,0.7g,350gpurple sprouting broccoli(or use other bro...,Bring a large pan of salted water to the boil ...
3,https://www.bbcgoodfood.com/recipes/lime-merin...,Lime meringue pie,30 mins,1 hr and 30 mins,More effort,Serves 8 - 10,Rustle up an indulgent meringue-topped pie for...,Vegetarian,432,22g,131g,54g,35g,1g,5g,0.5g,"2 tbspcornflour, 125ggolden caster sugar, 6lim...","For the pastry, put the flour, sugar and a pin..."
4,https://www.bbcgoodfood.com/recipes/rhubarb-fool,Easy rhubarb fool,25 mins,45 mins,Easy,Serves 4 (with extra biscuits),Celebrate the gorgeous colour of forced rhubar...,Freezable (unbaked dough only),897,55g,32g,88g,68g,4g,10g,0.2g,"400gforced rhubarb,cut into 1cm chunks, 150ggo...","Tip the rhubarb, sugar, orange zest and juice ..."


## 02 Configure Pinecone

In [6]:
# Pinecone credential taken from the environment; None when PINECONE_API_KEY is unset.
api_key = os.getenv("PINECONE_API_KEY")

In [7]:
# Configure the Pinecone client with the key held in `api_key`; the resulting
# handle `pc` is what the notebook uses to list, delete, create and open indexes.
pc = Pinecone(api_key=api_key)

## 03 Create Pinecone index

In [9]:
index_name = 'recipes-index'

# Vector size of the index. It must match the embedding model used for the
# recipes and queries (text-embedding-3-small, which returns 1536 dimensions
# by default).
EMBEDDING_DIMENSION = 1536

# Upper bound on how long to wait for the new index to become ready.
INDEX_READY_TIMEOUT_SECONDS = 120

# Start from a clean slate: drop an index with the same name if one exists.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a fresh serverless index that ranks matches by cosine similarity.
pc.create_index(
    index_name,
    dimension=EMBEDDING_DIMENSION,
    metric='cosine',
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Wait for the index to be initialized, polling once per second, but give up
# after the timeout so a failed creation cannot hang the notebook forever.
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_SECONDS
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Index '{index_name}' was not ready after "
            f"{INDEX_READY_TIMEOUT_SECONDS} seconds."
        )
    time.sleep(1)

# Open a handle to the index and show its (initially empty) statistics.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## 05 Build the Text to Embed

In [10]:
# Build one text per recipe to embed:
#   "<title> <description> <ingredients> <prep_time> <cook_time>
#    <kcal> kcal <fat> fat ... <salt> salt"
FREE_TEXT_COLUMNS = ['title', 'description', 'ingredients', 'prep_time', 'cook_time']
NUTRITION_COLUMNS = ['kcal', 'fat', 'saturates', 'carbs', 'sugars', 'fibre', 'protein', 'salt']

# String concatenation propagates NaN, so a single missing free-text value would
# turn the whole text for that recipe into NaN. Treat missing values as empty
# strings instead; rows without missing values produce exactly the same text.
texts = data[FREE_TEXT_COLUMNS[0]].fillna('').astype(str)
for column in FREE_TEXT_COLUMNS[1:]:
    texts = texts + ' ' + data[column].fillna('').astype(str)

# Each nutrition value is followed by its own column name as a label.
for column in NUTRITION_COLUMNS:
    texts = texts + ' ' + data[column].astype(str) + ' ' + column

texts_list = texts.tolist()


## 06 OpenAI API Key

In [27]:
# Read the OpenAI key from the environment and fail fast when it is missing.
# Falling back to a placeholder string would only postpone the failure to the
# first API call, where it surfaces as a less obvious authentication error.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to the .env file."
    )
openai.api_key = openai_api_key

## 07 Create OpenAI Embeddings for the Data

In [15]:
embed_model = "text-embedding-3-small"

# Number of recipe texts sent per embeddings request. The endpoint accepts a
# list of inputs, so batching replaces one HTTP round trip per recipe.
EMBED_BATCH_SIZE = 100

# `embeddings[i]` is the vector for `texts_list[i]`.
embeddings = []
for batch_start in range(0, len(texts_list), EMBED_BATCH_SIZE):
    batch = texts_list[batch_start:batch_start + EMBED_BATCH_SIZE]
    response = openai.embeddings.create(input=batch, model=embed_model)
    # Each returned item carries the position of its input within the batch;
    # order by it explicitly so vectors stay aligned with their texts.
    for item in sorted(response.data, key=lambda entry: entry.index):
        embeddings.append(item.embedding)


In [42]:
# Peek at the first five embedding vectors. Each one is a long list of floats,
# so the rendered output of this cell is large.
embeddings[:5]

[[0.005961424671113491,
  0.053051527589559555,
  -0.006385195534676313,
  -0.005566286854445934,
  -0.05016530305147171,
  -0.02133743278682232,
  0.012575685046613216,
  0.0009928549407050014,
  0.02089075557887554,
  0.00011694857676047832,
  0.046156659722328186,
  -0.025861473754048347,
  0.026296697556972504,
  0.01412187609821558,
  0.019012421369552612,
  0.04601922258734703,
  -0.015278656035661697,
  0.019298752769827843,
  -0.06601662188768387,
  -0.00358773535117507,
  0.03857459872961044,
  -0.030167901888489723,
  0.0025411932729184628,
  0.020650237798690796,
  -0.004160398617386818,
  0.031702637672424316,
  0.015416095033288002,
  -0.0045440830290317535,
  -0.016389623284339905,
  0.017260070890188217,
  0.00987844169139862,
  -0.01242679264396429,
  0.028587350621819496,
  0.014156236313283443,
  -0.01039383839815855,
  -0.002357941120862961,
  0.017454776912927628,
  -0.007352996617555618,
  0.006906318943947554,
  -0.02975558303296566,
  -0.04111722484230995,
  -0.0

## 08 Upsert the Data to Pinecone

In [16]:
# Upsert the recipe vectors and their metadata to Pinecone.

# Number of vectors sent per upsert request; batching avoids one network round
# trip per recipe.
UPSERT_BATCH_SIZE = 100

# Columns copied verbatim into the top level of the metadata.
METADATA_COLUMNS = (
    'title', 'description', 'prep_time', 'cook_time',
    'difficulty', 'serves', 'diet_type', 'ingredients',
)
# Columns grouped under the "nutrition" key of the metadata.
NUTRITION_METADATA_COLUMNS = (
    'kcal', 'fat', 'saturates', 'carbs', 'sugars', 'fibre', 'protein', 'salt',
)


def build_recipe_metadata(position: int) -> dict:
    """Return the metadata stored alongside the vector of the recipe at `position`.

    `position` is the zero-based row position in `data` (not an index label), so
    the lookup stays correct even if `data` does not have a default RangeIndex.
    """
    metadata = {"id": str(position)}
    for column in METADATA_COLUMNS:
        metadata[column] = data[column].iloc[position]
    # NOTE(review): "nutrition" is a nested mapping — confirm that the index
    # accepts nested metadata values.
    metadata["nutrition"] = {
        column: data[column].iloc[position] for column in NUTRITION_METADATA_COLUMNS
    }
    metadata["instructions"] = data["instructions"].iloc[position]
    return metadata


# Embeddings are matched to recipes by position; more vectors than rows means
# the two are out of sync (e.g. stale kernel state), so stop before uploading.
if len(embeddings) > len(data):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for only {len(data)} recipes."
    )

# (id, vector, metadata) tuples; the id is the row position as a string.
records = [
    (str(position), embedding_vector, build_recipe_metadata(position))
    for position, embedding_vector in enumerate(embeddings)
]

for batch_start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert(vectors=records[batch_start:batch_start + UPSERT_BATCH_SIZE])

print(f"Uploaded {len(embeddings)} recipes to Pinecone.")

Uploaded 1118 recipes to Pinecone.


## 09 Query Pinecone with a new recipe query

In [44]:
query = "chicken and spaghetti"

# Embed the query with the same model that embedded the recipes.
res = openai.embeddings.create(model=embed_model, input=[query])
xq = res.data[0].embedding

# Ask Pinecone for the five closest recipes, metadata included.
query_res = index.query(include_metadata=True, top_k=5, vector=xq)

# Keep only the metadata payload (the recipe fields) of every match.
contexts = list(map(lambda hit: hit['metadata'], query_res['matches']))

In [45]:
# Render each retrieved recipe as a labelled three-line snippet.
context_texts = [
    "\n".join((
        f"Title: {c['title']}",
        f"Description: {c['description']}",
        f"Ingredients: {c['ingredients']}",
    ))
    for c in contexts
]

# Snippets are separated by "---" rules; the user's query comes last, after a
# longer "-----" rule.
augmented_query = "\n\n-----\n\n".join(["\n\n---\n\n".join(context_texts), query])
augmented_query

"Title: Creamy Tuscan chicken\nDescription: Enjoy our quick midweek Tuscan chicken dinner, which makes the most of the colour and flavour of spinach. Serve with spring greens or orzo\nIngredients: 2 tbspplain flour, 1 tbspsmoked paprika, 2 tsporegano, 4skin-on chicken breasts, 80gsundried tomatoes,chopped, plus 1 tbsp oil from the jar (or use olive oil), 4garlic cloves,finely grated or crushed, 125mlwhite wine, 250mlchicken stock, 200gspinach, 150mldouble cream, 75ggrated parmesan, cooked rice,to serve, small handful of parsley,chopped\n\n---\n\nTitle: Creamy spinach, basil & chicken pasta\nDescription: Roast a chicken, then transform it into this speedy weeknight pasta dish with the help of spinach, basil, garlic and crème fraîche\nIngredients: 300glong pasta(we used mafaldine), 2 tbspleftover chicken fat(from the roast chicken recipe, below), 1onion, finely chopped, 1red chilli, finely chopped, or 1 tsp chilli flakes, 4leftover roasted garlic cloves, mashed, 2 tbsptomato purée, 200ml

In [46]:
# System prompt: answer only from the supplied information, otherwise reply
# "I don't know". The text starts and ends with a newline.
primer = (
    "\n"
    "You are a highly intelligent system that answers user questions based on "
    "the information provided. If the answer cannot be found in the context "
    "provided, say \"I don't know\".\n"
)

In [47]:
# Chat client. The key is passed explicitly even though, per the original note,
# this is the client's default and could be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Retrieval-augmented request: system primer first, then the user message that
# carries the retrieved recipes followed by the query.
res = client.chat.completions.create(
    messages=[
        dict(role="system", content=primer),
        dict(role="user", content=augmented_query),
    ],
    model="gpt-4o-mini",
)


In [48]:
# Show the raw ChatCompletion object (id, choices, model, token usage).
res

ChatCompletion(id='chatcmpl-ADlAo0URWwxAIKP6WALX5ffO2ADI7', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I don't know.", role='assistant', function_call=None, tool_calls=None, refusal=None))], created=1727842962, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_f85bea6784', usage=CompletionUsage(completion_tokens=4, prompt_tokens=831, total_tokens=835, prompt_tokens_details={'cached_tokens': 0}, completion_tokens_details={'reasoning_tokens': 0}))

In [49]:
# Take the reply text of the first choice of the augmented completion and
# render it as Markdown.
response_content = res.choices[0].message.content
display(Markdown(response_content))

I don't know.

In [50]:
# Baseline for comparison: same primer and model, but the user message is the
# bare query without any retrieved recipes.
res_non_augmented = client.chat.completions.create(
    messages=[
        dict(role="system", content=primer),
        dict(role="user", content=query),
    ],
    model="gpt-4o-mini",
)



In [51]:
# Render the baseline (non-augmented) answer. This must read from
# `res_non_augmented`; reading `res` would show the augmented answer again and
# make the comparison meaningless.
response_content = res_non_augmented.choices[0].message.content
display(Markdown(response_content))

I don't know.