In [2]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [3]:
import os
import json
import pandas as pd
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from IPython.display import Markdown

  from tqdm.autonotebook import tqdm


## 01 Get Data

In [3]:
# configure
# Input location: file name of the recipe dump and the directory prefix it
# lives under (the prefix keeps its trailing slash because the two are
# concatenated directly when the path is built).
filename = "all_recipes_one.json"
base_dir = "data/"

In [4]:
file_path = f'{base_dir}{filename}'

# Read the recipe dump. The encoding is given explicitly so the result does not
# depend on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Flatten the JSON records into a table (nested keys become separate columns).
data = pd.json_normalize(json_data)

# Convert lists to strings; values that are not lists are left untouched.
data['ingredients'] = data['ingredients'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
data['instructions'] = data['instructions'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Define desired column order (excluding ingredients and instructions)
desired_order = [col for col in data.columns if col not in ["ingredients", "instructions"]]

# Reorder columns so the two long text columns come last
data = data[desired_order + ["ingredients", "instructions"]]

# Rename every column positionally.
# NOTE(review): this assumes the normalized columns arrive in exactly this
# order — confirm against the JSON schema; a different order would silently
# mislabel columns.
header = [
    "link", "title", "prep_time", "cook_time", "difficulty", "serves",
    "description", "diet_type", "kcal", "fat", "saturates", "carbs",
    "sugars", "fibre", "protein", "salt", "ingredients", "instructions",
]
data.columns = header

data.head()

Unnamed: 0,link,title,prep_time,cook_time,difficulty,serves,description,diet_type,kcal,fat,saturates,carbs,sugars,fibre,protein,salt,ingredients,instructions
0,https://www.bbcgoodfood.com/recipes/bloody-mar...,Bloody mary mussels,30 mins,20 mins,Easy,Serves 4,The mussels get a little kick from the vodka i...,Low fat,228,6g,1g,6g,5g,2g,26g,3.2g,"1 tbspolive oil, 2celery sticks,finely chopped...","Heat the oil in a large, deepsaucepanover a me..."
1,https://www.bbcgoodfood.com/recipes/spelt-panc...,Spelt pancakes with rhubarb,15 mins,50 mins,Easy,Serves 4,Make some spelt pancakes and finish with a vib...,Vegetarian,826,61g,31g,48g,18g,4g,18g,0.8g,"400grhubarb,cut into 2-3cm pieces, 60ghoney,pl...",Tip the pancake ingredients into ablenderand b...
2,https://www.bbcgoodfood.com/recipes/purple-spr...,Purple sprouting broccoli with vinaigrette,10 mins,5 mins,Easy,Serves 4 as a side,"Make the most of purple sprouting broccoli, wi...",Gluten-free,126,9g,1g,5g,4g,3g,4g,0.7g,350gpurple sprouting broccoli(or use other bro...,Bring a large pan of salted water to the boil ...
3,https://www.bbcgoodfood.com/recipes/lime-merin...,Lime meringue pie,30 mins,1 hr and 30 mins,More effort,Serves 8 - 10,Rustle up an indulgent meringue-topped pie for...,Vegetarian,432,22g,131g,54g,35g,1g,5g,0.5g,"2 tbspcornflour, 125ggolden caster sugar, 6lim...","For the pastry, put the flour, sugar and a pin..."
4,https://www.bbcgoodfood.com/recipes/rhubarb-fool,Easy rhubarb fool,25 mins,45 mins,Easy,Serves 4 (with extra biscuits),Celebrate the gorgeous colour of forced rhubar...,Freezable (unbaked dough only),897,55g,32g,88g,68g,4g,10g,0.2g,"400gforced rhubarb,cut into 1cm chunks, 150ggo...","Tip the rhubarb, sugar, orange zest and juice ..."


In [5]:
# Show (rows, columns) of the loaded recipe table.
data.shape

(1118, 18)

In [6]:
# Count missing values per column.
data.isna().sum()

link             0
title            0
prep_time        0
cook_time        0
difficulty       0
serves           0
description      0
diet_type        0
kcal             6
fat             11
saturates       11
carbs            6
sugars          11
fibre           11
protein         11
salt            12
ingredients      0
instructions     0
dtype: int64

## 02 Drop Rows with Missing Values

In [7]:
# Keep only recipes with no missing field, then renumber the rows 0..n-1.
data = data.dropna(axis=0, how="any").reset_index(drop=True)

In [8]:
# Show (rows, columns) again, after the rows with missing values were dropped.
data.shape

(1106, 18)

In [9]:
# Re-count missing values per column to confirm none remain.
data.isna().sum()

link            0
title           0
prep_time       0
cook_time       0
difficulty      0
serves          0
description     0
diet_type       0
kcal            0
fat             0
saturates       0
carbs           0
sugars          0
fibre           0
protein         0
salt            0
ingredients     0
instructions    0
dtype: int64

## 03 Configure Pinecone

In [4]:
# Pinecone API key from the environment; None when the variable is not set.
api_key = os.getenv("PINECONE_API_KEY")

In [5]:
# Create the Pinecone client using the API key read from the environment.
pc = Pinecone(api_key=api_key)

## 04 Create Pinecone index

In [12]:
index_name = 'recipes-index'

# Start from an empty index: if one with this name already exists it is deleted.
# WARNING: this discards every vector previously stored under `index_name`.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# The index dimension must equal the length of the vectors that will be
# upserted, i.e. the output size of the embedding model used in this notebook.
pc.create_index(
    index_name,
    dimension=1536,  # vector length produced by text-embedding-3-small
    metric='cosine',
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Wait for the index to be initialized, but give up after a bounded time so a
# failed provisioning cannot hang the notebook forever.
index_ready_timeout_s = 300
ready_deadline = time.monotonic() + index_ready_timeout_s
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {index_ready_timeout_s} seconds"
        )
    time.sleep(1)

index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## 05 Build the Text to Embed for Pinecone

In [13]:
def _as_grams(series):
    """Render a nutrient column as '<amount> g'.

    The raw values already carry their unit (e.g. '6g', '3.2g'), so any
    trailing 'g' is stripped before ' g' is appended; otherwise the text would
    read 'Fat: 6g g'. Values without a unit are handled the same way.
    """
    amounts = series.astype(str).str.replace(r'\s*g\s*$', '', regex=True)
    return amounts + ' g'


# One descriptive string per recipe; this is the text that gets embedded.
texts = (
    'Title: ' + data['title'] + ', ' +
    'Description: ' + data['description'] + ', ' +
    'Ingredients: ' + data['ingredients'] + ', ' +
    'Prep time: ' + data['prep_time'] + ', ' +
    'Cook time: ' + data['cook_time'] + ', ' +
    'Calories: ' + data['kcal'].astype(str) + ' kcal, ' +
    'Fat: ' + _as_grams(data['fat']) + ', ' +
    'Saturates: ' + _as_grams(data['saturates']) + ', ' +
    'Carbs: ' + _as_grams(data['carbs']) + ', ' +
    'Sugars: ' + _as_grams(data['sugars']) + ', ' +
    'Fibre: ' + _as_grams(data['fibre']) + ', ' +
    'Protein: ' + _as_grams(data['protein']) + ', ' +
    'Salt: ' + _as_grams(data['salt'])
)

# Plain Python list, in the same row order as `data`.
texts_list = texts.tolist()


## 06 OpenAI API Key

In [6]:
# Read the OpenAI API key from the environment (None if the variable is unset).
# NOTE(review): `openai_api_key` is not passed to OpenAIEmbeddings in the
# visible cells — confirm whether this variable is still needed.
openai_api_key = os.getenv('OPENAI_API_KEY')

## 07 Embedding Model

In [7]:
# Name of the OpenAI embedding model used for every recipe text.
embed_model = "text-embedding-3-small"

# LangChain embeddings client bound to that model.
embeddings_model = OpenAIEmbeddings(
    model=embed_model,
)

## 08 Embedding data

In [15]:

# Embed all recipe texts in batched requests instead of one API call per text.
# The result is a list of vectors in the same order as `texts_list`.
# `chunk_size` caps how many texts are sent per request.
embeddings = embeddings_model.embed_documents(texts_list, chunk_size=100)


In [17]:
# Report how many embeddings were produced (one per recipe text is expected).
# The former `embeddings[:5]` line was removed: it was not the last expression
# of the cell, so it displayed nothing.
print(len(embeddings))

1106


## 09 Upsert the Data to Pinecone

In [18]:
# Upsert the recipes to Pinecone in batches rather than one request per vector.
# Number of vectors sent per request; kept small because every vector carries
# the full recipe text as metadata.
upsert_batch_size = 50

# Embeddings are matched to recipes by position, so the two must line up.
if len(embeddings) != len(data):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(data)} recipes; "
        "re-run the embedding cell after changing the data."
    )

# One plain dict per row, in row order (independent of the index labels).
records = data.to_dict("records")

vectors = []
for i, (embedding_vector, row) in enumerate(zip(embeddings, records)):
    metadata = {
        "id": str(i),
        "title": row['title'],
        "description": row['description'],
        "prep_time": row['prep_time'],
        "cook_time": row['cook_time'],
        "difficulty": row['difficulty'],
        "serves": row['serves'],
        "diet_type": row['diet_type'],
        "ingredients": row['ingredients'],
        "calories": row['kcal'],
        "fat": row['fat'],
        "saturates": row['saturates'],
        "carbs": row['carbs'],
        "sugars": row['sugars'],
        "fibre": row['fibre'],
        "protein": row['protein'],
        "salt": row['salt'],
        "instructions": row['instructions'],
    }
    # (id, values, metadata) tuple; the id is the recipe's row position.
    vectors.append((str(i), embedding_vector, metadata))

for start in range(0, len(vectors), upsert_batch_size):
    index.upsert(vectors=vectors[start:start + upsert_batch_size])

print(f"Uploaded {len(vectors)} recipes to Pinecone.")

Uploaded 1106 recipes to Pinecone.
