## Indexing and Retrieving Text with Pinecone (External_knowledge_Recipe.csv)

### Option 1: Gemini `GoogleGenerativeAIEmbeddings` as the embedding model (free Gemini model: `models/embedding-001`) - SUCCESS

In [None]:
# Install (quietly, upgrading if already present) LangChain and the Cohere,
# Pinecone and Google Generative AI integration packages.
!pip install -qU langchain langchain-community langchain-cohere langchain-pinecone langchain-google-genai google-generativeai

In [None]:
# Upgrade langchain-pinecone on its own.
# NOTE(review): the previous install cell already passes -U for this package,
# so this cell looks redundant - confirm before removing.
!pip install --upgrade langchain-pinecone

### PHASE-1

In [None]:
import pandas as pd
import numpy as np
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as LangChainPinecone
from pinecone import Pinecone, ServerlessSpec
import os
from typing import List, Dict
import json
import getpass
import os

In [None]:
# Pinecone AUTH: read the key from an interactive hidden prompt so it is
# never written into the notebook itself.
api_key_pinecone = getpass.getpass(prompt="Input API KEY PINECONE CLOUD")
print("api_key_pinecone telah diinput")

Load CSV into Pandas

In [None]:
# Read the recipe CSV from the Colab runtime (uploaded manually or via Drive)
# and discard every row that has at least one missing value.
df_recipe = pd.read_csv("/content/External_knowledge_Recipe.csv").dropna()
df_recipe.head()

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV load cell reads from /content/ directly, not from the
# mounted drive - confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Prepare Data for Embedding

In [None]:
# Tag every row with its source sheet; row_to_text() picks its template
# from this "__sheet__" value.
df_recipe = df_recipe.assign(__sheet__="Recipe")
df_recipe.head()

Convert Rows to Text Documents for Embedding

In [None]:
def row_to_text(row):
    """Render one row as a sentence-style text document for embedding.

    The template is chosen by the row's ``__sheet__`` value (compared after
    ``str()`` and ``strip()``): ``"NutritionInfo"``, ``"Ingredient"`` or
    ``"Recipe"``. Fields missing from the row fall back to per-field
    placeholder values.

    Args:
        row: Any object exposing ``.get(key, default)``, e.g. a pandas
            Series or a plain dict.

    Returns:
        str: The formatted text, or ``"Uncategorized entry."`` when the
        sheet label matches none of the known templates.
    """
    def field(key, default):
        # Single lookup point so every template reads fields the same way.
        return row.get(key, default)

    kind = str(field("__sheet__", "")).strip()

    if kind == "Recipe":
        segments = [
            f"Recipe Title: {field('title', 'Untitled')}. ",
            f"Description: {field('description', '')}. ",
            f"Ingredients: {field('ingredients', '')}. ",
            f"Instructions: {field('instructions', '')}. ",
            f"Nutrition Score: {field('nutrition', 'unknown')}. ",
        ]
        return "".join(segments)

    if kind == "Ingredient":
        segments = [
            f"Ingredient: {field('name', 'unknown')}. ",
            f"Quantity: {field('quantity', 'unspecified')}. ",
        ]
        return "".join(segments)

    if kind == "NutritionInfo":
        segments = [
            f"Nutrition facts for {field('name', 'unknown')}: ",
            f"{field('calories', 'N/A')} calories, ",
            f"{field('protein_g', 'N/A')}g protein, ",
            f"{field('carbohydrates_total_g', 'N/A')}g carbohydrates, ",
            f"{field('fat_total_g', 'N/A')}g fat, ",
            f"{field('fiber_g', 'N/A')}g fiber.",
        ]
        return "".join(segments)

    return "Uncategorized entry."

# Derive both parallel lists from the same frame: one metadata dict per row
# and one rendered text document per row.
metadatas = df_recipe.to_dict("records")
texts = df_recipe.apply(row_to_text, axis="columns").tolist()

print(f"Converted {len(texts)} rows to documents.")

Setup Gemini AI Embedding Model

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [None]:
# Gemini AI AUTH
import getpass
import os

# Keep an existing non-empty GOOGLE_API_KEY; otherwise ask for it with a
# hidden prompt and export it for the Google client libraries.
os.environ["GOOGLE_API_KEY"] = (
    os.environ.get("GOOGLE_API_KEY")
    or getpass.getpass("Enter your Google API key: ")
)

In [None]:
# Gemini embedding client used for both indexing and querying.
# NOTE(review): the Pinecone index below is created with dimension=768, which
# presumably matches this model's output size - confirm if the model changes.
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

Chunk the Documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split each row document into pieces of at most 300 characters with a
# 50-character overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# Parallel lists: chunk_metadata[k] is the source row's metadata for chunks[k].
chunks, chunk_metadata = [], []

for row_idx in range(len(texts)):
    pieces = text_splitter.split_text(texts[row_idx])
    chunks += pieces
    # Every piece of a row shares that row's metadata dict.
    chunk_metadata += [metadatas[row_idx]] * len(pieces)

Embedding and Store to Pinecone

In [None]:
from langchain.vectorstores import Pinecone  # <--- THIS is LangChain's wrapper
import pinecone  # This is the official SDK
from time import sleep

In [None]:
# Embed the chunks (not the unsplit row texts) so that embeddings[i] lines up
# with chunks[i] and chunk_metadata[i] produced by the chunking cell.
# Embedding `texts` here would pair vectors with the wrong rows' metadata as
# soon as any row is split into more than one chunk.
embeddings = embedding_model.embed_documents(chunks)
print(f"Generated {len(embeddings)} embeddings.")

In [None]:
# Initialize the Pinecone client with the key entered in the auth cell
# instead of a hard-coded placeholder string.
pinecone_api_key = api_key_pinecone
pc = pinecone.Pinecone(api_key=pinecone_api_key)

index_name = "nutrition-rag-index"

# Serverless index location.
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the vector index only if it does not exist yet, then poll until
# Pinecone reports it as ready.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,
        metric='cosine',
        spec=spec
    )
    while not pc.describe_index(index_name).status['ready']:
        # `sleep` comes from `from time import sleep`; the `time` module
        # itself is never imported, so `time.sleep` would raise NameError.
        sleep(1)

# Connect to index
index = pc.Index(index_name)

Format Vectors + Metadata

In [None]:
# Each entry must be a tuple of (id, vector, metadata)
vectors = [
    {
        "id": f"vec-{i}",
        "values": embeddings[i],
        "metadata": chunk_metadata[i]
    }
    for i in range(len(embeddings))
]

In [None]:
# Batch upsert (up to 100 vectors per call is safe)
for i in range(0, len(vectors), 100):
    batch = vectors[i:i+100]
    index.upsert(vectors=batch)

print("Vector embeddings created successfully!")
print(f"Stored {len(texts)} document chunks with enhanced metadata")

### PHASE-2

Search with LangChain Retriever

In [None]:
# Re-create the embedding client so PHASE-2 can run without re-running the
# PHASE-1 embedding cells; queries must use the same model as indexing.
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# LangChain wrapper around the Pinecone index.
# text_key names the metadata field that holds each chunk's text; the legacy
# wrapper takes it as a required argument.
# NOTE(review): the notebook also installs langchain-pinecone, whose
# PineconeVectorStore is the maintained replacement for this class - consider
# migrating.
vectorstore = Pinecone(
    index=index,               # from pinecone.Index
    embedding=embedding_model,
    text_key="text",
)

Create a Retriever (with optional filter)

In [None]:
# Optional filter to only retrieve recipe chunks
retriever = vectorstore.as_retriever(
    search_kwargs={
        "k": 5,
        "filter": {"__sheet__": "Recipe"}
    }
)

Run a Semantic Query with the Retriever

In [None]:
query = "low-carb high-protein chicken dinner recipe"
# `invoke` is the Runnable entry point for retrievers and replaces the
# deprecated `get_relevant_documents`.
results = retriever.invoke(query)

# Show top results, numbered from 1, using the row metadata stored alongside
# each vector.
for rank, doc in enumerate(results, start=1):
    meta = doc.metadata
    print(f"\n Recipe Match #{rank}")
    print("Title:", meta.get("title", "Untitled"))
    print("Meal Type:", meta.get("meal_type", "Unknown"))
    print("Category:", meta.get("category", "Unknown"))
    print("Nutrition Score:", meta.get("nutrition", "N/A"))
    print("Ingredients:", meta.get("ingredients", ""))
    print("Instructions:", meta.get("instructions", ""))
    print("-" * 60)

### Option 2: Cohere `CohereEmbeddings` as the embedding model (paid Cohere model: `models/embed-english-v3.0`; it is rate-limited and not intended for production use)