# Construir banco de dados vetorial
Obtêm dados do JSON de produtos e efetua o push pro database

## Preparar ambiente

In [1]:
# Instalar dependências
#!pip install pinecone

In [2]:
from pinecone import Pinecone, ServerlessSpec
import openai
import pandas as pd
import os
import dotenv
from sentence_transformers import SentenceTransformer # para embeddings

  from .autonotebook import tqdm as notebook_tqdm


### Variáveis de ambiente

In [3]:
dotenv.load_dotenv()

True

In [4]:
token = os.getenv("OPENROUTER_API_KEY") # token do OpenRouter
model_name = os.getenv("MODEL_NAME") # nome do modelo

pinecone_api_key = os.getenv("PINECONE_API_KEY") # token do Pinecone
pinecone_index_name = os.getenv("PINECONE_INDEX_NAME")  # nome do índice do Pinecone

# Modelo de embedding
embedding_model = os.getenv("EMBEDDING_MODEL_NAME")
# Carregar modelo localmente
embedding_client = SentenceTransformer(embedding_model)

### Iniciar Pinecone

In [5]:
pc = Pinecone(api_key=pinecone_api_key)

# client = OpenAI(api_key=token, base_url=open_ai_base_url)

# Teste de embeddings

Gerar **embeddings vetoriais** (transformar texto em vetores de números):

In [6]:
# Se usar RunPod: 
# output = client.embeddings.create(input=["Hello World!"], model=model_name)
# embedding = output.data[0].embedding

# Chama o modelo de embedding para transformar o texto em um valor numérico
output = embedding_client.encode(
    sentences=["Hello world"], # Lista de sentenças que serão codificadas
    convert_to_numpy=True # Converte a saída para um array do NumPy
)
# Como o sentence possui apenas 1 string, o output será um array 1D (1, 384), então acessamos o primeiro (e único) elemento do array
embedding = output[0]

print(embedding)
print("\nTamanho do vetor embedding: ", len(embedding))

[ 1.51960850e-02 -2.25706734e-02  8.54711980e-03 -7.41705820e-02
  3.83639405e-03  2.71344697e-03 -3.12679186e-02  4.46339697e-02
  4.40551378e-02 -7.87114259e-03 -2.52007879e-02 -3.33665088e-02
  1.44279227e-02  4.65381928e-02  8.55509657e-03 -1.61457192e-02
  7.40579702e-03 -1.90124381e-02 -1.14726298e-01 -1.81576442e-02
  1.26359358e-01  2.97029372e-02  2.52810232e-02 -3.42178792e-02
 -4.09996919e-02  6.61730114e-03  1.02706430e-02  2.23622601e-02
  4.43633646e-03 -1.27309680e-01 -1.61492061e-02 -2.03801412e-02
  4.72120903e-02  1.15798796e-02  6.81871623e-02  7.29864091e-03
 -1.78529993e-02  4.07821722e-02 -1.02694668e-02  2.37570591e-02
  1.06028914e-02 -2.85843518e-02  8.15969706e-03 -1.51805691e-02
  3.08962502e-02 -6.59799501e-02 -2.21965052e-02  5.40237539e-02
  2.54224031e-03  2.24527121e-02 -9.16537493e-02 -4.51403186e-02
 -4.19208128e-03 -5.62152732e-03 -5.38092293e-03  9.83934850e-02
  6.05247915e-02  7.42288399e-03  1.39386216e-02  2.68772594e-03
  4.75693829e-02  2.86365

# Wrangle Dataset
>"Limpeza, organização e limpeza de dados"

Prepara os dados brutos para serem usados em análise, machine learning ou visualização.

In [7]:
# Dataframe com os dados dos produtos
# Atributos: nome, categoria, descrição, ingredientes, preço, avaliação, imagem (arquivo no Firebase)
df = pd.read_json("../products/products.json", lines=True)

df.head(2)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,SavoryScone.webp


Gerar texto para o chatbot descrever o produto ao cliente:

In [8]:
# Adicionar coluna com as informações em texto do produto
df['text'] = df['name'] + " : " + df['description'] + \
"-- Ingredients: " + df['ingredients'].astype(str) + \
"-- Price: " + df['price'].astype(str) + \
"-- Rating: " + df['rating'].astype(str)

print(df['text'][0]) # Texto do primeiro produto
df['text'].head(3)

Cappuccino : A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations.-- Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam']-- Price: 4.5-- Rating: 4.7


0    Cappuccino : A rich and creamy cappuccino made...
1    Jumbo Savory Scone : Deliciously flaky and but...
2    Latte : Smooth and creamy, our latte combines ...
Name: text, dtype: object

In [9]:
# Armazenar textos em uma lista
texts = df['text'].tolist()

print('Comprimento: ', len(texts))
print(texts)

Comprimento:  18
["Cappuccino : A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations.-- Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam']-- Price: 4.5-- Rating: 4.7", "Jumbo Savory Scone : Deliciously flaky and buttery, this jumbo savory scone is filled with herbs and cheese, creating a mouthwatering experience. Perfect for a hearty snack or a light lunch, it pairs beautifully with your favorite coffee or tea.-- Ingredients: ['Flour', 'Butter', 'Cheese', 'Herbs', 'Baking Powder', 'Salt']-- Price: 3.25-- Rating: 4.3", "Latte : Smooth and creamy, our latte combines rich espresso with velvety steamed milk, creating a perfect balance of flavor and texture. Enjoy it as a comforting treat any time of day, whether you're starting your morning or taking a midday break.-- Ingredients:

## Adicionar textos dos arquivos `.txt`

### Descrição da cafeteria

In [10]:
# Ler o arquivo com a descrição da cafeteria
with open("../products/Merry\'s_way_about_us.txt", 'r') as f:
    Marry_way_about_section = f.read() # Lê o arquivo

Marry_way_about_section = ("Coffee shop Marry's Way About Section", Marry_way_about_section)

# Adicionar seção About Us (título + descrição) à lista de textos
texts.append(Marry_way_about_section)

### Texto dos itens do menu

In [11]:
# Ler o arquivo com o texto dos itens
with open("../products/menu_items_text.txt", 'r') as f:
    menu_items_text = f.read()  # Lê o arquivo

menu_items_text = ("Menu Items:", menu_items_text)

# Adicionar seção About Us (título + descrição) à lista de textos
texts.append(menu_items_text)

In [12]:
# Imprimir lista de textos
for i, text in enumerate(texts):
    print(f"{i} | {text}")

0 | Cappuccino : A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations.-- Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam']-- Price: 4.5-- Rating: 4.7
1 | Jumbo Savory Scone : Deliciously flaky and buttery, this jumbo savory scone is filled with herbs and cheese, creating a mouthwatering experience. Perfect for a hearty snack or a light lunch, it pairs beautifully with your favorite coffee or tea.-- Ingredients: ['Flour', 'Butter', 'Cheese', 'Herbs', 'Baking Powder', 'Salt']-- Price: 3.25-- Rating: 4.3
2 | Latte : Smooth and creamy, our latte combines rich espresso with velvety steamed milk, creating a perfect balance of flavor and texture. Enjoy it as a comforting treat any time of day, whether you're starting your morning or taking a midday break.-- Ingredients: ['Espresso',

# Gerar os Embeddings

In [13]:
# Transforma os textos (descrição da cafeteria e itens do menu) em um vetor numérico
output = embedding_client.encode(texts, convert_to_numpy=True)

print(output)
print("\nTamanho do vetor embedding: ", len(output[0]))

[[-0.02510559  0.0026576   0.02661145 ...  0.04739087  0.07071644
   0.05902702]
 [-0.04500085 -0.01434042  0.08822834 ... -0.00404392  0.03852065
  -0.01553417]
 [-0.01892913 -0.03732952  0.02678928 ...  0.01115508 -0.01128877
   0.0982942 ]
 ...
 [ 0.00903736 -0.03548158  0.07310835 ... -0.01576524  0.03464822
   0.04195429]
 [ 0.04156391 -0.01293723  0.01440373 ...  0.03530694  0.00254492
  -0.00517276]
 [-0.0528118  -0.06622067  0.01041227 ... -0.05127174  0.05799047
  -0.03584244]]

Tamanho do vetor embedding:  384


In [14]:
# Copiar dados efetivos contidos em output
embeddings = output.data

print(type(embeddings))

<class 'memoryview'>


# Push dos dados para o banco de dados

**Índice vetorial serverless:** banco de dados otimizado para buscas por similaridade de vetores (como embeddings de texto).

In [None]:
# Criar índice vetorial serverless no Pinecone
pc.create_index(
    name=pinecone_index_name, # nome do índice a ser criado (usado para envio/busca de dados)
    dimension=384, # dimensão dos vetores armazenados (modelo 'bge-small-en-v1.5' gera vetores de 384 dimensões)  
    metric="cosine", # métrica de similaridade: cosseno (medir ângulo entre dois vetores)
    spec=ServerlessSpec( # especificação do índice usando Pinecone Serverless
        cloud="aws", # provedor de nuvem: AWS
        region="us-east-1",  # região da AWS: Norte da Virginia (precisa ser uma região que suporte o Pinecone Serverless)
    )
)

{
    "name": "coffeeshop",
    "metric": "cosine",
    "host": "coffeeshop-amklesp.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}