In [24]:
import json
import os

import pinecone
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

In [25]:
# Connect to Pinecone with the key from the environment and make sure the
# index used by this notebook exists.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "rag"
# describe_index() raises for an index that does not exist rather than
# returning a falsy value, so test membership in the list of existing index
# names to decide whether to create it.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the embedding model's output size; change with the model
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [26]:
# Load the review data. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open for the life of the kernel.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

In [29]:
# Embed every review and shape it into a record for index.upsert().
# Embeddings capture the semantic content of a text, so reviews with similar
# meaning end up close together in vector space.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBED_BATCH_SIZE = 100  # number of reviews sent per embeddings request

client = OpenAI()
reviews = data["reviews"]
processed_data = []

for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    # One request per batch instead of one request per review.
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Every returned item carries the position of its input text; sort on it
    # so the pairing with `batch` does not depend on the response ordering.
    embedded = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, embedded):
        embedding = item.embedding
        processed_data.append(
            {
                "values": embedding,
                # NOTE(review): the professor name doubles as the vector id, so
                # two reviews naming the same professor would share an id -
                # confirm names are unique in reviews.json.
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )

In [31]:
# Stop here when there is nothing to upload.
assert processed_data, "processed_data is empty; nothing to upsert"

# Open a handle to the index named in `index_name` and write the records.
index = pc.Index(index_name)
# Kept as the last expression of the cell so the upsert response is displayed.
index.upsert(
    vectors=processed_data,
    namespace="ns1",
)

{'upserted_count': 20}

In [32]:
# Display the statistics the index reports, as a check on the preceding upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}