In [12]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [10]:
# Connect to Pinecone using the API key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists raises an error, so only create it
# when it is missing. This keeps the cell safe to re-run on a fresh kernel.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [11]:
import json

# Read the reviews file through a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
# JSON text is UTF-8 by specification, so do not depend on the locale default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed content as the cell output.
data

{'reviews': [{'professor': 'Dr. Susan Blake',
   'subject': 'Computer Science',
   'stars': 5,
   'review': 'Dr. Blake is an amazing professor! She explains complex topics very clearly and is always available for help.'},
  {'professor': 'Prof. James Miller',
   'subject': 'Mathematics',
   'stars': 4,
   'review': 'Great teacher with a lot of knowledge, but sometimes the lectures move too quickly.'},
  {'professor': 'Dr. Emily Johnson',
   'subject': 'Psychology',
   'stars': 3,
   'review': 'Dr. Johnson is knowledgeable, but her lectures can be a bit dry. The exams are challenging but fair.'},
  {'professor': 'Prof. Robert Lee',
   'subject': 'History',
   'stars': 4,
   'review': 'Prof. Lee makes history come alive. His classes are engaging, but the reading load is heavy.'},
  {'professor': 'Dr. Karen Smith',
   'subject': 'Biology',
   'stars': 2,
   'review': "The material is interesting, but Dr. Smith's lectures are hard to follow and she rarely answers emails."},
  {'professor':

In [14]:
# Build the list of vector records (id, embedding values, metadata) to upsert.
client = OpenAI()
reviews = data["reviews"]

processed_data = []
if reviews:  # skip the request entirely when there is nothing to embed
    # Embed every review text in a single request instead of one network
    # round-trip per review.
    # NOTE(review): the embeddings endpoint limits the number of inputs and
    # total tokens per request; chunk `reviews` if the dataset grows large.
    response = client.embeddings.create(
        input=[review["review"] for review in reviews],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; sort on it so the
    # embeddings line up with `reviews` regardless of response order.
    embeddings = sorted(response.data, key=lambda item: item.index)

    # NOTE(review): the professor name is used as the vector id, so two reviews
    # of the same professor would share an id — confirm names are unique.
    processed_data = [
        {
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        }
        for review, item in zip(reviews, embeddings)
    ]


In [15]:
# Inspect the first record to sanity-check its structure
# (the full embedding vector is printed, so the output is long).
processed_data[0]

{'values': [-0.021008316,
  -0.02542739,
  -0.016955532,
  0.040197015,
  -0.0050453045,
  -0.028948467,
  0.008867684,
  0.010917708,
  -0.017558131,
  -0.037408512,
  0.019117804,
  0.00301005,
  -0.0039848452,
  -0.013422636,
  0.010533698,
  0.028877573,
  -0.043883514,
  0.025498284,
  -4.6778176e-05,
  0.055628322,
  0.016742848,
  -0.013706214,
  0.0436472,
  0.006120533,
  -0.020819265,
  -0.008418688,
  0.025805492,
  -0.011780255,
  -0.0012561567,
  0.026136331,
  0.057707883,
  -0.018337969,
  0.024812972,
  -0.039653495,
  -0.054494016,
  0.011585295,
  -0.022201702,
  -0.011183561,
  0.020571137,
  0.01736908,
  0.012406486,
  0.028877573,
  -0.031098926,
  0.022804303,
  0.019200515,
  0.0014533312,
  -0.049436893,
  -0.025285602,
  0.038873658,
  0.02946836,
  -0.054777592,
  0.029728305,
  -0.012181988,
  0.01533678,
  -0.054210436,
  0.09272962,
  0.026349014,
  -0.016092986,
  0.039275393,
  -0.0238559,
  0.064560995,
  0.032894913,
  -0.029586516,
  -0.0017516777,
  

In [17]:
# Get a handle to the "rag" index, then write all prepared records into the
# "ns1" namespace. The upsert response is the last expression, so it is shown
# as the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [18]:
# Show index statistics (dimension, per-namespace vector counts) to check the upsert.
# NOTE(review): counts may lag briefly right after an upsert — confirm against Pinecone docs.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}