In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [2]:
# Build the Pinecone client from the PINECONE_API_KEY environment variable
# (load_dotenv() in the first cell has already been called by this point).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it is missing, so the cell can be re-run
# (e.g. Restart Kernel -> Run All) without failing on an already-existing index.
# The dimension must equal the length of the embedding vectors upserted later.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [3]:
import json

# Load the review fixtures. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding keeps the read independent of
# the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review records.
data['reviews']

[{'professor': 'Dr. Olivia Nguyen',
  'subject': 'Introduction to Computer Science',
  'stars': 4,
  'review': 'Professor Nguyen is an excellent lecturer who makes complex topics accessible. Her assignments are challenging but fair, and she is always available to provide extra help during office hours.'},
 {'professor': 'Professor Michael Goldstein',
  'subject': 'Organic Chemistry',
  'stars': 3,
  'review': 'While Professor Goldstein is knowledgeable about the subject matter, his lectures can be dry and he does not provide much guidance on how to approach the challenging problem sets. The exams are also very difficult.'},
 {'professor': 'Dr. Sarah Patel',
  'subject': 'Art History',
  'stars': 5,
  'review': 'Dr. Patel is passionate about art and her enthusiasm is infectious. Her lectures are engaging and she encourages students to think critically about the works we study. The reading load is manageable, and the assignments are interesting and thought-provoking.'},
 {'professor': 'P

In [4]:
# Embed every review text and shape the results into upsert-ready records:
# {"values": <embedding>, "id": <professor name>, "metadata": {...}}.
EMBEDDING_MODEL = 'text-embedding-3-small'
# Reviews sent per embeddings request; batching replaces one network
# round-trip per review with one per batch. Kept modest to stay well within
# the API's per-request input limits.
EMBED_BATCH_SIZE = 100

processed_data = []
client = OpenAI()

reviews = data['reviews']
for batch_start in range(0, len(reviews), EMBED_BATCH_SIZE):
    review_batch = reviews[batch_start:batch_start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in review_batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the position of its input in the request;
    # sort on it so embeddings are paired with the right review explicitly.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(review_batch, ordered_items):
        embedding = item.embedding
        processed_data.append({
            "values": embedding,
            # NOTE(review): the professor name is used as the vector id, so two
            # reviews for the same professor would share an id -- confirm the
            # names in reviews.json are unique.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        })


In [5]:
# Spot-check the first record built above; its 'values' entry is the full
# embedding vector, so the displayed output is long.
processed_data[0]

{'values': [-0.04188234359025955,
  0.0004304235044401139,
  -0.011563434265553951,
  0.006972250062972307,
  0.03641199693083763,
  -0.021942440420389175,
  0.006587616633623838,
  0.03592357411980629,
  0.01619124971330166,
  -0.01372470986098051,
  -0.0025382775347679853,
  0.04009959474205971,
  -0.03839011490345001,
  -0.03394545614719391,
  -0.025251511484384537,
  0.048573751002550125,
  -0.04879353940486908,
  -0.021026646718382835,
  0.026912152767181396,
  0.00529024051502347,
  0.00871226005256176,
  0.0005269635003060102,
  0.04000191017985344,
  0.00899920891970396,
  -0.03375008702278137,
  -0.05685253441333771,
  0.03316397964954376,
  0.0042279185727238655,
  0.0009073999244719744,
  0.002904595574364066,
  0.03216271102428436,
  -0.004206550307571888,
  -0.0023871713783591986,
  0.014408502727746964,
  -0.01228385977447033,
  0.0734589472413063,
  0.02156391181051731,
  0.0034952829591929913,
  -0.00788804516196251,
  -0.008400890044867992,
  0.008706155233085155,
  0.

In [6]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the response is shown as the cell output.
index = pc.Index("rag")
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upsert_response

{'upserted_count': 21}

In [7]:
# Show index statistics (dimension, per-namespace vector counts, total count)
# to confirm the upsert above is reflected in the index.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 21}},
 'total_vector_count': 21}