In [8]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))


In [5]:
pc.create_index(name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1") )

In [7]:
import json
data = json.load(open("reviews.json"))
data["reviews"]

[{'professor': 'Dr. Emily Johnson',
  'subject': 'History',
  'stars': 4,
  'review': 'Dr. Johnson is very knowledgeable and passionate about history. Her lectures are engaging and informative.'},
 {'professor': 'Professor Michael Smith',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Professor Smith explains concepts well, but the exams are quite challenging.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Literature',
  'stars': 5,
  'review': "Dr. Lee's classes are a delight. She encourages critical thinking and open discussions."},
 {'professor': 'Professor James Brown',
  'subject': 'Math',
  'stars': 2,
  'review': "Professor Brown's lectures are hard to follow, and he doesn't provide enough examples."},
 {'professor': 'Dr. Linda Martinez',
  'subject': 'Chemistry',
  'stars': 4,
  'review': 'Dr. Martinez is very approachable and always willing to help students outside of class.'},
 {'professor': 'Professor Robert Wilson',
  'subject': 'Biology',
  'stars': 5,
  'review': 'P

In [10]:
processed_data = []
client = OpenAI()

for review in data["reviews"]:
    response = client.embeddings.create(
        input=review["review"],
        model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    print(embedding)
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

[0.016130514, -0.009933585, 0.026760869, 0.03354358, 0.004220695, 0.0040326286, 0.007220502, 0.059293207, -0.027352815, 0.03665129, 0.025108354, 0.003431434, -0.046245743, 0.031841736, 0.010766009, 0.037613202, -0.029498616, -0.0021920481, 0.03815582, 0.04096756, 0.044938527, -0.003314278, 0.02261725, -0.03356824, -0.032458346, -0.033518914, 0.009064165, 0.013787397, 0.049920738, -0.024639731, 0.08124452, 0.005114779, -0.016623802, 0.0023523667, -0.006924529, 0.026242917, 0.02440542, 0.01817766, 0.028142076, 0.009107328, 0.012899479, -0.031570427, -0.028092746, 0.015933199, 0.036404647, -0.025897615, 0.019312222, -0.014527329, 0.026341574, 0.05347241, -0.00861404, -0.0087681925, 0.01947254, -0.040202964, -0.012443188, -0.019620527, 0.022037638, 0.02856137, 0.0029628105, -0.009205986, 0.046369065, -0.018498296, -0.008755861, -0.008879183, -0.012208876, -0.03322294, -0.00550016, 0.01010007, -0.0064867362, -0.066248566, 0.056974754, 0.010001413, -0.05268315, 0.014490332, -0.02086608, -0.0

In [11]:
index = pc.Index("rag")
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}