In [10]:
from dotenv import load_dotenv
load_dotenv()
import os

from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


In [3]:
# pinecone index, one seperate db
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud='aws', region='us-east-1')
)

In [5]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Great professor, explains concepts clearly and is always available for help.'},
 {'professor': 'Dr. Jane Doe',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Challenging course but Dr. Doe makes it interesting with practical examples.'},
 {'professor': 'Dr. Alan Johnson',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Good lecturer but sometimes hard to follow. Labs are well-organized.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Engaging lectures and supportive during office hours.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Excellent professor! His coding exercises really helped me understand the material.'},
 {'professor': 'Dr. Susan Taylor',
  'subject': 'History',
  'stars': 2,
  'review': 'Lectures are dry and exams are tough. Not very responsive to emails.'},
 {'professor': 'Dr. William 

In [11]:
processed_data = []
client = OpenAI()

for review in data["reviews"]:
    response = client.embeddings.create(
        input=review['review'], model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata":{
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

In [12]:
processed_data[0]

{'values': [-0.059673928,
  -0.024431543,
  -0.013668329,
  0.027146159,
  -0.0060096597,
  -0.022657517,
  -0.0072151637,
  0.031980082,
  -0.035361446,
  -0.017918846,
  0.021574054,
  0.017621191,
  -0.019847652,
  -0.02296708,
  0.020788243,
  0.026717536,
  -0.0078878645,
  0.02950359,
  0.014906575,
  0.049434584,
  -0.008780831,
  -0.017871222,
  0.02026437,
  0.033956513,
  -0.041766986,
  -0.02497923,
  0.0049142884,
  0.028741593,
  0.010429841,
  0.0074354284,
  0.09115395,
  -0.00425647,
  -0.0264556,
  -0.043338604,
  -0.032980204,
  0.038123686,
  -0.012561052,
  0.006685337,
  0.009763093,
  0.03333739,
  -0.006965133,
  0.045100726,
  -0.013287331,
  0.009030861,
  0.050529957,
  -0.023848139,
  -0.06362679,
  -0.021454992,
  0.017680723,
  0.03059896,
  -0.03390889,
  -0.009489249,
  0.036337756,
  0.034813758,
  -0.0385285,
  0.026812786,
  0.039314307,
  -0.011358525,
  0.012453896,
  -0.010935854,
  0.027908158,
  0.019716684,
  -0.015478074,
  -0.0010574203,
  -0.0

In [13]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}