In [2]:
# Standard library first, then third-party packages.
import os

from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Read environment variables from the local dotenv file; presumably this is
# where the API keys used by the cells below come from.
load_dotenv(dotenv_path='.env.local')

In [3]:
# Connect to Pinecone using the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists raises an error, which would break
# this cell on every re-run (e.g. Restart Kernel -> Run All). Only create the
# "rag" index when it is not already present.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # vector length the index accepts
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection, and
# the explicit encoding avoids depending on the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Show the list of review records as the cell output.
data['reviews']

[{'professor': 'Dr. Emily Watson',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Watson is an amazing professor! Her lectures are clear, and she is always willing to help outside of class.'},
 {'professor': 'Prof. John Miller',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Prof. Miller explains concepts well, but his exams can be challenging. Be prepared to study hard.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Dr. Lee knows her material, but her teaching style is a bit dry. Labs are very informative though.'},
 {'professor': 'Prof. Mark Brown',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Prof. Brown is knowledgeable, but his lectures are disorganized and hard to follow.'},
 {'professor': 'Dr. Lisa Carter',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Dr. Carter is passionate about biology and makes the classes engaging. The workload is manageable.'},
 {'professor': 'Prof. James Wilson',
  'subject': '

In [5]:
# Build one vector record per review: the embedding of the review text goes
# under "values", the professor name is used as the record "id", and the
# review text, subject and star rating are carried along as metadata.
processed_data = []
client = OpenAI()

for review in data['reviews']:
    # One embeddings request per review text.
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=review['review'],
    )
    # A single input was sent, so the first result is taken.
    embedding = response.data[0].embedding
    processed_data.append(dict(
        values=embedding,
        id=review['professor'],
        metadata={field: review[field] for field in ("review", "subject", "stars")},
    ))

In [6]:
# Inspect the first prepared record (embedding values, id and metadata).
processed_data[0]

{'values': [-0.036011822521686554,
  -0.029358210042119026,
  0.002124983351677656,
  0.07229802012443542,
  -0.006156306713819504,
  -0.022898945957422256,
  -0.01534217968583107,
  0.0034554200246930122,
  -0.01496491301804781,
  -0.008699999190866947,
  0.013615898787975311,
  -0.01330722589045763,
  -0.012049670331180096,
  -0.023664912208914757,
  -0.005996254272758961,
  0.05171983689069748,
  0.006459263619035482,
  0.019617868587374687,
  -0.003615472698584199,
  0.040767669677734375,
  -0.02110407128930092,
  -0.023916423320770264,
  0.0185317974537611,
  -0.039052821695804596,
  0.003586891805753112,
  -0.038275424391031265,
  -0.015239289030432701,
  0.018771877512335777,
  0.01679408550262451,
  0.031095923855900764,
  0.0640210211277008,
  -0.009963271208107471,
  -0.007533901836723089,
  -0.011180813424289227,
  -0.03816109895706177,
  0.061688825488090515,
  -0.010569184087216854,
  -0.016348224133253098,
  -0.00025544100208207965,
  0.012289749458432198,
  -0.0259971059

In [7]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace. The upsert call is the last expression, so its return
# value is displayed as the cell output.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [8]:
# Display the index statistics (dimension, fullness, namespaces, vector count).
# NOTE(review): the saved output reports zero vectors and no namespaces even
# though the preceding upsert returned an upserted_count of 20 -- presumably
# the stats had not caught up yet when this ran; re-run after a short wait to
# confirm the records are visible.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}