In [27]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


In [10]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [11]:
import json 
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Johnson's lectures are engaging and informative. Her passion for biology is contagious!"},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Brilliant professor. Makes complex algorithms seem easy. Always available for extra help.'},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Psychology',
  'stars': 3,
  'review': 'Interesting content, but lectures can be dry. Office hours are helpful.'},
 {'professor': 'Prof. David Garcia',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Challenging course, but Prof. Garcia explains concepts clearly. Fair grader.'},
 {'professor': 'Dr. Amanda Lee',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Best professor in the department! Lab sessions are well-organized and insightful.'},
 {'professor': 'Prof. Robert Taylor',
  'subject': 'History',
  'stars': 2,
  'review': 'Monotonous lectures. Relies to

In [28]:
processed_data = []
client = OpenAI()

for review in data["reviews"]:
    response = client.embeddings.create(
        input = review['review'],
        model = "text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

In [29]:
processed_data[0]

{'values': [0.04884758219122887,
  -0.011375798843801022,
  0.029513606801629066,
  0.06019897013902664,
  -0.025021875277161598,
  0.004085889086127281,
  0.022531893104314804,
  0.058392513543367386,
  -0.02363041415810585,
  -0.0006003724411129951,
  0.03861912712454796,
  -0.012510937638580799,
  -0.020847493782639503,
  0.018259866163134575,
  -0.003905853722244501,
  -0.00104283238761127,
  -0.007311269640922546,
  -0.033785633742809296,
  0.027023624628782272,
  0.07015889883041382,
  0.0539984256029129,
  -0.041646163910627365,
  0.017722811549901962,
  -0.014769009314477444,
  -0.032101236283779144,
  -0.0620054267346859,
  0.009520518593490124,
  0.03503062576055527,
  0.035299152135849,
  -0.03197917714715004,
  0.05941779911518097,
  -0.0004230070044286549,
  -0.014537098817527294,
  -0.014769009314477444,
  -0.023789089173078537,
  0.03666619956493378,
  0.0012182906502857804,
  -0.010649554431438446,
  0.011131683364510536,
  0.02222674898803234,
  0.03573856130242348,
  

In [30]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 20}

In [31]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}