In [10]:
import json
import os

from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Load environment variables from the project-local env file before any API
# client is constructed; PINECONE_API_KEY is read with os.getenv further down.
# NOTE(review): OpenAI() is later built without an explicit key, so it
# presumably also relies on a variable loaded here — confirm.
load_dotenv(dotenv_path='./.env.local')

True

In [8]:
# Pinecone client authenticated with the key loaded from the env file.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists raises, which made this cell fail on
# every run after the first (e.g. Restart Kernel -> Run All). Only create the
# index when it is missing so the cell is idempotent.
if "rag-rmp" not in pc.list_indexes().names():
    pc.create_index(
        name="rag-rmp",
        # Must equal the length of the embedding vectors upserted into this index.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [9]:
# Read the review dataset. The context manager closes the file handle as soon
# as parsing finishes (the bare open() call previously left it to the garbage
# collector), and the explicit encoding keeps the read platform-independent.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed reviews as the cell output.
data['reviews']

[{'professor': 'Dr. Emily Chen',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Dr. Chen explains complex concepts clearly. Her enthusiasm for physics is contagious!'},
 {'professor': 'Prof. Michael Johnson',
  'subject': 'Literature',
  'stars': 3,
  'review': 'Engaging lectures, but assignments can be vague. Office hours are helpful.'},
 {'professor': 'Dr. Sarah Williams',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Fantastic instructor! Her lab sessions are incredibly informative and well-organized.'},
 {'professor': 'Prof. David Lee',
  'subject': 'Mathematics',
  'stars': 2,
  'review': 'Lectures move too quickly. More practice problems would be beneficial.'},
 {'professor': 'Dr. Rachel Thompson',
  'subject': 'Psychology',
  'stars': 4,
  'review': 'Thought-provoking discussions in class. Dr. Thompson is always open to questions.'},
 {'professor': 'Prof. Robert Garcia',
  'subject': 'History',
  'stars': 5,
  'review': 'Passionate about the subject. Makes historical ev

In [12]:
# Build one Pinecone record per review: embedding values, an id and metadata.
client = OpenAI()

reviews = data['reviews']

# Number of review texts sent per embeddings request. Batching replaces the
# previous one-request-per-review loop with a handful of round-trips.
EMBED_BATCH_SIZE = 100

embeddings = []
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input; sort on it so the
    # embeddings line up with `batch` regardless of response ordering.
    ordered = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered)

# NOTE(review): the professor name doubles as the vector id, so two reviews of
# the same professor would overwrite each other on upsert — confirm names are
# unique in reviews.json.
processed_data = [
    {
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    }
    for review, embedding in zip(reviews, embeddings)
]

In [13]:
# Sanity-check the first prepared record. This prints the record's entire
# embedding vector, so the cell output is very long.
processed_data[0]

{'values': [-0.004504634,
  -0.027119603,
  -0.053767104,
  0.057753738,
  0.032024212,
  0.0064028753,
  -0.017612005,
  0.051170547,
  -0.033571657,
  -0.014753168,
  0.010989471,
  0.024195196,
  -0.0062782927,
  -0.028850641,
  0.03226026,
  0.029506337,
  -0.022228107,
  -0.017598892,
  0.012491017,
  0.010773092,
  0.015408865,
  -0.025913121,
  0.048731357,
  0.0056062043,
  -0.05539323,
  -0.06153055,
  0.03690259,
  0.018661119,
  0.019120106,
  0.00449152,
  0.05138037,
  -0.020405272,
  0.0038128744,
  -0.02801135,
  -0.039446693,
  0.078368835,
  -0.044429988,
  0.009238762,
  -0.009776433,
  -0.025296766,
  0.016615346,
  0.02270021,
  -0.04172852,
  0.04157115,
  0.016602233,
  -0.017716916,
  -0.06793015,
  0.010674737,
  0.023788664,
  0.014517118,
  -0.0230674,
  0.006625812,
  0.01822836,
  -0.0037571404,
  -0.05549814,
  0.064625435,
  0.050724674,
  0.046423305,
  -0.004481685,
  -0.043590695,
  0.0330471,
  -0.032942187,
  -0.008786332,
  0.027119603,
  -0.04490209

In [16]:
# Handle to the "rag-rmp" index; reused by the stats cell below.
index = pc.Index("rag-rmp")

# Write every prepared record into the "sample" namespace. The call is the
# cell's last expression so its response is shown as the output.
index.upsert(namespace="sample", vectors=processed_data)

{'upserted_count': 20}

In [17]:
# Fetch index statistics to confirm the upsert landed: the output reports the
# index dimension and the vector count per namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'sample': {'vector_count': 20}},
 'total_vector_count': 20}