In [7]:
from dotenv import load_dotenv
load_dotenv()
import os

from openai import OpenAI

from pinecone import Pinecone, ServerlessSpec

In [5]:
# Connect to Pinecone using the key loaded from the environment (.env via load_dotenv()).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

INDEX_NAME = "rag"

# Creating an index that already exists raises an error, which made this cell
# fail on every run after the first. Only create it when it is missing so the
# notebook survives "Restart Kernel -> Run All".
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        # Must equal the length of the embedding vectors upserted later
        # (the index stats further down report dimension 1536).
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
# Service credentials read from the process environment; each is None when
# the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")


In [6]:
import json

# Load the professor reviews. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open until garbage collection.
with open("reviews.json") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. Emily Chen',
  'subject': 'Physics',
  'stars': 5,
  'review': "Dr. Chen's lectures are incredibly engaging. She explains complex concepts with clarity and enthusiasm."},
 {'professor': 'Prof. Michael Johnson',
  'subject': 'History',
  'stars': 4,
  'review': 'Prof. Johnson brings history to life with his storytelling. However, his assignments can be quite demanding.'},
 {'professor': 'Dr. Sarah Thompson',
  'subject': 'Biology',
  'stars': 5,
  'review': "Dr. Thompson's passion for biology is contagious. Her lab sessions are particularly well-organized and informative."},
 {'professor': 'Prof. David Lee',
  'subject': 'Mathematics',
  'stars': 3,
  'review': 'Prof. Lee knows his subject well, but sometimes struggles to explain concepts to non-math majors.'},
 {'professor': 'Dr. Rachel Green',
  'subject': 'Psychology',
  'stars': 5,
  'review': "Dr. Green's classes are always thought-provoking. She encourages critical thinking and class participation."},
 {'profes

In [8]:
# Build one Pinecone record per review: the embedding of the review text as
# "values", the professor name as "id", and the remaining fields as metadata.
EMBEDDING_MODEL = "text-embedding-3-small"
# Number of review texts sent per embeddings request. Batching replaces one
# network round-trip per review with one per batch.
EMBEDDING_BATCH_SIZE = 100

# No key is passed explicitly; presumably the client picks it up from the
# environment populated by load_dotenv() -- confirm if the key is stored elsewhere.
client = OpenAI()

reviews = data['reviews']
processed_data = []

# range() yields nothing for an empty review list, so no request is made then.
for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Every returned item carries the position of its input text; sort on it
    # so each embedding is paired with the review it was computed from.
    batch_embeddings = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, batch_embeddings):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the professor name is the vector id, so two reviews
            # for the same professor would overwrite each other on upsert.
            "id": review['professor'],
            "metadata": {
                "review": review['review'],
                "subject": review['subject'],
                "stars": review['stars'],
            },
        })

In [9]:
# Inspect the first prepared record to sanity-check its structure
# (embedding values, id and metadata) before uploading.
first_record = processed_data[0]
first_record

{'values': [-0.00136058,
  -0.0126322955,
  -0.0034006194,
  0.052575868,
  0.024174798,
  0.012884809,
  0.0013447979,
  0.0789967,
  -0.03168374,
  0.0059440234,
  0.008764859,
  -0.0020333943,
  0.00030276642,
  -0.0053725466,
  0.016293734,
  0.02272617,
  -0.034740474,
  -0.04436255,
  0.03107239,
  0.021277543,
  0.039338868,
  -0.012751907,
  0.050130475,
  -0.0005860129,
  -0.054277007,
  -0.06889618,
  0.027351147,
  0.024626663,
  0.025902519,
  0.011469407,
  0.042369023,
  -0.018911894,
  -0.01928402,
  -0.04539918,
  -0.050422862,
  0.087874524,
  -0.03843514,
  0.007821258,
  0.0011836548,
  -0.025224721,
  0.052575868,
  0.024972208,
  -0.035883427,
  0.03447467,
  0.047445863,
  -0.012379782,
  -0.054330166,
  0.0099941995,
  0.012851583,
  0.02692586,
  -0.06581286,
  0.027324565,
  0.013675573,
  -0.0062231165,
  -0.057254,
  0.038063012,
  0.017782232,
  0.06485597,
  -0.028945966,
  -0.018725833,
  0.06528126,
  -0.026633477,
  -0.0019353793,
  0.0069108824,
  -0.05

In [10]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the call's response is the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [11]:
# Fetch the index statistics to confirm the upsert landed
# (dimension, per-namespace vector counts, total vector count).
index_stats = index.describe_index_stats()
index_stats


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}