In [15]:
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os
import getpass
import time

In [17]:
# Prompt for the Pinecone API key when the environment does not already
# provide a non-empty value, and store it in the process environment.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

# Name of the index this notebook works with; pick another one if you like.
index_name = "rag"

# Names of the indexes reported by list_indexes().
existing_indexes = [
    info["name"]
    for info in pc.list_indexes()
]

# Create the index only when it is not listed yet, then poll once per second
# until describe_index() reports it as ready.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle used by the later cells to talk to the index.
index = pc.Index(index_name)

In [37]:
import json

# Load the review dataset from reviews.json in the working directory.
# The context manager closes the file handle as soon as parsing finishes
# (the previous bare open() left it open), and the explicit UTF-8 encoding
# avoids depending on the platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. Jane Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Smith is an excellent professor. Her lectures are always engaging and she is very knowledgeable about the subject matter.'},
 {'professor': 'Dr. Bruce Banner',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'The workload for this class was a bit heavy, but Dr. Smith was always available to provide extra help and support.'},
 {'professor': 'Dr. Yasin',
  'subject': 'Computer Science',
  'stars': 3,
  'review': 'The material covered in this class was challenging, but Dr. Smith did her best to explain the concepts clearly.'},
 {'professor': 'Dr. Bill',
  'subject': 'History',
  'stars': 4,
  'review': 'Dr. Lee is a passionate and engaging lecturer. His classes are always interesting and informative.'},
 {'professor': 'Dr. Robert Lee',
  'subject': 'History',
  'stars': 4,
  'review': 'The grading in this class was fair, and Dr. Lee was always willing to provide feedback and guidanc

In [38]:
# Prompt for the OpenAI API key when the environment does not already
# provide a non-empty value.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# Embedding model used for every review, and how many reviews are sent per
# request. The batch size is a conservative choice meant to stay within the
# API's per-request input limits -- TODO confirm against the current limits.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100

processed_data = []
client = OpenAI()

# Embed the reviews in batches: the embeddings endpoint accepts a list of
# inputs, so one request per batch replaces one request per review.
reviews = data["reviews"]
for batch_start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[batch_start : batch_start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the position of its input; sort on it so the
    # pairing with `batch` below does not rely on response ordering.
    ordered_items = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        # NOTE(review): the vector id is the professor's name, so two reviews
        # for the same professor would share an id and the later one would
        # replace the earlier one on upsert -- confirm names are unique.
        processed_data.append(
            {
                "values": embedding,
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )




In [29]:
# Display the first processed record (its "values", "id" and "metadata" entries).
processed_data[0]

{'values': [0.010094559751451015,
  0.008104793727397919,
  -0.02593982219696045,
  0.021803351119160652,
  0.012297314591705799,
  0.01870940625667572,
  0.023854771628975868,
  0.0061878920532763,
  0.0008127213222905993,
  -0.01790228858590126,
  0.013014751486480236,
  -0.02647790126502514,
  -0.015918128192424774,
  0.006081397645175457,
  0.04219425097107887,
  0.021601572632789612,
  -0.021814562380313873,
  -0.012992331758141518,
  0.0014895220519974828,
  0.012330944649875164,
  0.04470527917146683,
  -0.02883199043571949,
  0.021175595000386238,
  -0.03779995068907738,
  -0.027800673618912697,
  -0.03266579285264015,
  0.018855134025216103,
  0.040804214775562286,
  0.020962605252861977,
  0.01239820383489132,
  0.07905256003141403,
  -0.0037132957950234413,
  0.0027156102005392313,
  0.00702303322032094,
  -0.0197855606675148,
  0.03405582532286644,
  -0.004231756087392569,
  0.007953459396958351,
  0.002621726831421256,
  0.007695630192756653,
  0.033966146409511566,
  0.00

In [39]:
# Insert the embeddings into the Pinecone index.
# The handle is built from `index_name` (defined in the setup cell) instead of
# a repeated "rag" literal, so renaming the index there cannot leave this cell
# writing to a different index.
index = pc.Index(index_name)

# Write every processed record into the "ns1" namespace; the call's response
# is the cell output.
index.upsert(
    vectors=processed_data,
    namespace="ns1",
)

{'upserted_count': 20}

In [40]:
# Fetch the index statistics (handy for checking vector counts after the upsert).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}