In [3]:
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os
import json

In [5]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag-new",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [6]:
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Emily Harris',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Dr. Harris is an excellent professor. Her lectures are clear, and she's always willing to help students.",
  'institution': 'University of Southern California'},
 {'professor': 'Dr. Michael Clark',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Good professor, but sometimes his explanations can be a bit hard to follow.',
  'institution': 'Massachusetts Institute of Technology'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Dr. Johnson knows her stuff, but her classes are very challenging.',
  'institution': 'Stanford University'},
 {'professor': 'Dr. Robert Wilson',
  'subject': 'History',
  'stars': 2,
  'review': 'Not very engaging, and the exams are difficult.',
  'institution': 'Harvard University'},
 {'professor': 'Dr. Linda Brown',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Fantastic professor! Makes chemistry fun and interesti

In [7]:
processed_data = []
client = OpenAI()

# Create embeddings for each review
for review in data["reviews"]:
    response = client.embeddings.create(
        input=review['review'], 
        model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata":{
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
                "institution": review["institution"],
            }
        }
    )


In [8]:
processed_data[0]

{'values': [-0.03914279118180275,
  -0.025856386870145798,
  -0.0027557893190532923,
  0.05562053993344307,
  -0.006127594504505396,
  -0.009465478360652924,
  -6.280793058976997e-07,
  0.03226077929139137,
  0.000938270939514041,
  -0.002080071484670043,
  0.0179214496165514,
  -0.026312291622161865,
  -0.025422189384698868,
  -0.0095848822966218,
  0.007077398709952831,
  0.05453504994511604,
  -0.01940857246518135,
  0.0005386747070588171,
  0.006122167222201824,
  0.04819578304886818,
  0.013394953683018684,
  0.01915890909731388,
  0.03271668776869774,
  -0.039859216660261154,
  -0.009948521852493286,
  -0.03920792043209076,
  0.023207789286971092,
  0.024510378018021584,
  0.008906450122594833,
  0.04068418964743614,
  0.07954475283622742,
  -0.004255122970789671,
  -0.007392190862447023,
  0.009286371991038322,
  -0.023490017279982567,
  0.047283969819545746,
  -0.012743659317493439,
  -0.01960396021604538,
  0.03601657971739769,
  0.01802999898791313,
  -0.017324430868029594,
 

In [10]:
# Insert the embeddings into the Pinecone index
index = pc.Index("rag-new")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Print index statistics
print(index.describe_index_stats())

Upserted count: 20
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
