In [2]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from openai.types import CreateEmbeddingResponse, Embedding

  from tqdm.autonotebook import tqdm


In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [5]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'John Smith',
  'review': 'Great professor, explains concepts clearly and is very approachable.',
  'subject': 'Math',
  'stars': 5},
 {'professor': 'Susan Johnson',
  'review': 'Engaging lectures and always willing to help students.',
  'subject': 'Physics',
  'stars': 4},
 {'professor': 'Michael Brown',
  'review': 'Makes difficult topics easy to understand.',
  'subject': 'Computer Science',
  'stars': 5},
 {'professor': 'Emily Davis',
  'review': 'Her classes are well-organized and informative.',
  'subject': 'History',
  'stars': 4},
 {'professor': 'Christopher Garcia',
  'review': 'Enthusiastic about the subject and very knowledgeable.',
  'subject': 'Chemistry',
  'stars': 5},
 {'professor': 'Jessica Martinez',
  'review': 'Always ready to answer questions and provide feedback.',
  'subject': 'Biology',
  'stars': 4},
 {'professor': 'David Wilson',
  'review': 'His teaching style is engaging and easy to follow.',
  'subject': 'Economics',
  'stars': 5},
 {'profess

In [7]:
processed_data = []
client = OpenAI()

# Create embeddings for each review
for review in data["reviews"]:
    response = client.embeddings.create(
        input=review['review'], model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata":{
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

# Insert the embeddings into the Pinecone index
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Print index statistics
print(index.describe_index_stats())

Upserted count: 20
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
