In [13]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


In [8]:
# Connect to Pinecone and make sure the "rag" index exists.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index is rejected when an index with this name already exists, so
# only create it when missing; this keeps the cell safe to re-run.
if "rag" not in pc.list_indexes().names():
  pc.create_index(
    name="rag",
    dimension=1536,  # must equal the length of the embedding vectors upserted into this index
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
  )

In [10]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding avoids depending on the
# platform's default locale when decoding the JSON text.
with open("reviews.json", encoding="utf-8") as reviews_file:
  data = json.load(reviews_file)

data['reviews']

[{'professor': 'Dr. Emily Richards',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Dr. Richards is amazing! Her lectures are clear and she is very approachable.'},
 {'professor': 'Dr. John Anderson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Great professor, but his exams are tough. Study hard!'},
 {'professor': 'Dr. Lisa Wong',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'She knows her stuff, but the lectures can be a bit dry.'},
 {'professor': 'Dr. Michael Thompson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Excellent teacher! Makes complex topics easy to understand.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Literature',
  'stars': 4,
  'review': 'Passionate about literature, but grading can be harsh.'},
 {'professor': 'Dr. Robert Lewis',
  'subject': 'History',
  'stars': 3,
  'review': "His lectures are interesting, but he's not very responsive to emails."},
 {'professor': 'Dr. Rachel Evans',
  'subject': 'Economics',
  'stars': 5,
  'r

In [14]:
# Embed every review text and shape the results into Pinecone upsert records.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # reviews sent per embeddings request

processed_data = []
client = OpenAI()
reviews = data['reviews']

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches instead of paying one network round trip per review.
for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
  batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
  response = client.embeddings.create(
    input=[review['review'] for review in batch],
    model=EMBEDDING_MODEL,
  )
  # Each returned item carries the position of its input within the request;
  # sort on it so the pairing below does not rely on response ordering.
  batch_embeddings = sorted(response.data, key=lambda item: item.index)
  if len(batch_embeddings) != len(batch):
    raise RuntimeError(
      f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
    )

  for review, item in zip(batch, batch_embeddings):
    embedding = item.embedding
    # NOTE: the professor name is the vector ID, so two reviews for the same
    # professor would share an ID and the later one would replace the earlier
    # one when upserted.
    processed_data.append({
      "values": embedding,
      "id": review["professor"],
      "metadata": {
        "review": review["review"],
        "subject": review["subject"],
        "stars": review["stars"]
      }
    })


In [15]:
# Inspect the first prepared record (its 'values' entry is the full embedding list, so the output is long).
processed_data[0]

{'values': [-0.011544138,
  -0.015952855,
  0.015939714,
  0.056242358,
  -0.001389633,
  -0.019632263,
  -0.0195797,
  0.034087073,
  -0.035479993,
  -0.018962085,
  0.030959578,
  -0.0011457081,
  0.0049343472,
  -0.00033816867,
  -0.002038457,
  0.009297072,
  -0.018199923,
  -0.008127547,
  0.038896583,
  0.0673857,
  0.012089479,
  -0.024441773,
  0.054087278,
  -0.009191946,
  -0.05145913,
  -0.023351092,
  0.0349018,
  0.0011079285,
  0.026465446,
  0.04438941,
  0.032063402,
  -0.01335099,
  -0.012069768,
  -0.0007662693,
  -0.067175455,
  0.049461737,
  -0.02261521,
  0.029934602,
  -0.0095598865,
  -0.0049212063,
  0.016662456,
  0.0021600088,
  -0.008554621,
  0.04436313,
  0.037871607,
  -0.015637478,
  -0.020946337,
  0.0043955767,
  0.032851845,
  0.025571875,
  -0.08352253,
  0.011872657,
  0.03361401,
  -0.03187943,
  -0.07495477,
  0.036268435,
  0.040552318,
  0.049356613,
  -0.0076281982,
  -0.018843818,
  0.036899194,
  -0.021839907,
  -0.020867491,
  -0.0018462737,

In [16]:
# Get a handle on the "rag" index; later cells reuse `index`.
index = pc.Index("rag")

# Write all prepared records into the "ns1" namespace. The call's return value
# is the cell output.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [17]:
# Show index statistics (dimension, per-namespace vector counts, total) to confirm the upsert landed.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}