In [2]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [3]:
# Connect to Pinecone with the API key read from the environment
# (populated from .env by load_dotenv() in the import cell).
pinecone_ = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index_name = 'rag'
namespace = ''

# create_index raises when the index already exists, which makes this cell
# fail on every re-run (and on Restart & Run All). Create it only if missing.
if index_name not in pinecone_.list_indexes().names():
    pinecone_.create_index(
        name=index_name,
        # Length of the vectors stored in this index; it has to equal the
        # length of the embeddings that are upserted later.
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [4]:
import json

# Read the review dataset. The context manager closes the file handle as soon
# as parsing is done (the bare open() call left it open), and the encoding is
# pinned so the result does not depend on the platform default.
with open('reviews.json', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Emily Thorpe',
  'subject': 'Biology',
  'stars': 4,
  'review': "Dr. Thorpe's lectures are engaging and informative. She really knows how to make complex topics accessible."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Brilliant instructor! Prof. Chen's passion for AI is contagious. His projects are challenging but rewarding."},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Psychology',
  'stars': 3,
  'review': "Dr. Johnson's research is cutting-edge, but her lectures can be a bit dry. Office hours are helpful though."},
 {'professor': 'Prof. David Martinez',
  'subject': 'History',
  'stars': 4,
  'review': 'Prof. Martinez brings history to life with his storytelling. His exams are tough but fair.'},
 {'professor': 'Dr. Rachel Wong',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Dr. Wong is an excellent teacher. Her lab demonstrations are always exciting and memorable.'},
 {'professor': 'Prof. Jonat

In [5]:
# Embed every review text and build one upsert record per review:
#   values   -> embedding vector
#   id       -> professor name
#   metadata -> review text, subject and star rating
# NOTE(review): ids are professor names, so two reviews for the same professor
# would share an id -- confirm names are unique in reviews.json.
process_data = []
client = OpenAI()  # no key passed here; the SDK resolves its own credentials

reviews = data['reviews']

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# chunks instead of paying one network round trip per review.
EMBED_BATCH_SIZE = 100

for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model='text-embedding-3-small',
    )
    # Each returned item carries the position of its input within the request;
    # use it to pair embeddings with reviews and to keep the original order.
    for item in sorted(response.data, key=lambda entry: entry.index):
        review = batch[item.index]
        process_data.append({
            'values': item.embedding,
            'id': review['professor'],
            'metadata': {
                'review': review['review'],
                'subject': review['subject'],
                'stars': review['stars']
            }
        })

In [8]:
# Spot-check the first upsert record; the output includes its full embedding vector.
process_data[0]

{'values': [0.0058550057,
  -0.033283178,
  -0.01950124,
  0.013596588,
  0.034924828,
  0.018680414,
  -0.04114722,
  0.039849784,
  0.0058450764,
  -0.022678632,
  0.02422761,
  0.012444784,
  0.025670676,
  -0.03781096,
  0.014165871,
  0.020904588,
  -0.0081751635,
  -0.019157022,
  0.049885046,
  0.05438635,
  0.046998918,
  -0.0028248997,
  0.031747438,
  -0.020944307,
  -0.059946787,
  -0.021976959,
  0.02699459,
  0.028993698,
  -0.00043854624,
  0.021315001,
  0.05745783,
  -0.013146457,
  -0.018111132,
  -0.028411176,
  0.0052228374,
  0.042577047,
  0.008267838,
  0.030900132,
  0.025988415,
  0.010677359,
  0.058622874,
  -0.024810132,
  -0.045648526,
  0.02590898,
  0.008003055,
  0.0057821907,
  -0.010783273,
  -0.0054644514,
  0.022572719,
  0.017131437,
  -0.04284183,
  -0.0032055248,
  -0.017780153,
  -0.018720131,
  -0.08022913,
  0.03002635,
  -0.001966011,
  0.046389915,
  -0.01781987,
  -0.028411176,
  0.020004328,
  0.0005266692,
  -0.0032981986,
  0.012636751,
  

In [9]:
# Open a handle to the index and upload every prepared record.
# Use the index_name / namespace variables defined alongside the Pinecone
# client: the previous code passed the literal string "namespace", which
# wrote the vectors into a namespace called "namespace" rather than the
# configured one ('').
index = pinecone_.Index(index_name)
index.upsert(
    vectors=process_data,
    namespace=namespace,
)

{'upserted_count': 20}

In [7]:
# Show index statistics (dimension, per-namespace vector counts, total count)
# to confirm the upsert landed where expected.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20}},
 'total_vector_count': 20}