In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [7]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud='aws', region='us-east-1'),
)

In [2]:
import json
data = json.load(open("reviews.json"))
# data['reviews']

In [9]:
processed_data = []
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'],
        model = "text-embedding-3-small",

    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values":embedding,
        "id": review["professor"],
        "metadata": {
            "subject": review["subject"],
            "stars": review["stars"],
            "review": review["review"]
        }
    })

In [10]:
processed_data[0]

{'values': [-0.037024245,
  -0.0070935492,
  0.0023873749,
  0.03018586,
  0.018499365,
  0.0027733098,
  0.006098411,
  0.0590959,
  -0.0074635367,
  0.010959279,
  0.03988208,
  -0.00081293547,
  -0.031078933,
  -0.0031145914,
  -0.010232062,
  0.042663362,
  -0.045163967,
  -0.0048895734,
  0.037815254,
  0.06817973,
  0.0066151177,
  -0.007425262,
  0.029369336,
  -0.028042486,
  -0.047562506,
  -0.041642707,
  0.014761217,
  0.0005282685,
  0.0450619,
  -0.008975381,
  0.09435952,
  -0.0004401573,
  -0.007878178,
  -0.032890595,
  -0.029420368,
  0.010965657,
  0.003524448,
  0.046082556,
  0.0065959804,
  -0.0028626171,
  -0.023424024,
  0.005208528,
  -0.026332889,
  0.005495587,
  0.030900318,
  0.0026776236,
  -0.00414322,
  -0.014161583,
  0.033171274,
  0.038555227,
  -0.046924595,
  -0.0038370234,
  0.00308748,
  0.0041527883,
  -0.05343127,
  0.007693184,
  0.0070680333,
  0.033605054,
  0.016521847,
  -0.0035499642,
  0.04843006,
  -0.0016378317,
  -0.012975072,
  -0.0119

In [11]:
index = pc.Index('rag')
index.upsert(
    vectors = processed_data,
    namespace = 'ns1',
)

{'upserted_count': 20}

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}