In [24]:
# Load variables from a local .env file into the process environment first,
# before any API client is constructed in the cells that follow.
from dotenv import load_dotenv
load_dotenv()

# os: access to environment variables; OpenAI: embeddings client;
# Pinecone / ServerlessSpec: vector index client and its serverless config.
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [12]:
# Connect to Pinecone. The key is read from the environment instead of being
# written into the notebook, so the secret never ends up in version control.
# Assumes the .env file (or the shell) defines PINECONE_API_KEY; a missing
# variable fails fast with a KeyError rather than an opaque auth error later.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Create the "rag" index only when it does not exist yet, so that re-running
# this cell (e.g. Restart Kernel -> Run All) does not fail on an existing index.
# dimension=1536 must equal the length of the vectors that will be upserted.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )



In [13]:
import json

# Read the review dataset. The context manager closes the file handle as soon
# as parsing finishes (the bare open() call leaked it), and the explicit UTF-8
# encoding keeps decoding independent of the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data["reviews"]

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Great lecturer, very knowledgeable and approachable.'},
 {'professor': 'Dr. Lisa Taylor',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Excellent teacher! Makes difficult topics easy to understand.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Good explanations, but the lectures can be a bit dry.'},
 {'professor': 'Dr. Susan Johnson',
  'subject': 'Biology',
  'stars': 2,
  'review': "Not very engaging and doesn't respond to emails promptly."},
 {'professor': 'Dr. Robert Lee',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Fantastic professor! Very passionate about the subject.'},
 {'professor': 'Dr. Emma Davis',
  'subject': 'English Literature',
  'stars': 4,
  'review': 'Interesting lectures and fair grading.'},
 {'professor': 'Dr. James Wilson',
  'subject': 'Economics',
  'stars': 1,
  'review': 'Very hard exams and unclear grading

In [31]:
# Embedding model used for every review, and the number of reviews sent to the
# embeddings endpoint per request.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100

# Read the key from the environment instead of hardcoding it in the notebook.
# Assumes the .env file (or the shell) defines OPENAI_API_KEY; a missing
# variable raises KeyError immediately.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Build one Pinecone-ready record per review: the embedding of the review text
# as "values", the professor name as "id", and the original fields as metadata.
# NOTE(review): ids are professor names, so two reviews for the same professor
# would share an id and collide on upsert -- confirm names are unique in
# reviews.json or switch to a per-review id.
processed_data = []
reviews = data["reviews"]

for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]

    # One request per batch instead of one request per review.
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )

    # Each returned item carries the position of its input in `index`; sort on
    # it so every embedding is paired with the review that produced it.
    ordered_items = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        processed_data.append(
            {
                "values": embedding,
                "id": review["professor"],
                "metadata": {
                    "professor": review["professor"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                    "review": review["review"],
                },
            }
        )
    



In [32]:
# Spot check: echo the first prepared record (its embedding "values", "id" and
# "metadata"). NOTE(review): this prints the entire embedding list; slicing the
# values for display would keep the output short.
processed_data[0]

{'values': [-0.035858747,
  -0.03616242,
  -0.055724036,
  0.0071363207,
  0.010312237,
  0.004887241,
  -0.03707344,
  0.04071752,
  -0.021371003,
  -0.007098362,
  0.026925692,
  -0.017701618,
  -0.01014142,
  -0.014690192,
  0.008572442,
  -0.020751003,
  -0.022003656,
  0.0011435195,
  0.03459344,
  0.032113444,
  0.031025281,
  -0.01749917,
  0.017271414,
  0.026115898,
  -0.023522023,
  -0.052231796,
  0.033935484,
  -0.0029054568,
  -0.00071924686,
  0.041704457,
  0.09135503,
  -0.010457747,
  -0.024521613,
  0.01965019,
  -0.03264487,
  0.055369753,
  0.008483871,
  0.029380385,
  -0.014411826,
  -0.0009157646,
  0.049068533,
  0.003564997,
  0.003767446,
  0.013032643,
  0.01629713,
  -0.038819563,
  -0.02844406,
  0.014867336,
  0.03292324,
  0.050865266,
  -0.010856318,
  -0.020181617,
  0.06326526,
  0.026470182,
  -0.04026201,
  -0.0029924465,
  -0.009970604,
  -0.00022894115,
  0.023218349,
  -0.027229367,
  0.04403262,
  0.015133049,
  0.033075076,
  -0.013336316,
  -0.

In [33]:
# Get a handle on the "rag" index, then write every prepared record into the
# "ns1" namespace with a single upsert; the call's result is the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)


{'upserted_count': 20}

In [34]:
# Fetch the index statistics to verify the upsert: the result reports the index
# dimension, its fullness, and the vector count per namespace and in total.
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}