In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [11]:
# Connect to Pinecone using the API key read from the environment
# (load_dotenv() in the first cell loads it from a .env file if one is present).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists fails, so only create it when it is
# missing. This keeps the cell safe to re-run.
INDEX_NAME = "rag2"
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,  # must equal the length of the vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [3]:
import json

# Parse the review dataset. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data["reviews"]

[{'professor': 'Dr. John Smith',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Dr. Smith explains concepts clearly, but his exams are challenging.'},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Physics',
  'stars': 5,
  'review': "One of the best professors I've ever had. Makes physics fun and engaging."},
 {'professor': 'Dr. Emily Brown',
  'subject': 'Computer Science',
  'stars': 3,
  'review': 'The lectures are informative, but the homework assignments are overwhelming.'},
 {'professor': 'Dr. Michael Davis',
  'subject': 'Chemistry',
  'stars': 2,
  'review': 'Not the most engaging lecturer, and the labs are poorly organized.'},
 {'professor': 'Dr. Linda Wilson',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Dr. Wilson is incredibly knowledgeable and always willing to help.'},
 {'professor': 'Dr. James Martinez',
  'subject': 'History',
  'stars': 4,
  'review': 'Great storytelling in lectures, but the exams require a lot of memorization.'},
 {'professor': 'Dr. 

In [4]:
from transformers import BertTokenizer, BertModel
import torch

# The tokenizer and the encoder are both loaded from the same pretrained
# checkpoint, so the name is written once.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_embeddings(text):
    """Embed text with BERT by mean-pooling the final hidden states.

    Args:
        text: A string, or a list of strings, to embed. Inputs are truncated
            to at most 512 tokens.

    Returns:
        A NumPy array holding the pooled embedding(s), with size-1 dimensions
        squeezed out (a 1-D vector when a single string is passed).
    """
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():  # inference only: no gradients needed
        output = model(**encoded_input)

    hidden_states = output.last_hidden_state
    # Average over real tokens only. With padding=True, shorter texts in a
    # batch are padded, and a plain mean over dim=1 would let those padding
    # positions dilute the embedding. For a single string the mask is all
    # ones, so this matches the plain mean up to float rounding.
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (summed / token_counts).squeeze().numpy()
    return embeddings

# Build one vector record per review: the embedding of the review text, the
# professor's name as the record id, and the remaining fields as metadata.
processed_data = [
    {
        "values": get_embeddings(entry['review']),
        "id": entry["professor"],
        "metadata": {
            "review": entry["review"],
            "subject": entry["subject"],
            "stars": entry["stars"],
        },
    }
    for entry in data['reviews']
]



In [5]:
# Inspect the first record; at this point 'values' is still the NumPy array
# returned by get_embeddings.
processed_data[0]

{'values': array([ 1.29273549e-01,  3.27674359e-01, -1.20942757e-01, -2.34429041e-04,
         4.49604511e-01, -2.49305502e-01,  4.49233830e-01,  3.39336812e-01,
         4.58221525e-01, -9.79243666e-02,  1.42521143e-01,  3.60595621e-02,
         5.20489514e-01,  2.99629122e-01, -1.81778744e-01,  2.32710704e-01,
         3.14123750e-01, -3.50115359e-01, -4.29511056e-05, -3.57411951e-02,
        -8.81426781e-02,  1.04747362e-01, -2.96441704e-01,  5.52168727e-01,
         5.73980689e-01, -1.01495767e-02, -1.17868306e-02, -6.87154606e-02,
        -1.24880008e-01, -2.66246825e-01,  9.70722884e-02, -1.08653642e-01,
        -1.46859005e-01,  4.10215259e-02, -5.13049103e-02, -9.31987017e-02,
         8.10907558e-02,  1.42837226e-01, -9.71772894e-02,  1.00404881e-01,
        -4.11267191e-01, -1.45551944e-02,  1.19359478e-01,  2.47888863e-01,
        -8.58192965e-02, -3.26653779e-01,  2.02006757e-01, -3.55395377e-01,
         7.59122819e-02, -1.83356717e-01, -5.56320310e-01,  4.81051266e-01,
  

In [6]:

# Convert each embedding from a NumPy array to a plain Python list in place.
# Values that are already lists are skipped, so re-running this cell does
# nothing instead of raising AttributeError (lists have no .tolist()).
for item in processed_data:
    values = item['values']
    if not isinstance(values, list):
        item['values'] = values.tolist()


In [7]:
# Inspect the first record again to confirm 'values' is now a plain Python list.
processed_data[0]

{'values': [0.12927354872226715,
  0.32767435908317566,
  -0.12094275653362274,
  -0.00023442904057446867,
  0.44960451126098633,
  -0.24930550158023834,
  0.4492338299751282,
  0.3393368124961853,
  0.45822152495384216,
  -0.0979243665933609,
  0.14252114295959473,
  0.03605956211686134,
  0.520489513874054,
  0.2996291220188141,
  -0.18177874386310577,
  0.23271070420742035,
  0.3141237497329712,
  -0.3501153588294983,
  -4.295110556995496e-05,
  -0.03574119508266449,
  -0.08814267814159393,
  0.10474736243486404,
  -0.2964417040348053,
  0.5521687269210815,
  0.5739806890487671,
  -0.01014957670122385,
  -0.011786830611526966,
  -0.06871546059846878,
  -0.12488000839948654,
  -0.26624682545661926,
  0.0970722883939743,
  -0.10865364223718643,
  -0.14685900509357452,
  0.041021525859832764,
  -0.05130491033196449,
  -0.09319870173931122,
  0.08109075576066971,
  0.14283722639083862,
  -0.09717728942632675,
  0.10040488094091415,
  -0.4112671911716461,
  -0.014555194415152073,
  0.119

In [12]:
# Get a handle to the 'rag2' index and write every prepared record into the
# "ns1" namespace; the response is displayed as the cell output.
index = pc.Index('rag2')
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upsert_response

{'upserted_count': 20}

In [13]:
# Fetch the index statistics (dimension and per-namespace vector counts) to
# check the result of the upsert.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}