In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): this is called before the remaining imports, presumably so that
# anything those libraries read from the environment at import time is already
# set — confirm before reordering.
from dotenv import load_dotenv
load_dotenv()
import os
import google.generativeai as genai
# NOTE(review): in the visible part of this notebook, HuggingFaceEmbeddings and
# SentenceTransformer are referenced only from commented-out code.
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create the Pinecone client; the API key is read from the PINECONE_API_KEY
# environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the serverless index only when it does not exist yet. An unconditional
# create_index() call fails once the index is present, which would break every
# re-run of this notebook after the first one.
index_name = "rag-prof"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the length of the vectors stored in this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
# Hand the Gemini API key to the google.generativeai client, which is used
# for the embed_content calls further down. The key is read from the
# GEMINI_API_KEY environment variable; os.getenv returns None when it is unset.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [5]:
import json

# Read the review dataset. The context manager closes the file handle as soon
# as parsing finishes (the bare open() call left that to the garbage
# collector), and the explicit UTF-8 encoding avoids depending on the
# platform's default locale encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records as the cell output.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Dr. Smith is very knowledgeable and explains concepts clearly. His lectures are engaging, but his exams are tough.'},
 {'professor': 'Prof. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Prof. Johnson is amazing! She makes complex topics simple and is always willing to help after class.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Chemistry',
  'stars': 3,
  'review': "Dr. Brown's lectures can be a bit dry, but the material is well-organized. His lab sessions are more interesting."},
 {'professor': 'Prof. Susan Miller',
  'subject': 'History',
  'stars': 5,
  'review': 'Prof. Miller is a fantastic storyteller. She brings history to life with her engaging lectures.'},
 {'professor': 'Dr. Michael Davis',
  'subject': 'Computer Science',
  'stars': 2,
  'review': "Dr. Davis knows his stuff, but his teaching style is very technical and hard to follow. Wouldn't recommend if you

In [20]:
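# NOTE: everything in this cell is commented out. It is an alternative that
# embeds the reviews locally with SentenceTransformer instead of Gemini.
# NOTE(review): all-MiniLM-L6-v2 is documented as producing 384-dimensional
# vectors, which would not fit the 768-dimension "rag-prof" index — confirm
# before re-enabling.
# processed_data = []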
# # hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# for review in data['reviews']:
#     # embedding = hf_embeddings.embed_query(review['review'])
#     embedding = model.encode(review['review'])

#     processed_data.append({
#         "values": embedding,
#         "id": review['professor'],
#         "metadata": {
#             "review": review['review'],
#             "subject": review['subject'],
#             "stars": review['stars']
#         }
#     })



In [9]:
# Build one Pinecone record per review: embed the review text with Gemini and
# attach the remaining review fields as metadata.
processed_data = []
seen_ids = set()

for review in data['reviews']:
    # The professor name doubles as the vector id. Pinecone's upsert overwrites
    # a record whose id already exists, so a repeated professor would silently
    # replace an earlier review. Fail loudly instead, before spending an API call.
    vector_id = review['professor']
    if vector_id in seen_ids:
        raise ValueError(
            f"Duplicate vector id {vector_id!r}: each review needs a unique id, "
            "otherwise the upsert would overwrite an earlier review."
        )
    seen_ids.add(vector_id)

    embedding = genai.embed_content(
        model="models/text-embedding-004",
        content=review['review'],
        task_type="retrieval_document",
        title="Embedding of single string",
    )

    processed_data.append({
        # The 'embedding' entry of the response is stored as the vector values.
        "values": embedding['embedding'],
        "id": vector_id,
        "metadata": {
            "review": review['review'],
            "subject": review['subject'],
            "stars": review['stars']
        }
    })

In [10]:
# Display the first prepared record to sanity-check its structure
# (values / id / metadata).
# NOTE(review): this prints the record's entire 'values' list; consider showing
# only a slice to keep the cell output short.
processed_data[0]

{'values': [0.008624158,
  0.017789654,
  -0.08453333,
  -0.02546286,
  0.0068894466,
  0.0016474205,
  0.027853524,
  0.058842406,
  -0.00031793342,
  0.05731818,
  0.05300966,
  0.028737905,
  0.06555499,
  0.044663504,
  -0.015209429,
  -0.055725034,
  0.053131703,
  -0.013752637,
  -0.08321442,
  0.026345648,
  -0.0077350195,
  -0.028938329,
  0.041791838,
  -0.05478856,
  -0.01661867,
  -0.0027928208,
  -0.008217581,
  -0.06234568,
  0.0061679133,
  -0.009169729,
  0.06770501,
  0.030020902,
  -0.023179011,
  0.0015828047,
  0.0136560025,
  0.06667564,
  0.011251162,
  0.010701771,
  0.049810465,
  -0.025159089,
  -0.054094296,
  0.032142207,
  -0.053900197,
  0.058667976,
  -0.063091785,
  -0.031251725,
  -0.0050302045,
  0.08069299,
  0.0023459191,
  0.0614004,
  0.04326721,
  0.06578986,
  -0.05715809,
  0.037889723,
  0.0033740667,
  -0.031646162,
  -0.028238572,
  -0.026267989,
  0.025854869,
  -0.013850571,
  -0.029203849,
  -0.023454173,
  -0.026590064,
  -0.052922275,
  0.

In [11]:
# Open a handle to the "rag-prof" index and write every prepared record into
# namespace "ns1". The upsert call stays the last expression so that its
# response is shown as the cell output.
index = pc.Index('rag-prof')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [12]:
# Fetch the index statistics (dimension, per-namespace vector counts, total
# vector count) to confirm that the upsert above was stored.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}