In [22]:
%pip install -q -U google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [5]:
# load environmental variables into nb
from dotenv import load_dotenv
load_dotenv()
import os
from pinecone import Pinecone, ServerlessSpec
# import the Python SDK
import google.generativeai as genai
# retrieve the API key from the environment
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
# configure the Gemini API using the retrieved key
genai.configure(api_key=GEMINI_API_KEY)

In [7]:
# retrieve the API key from the environment
pc = Pinecone(api_key = os.getenv("PINECONE_API_KEY"))
# create pinecone index
pc.create_index(
    name="rag", 
    dimension=768,  # gemini text embedding model supports up to 768 dimensions (length of the document embedding vector is 768)
    metric="cosine",  # used for semantic search
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': 'd77316014d3ec28e9a48cbf13c2d2b39', 'Date': 'Sun, 25 Aug 2024 18:53:11 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [8]:
import json
data = json.load(open("reviews.json"))
data['reviews']

[{'professorName': 'Dr. Sarah Johnson',
  'course': 'Introduction to Psychology',
  'rating': 4,
  'review': 'Dr. Johnson is very knowledgeable and makes the class engaging. Her examples are relatable and help in understanding complex concepts.'},
 {'professorName': 'Prof. Michael Chen',
  'course': 'Advanced Calculus',
  'rating': 5,
  'review': "Professor Chen is exceptional! His explanations are clear, and he's always willing to help during office hours. Challenging but rewarding course."},
 {'professorName': 'Dr. Emily Rodriguez',
  'course': 'World History',
  'rating': 3,
  'review': "The course content is interesting, but Dr. Rodriguez's lectures can be a bit dry. More interactive elements would improve the class."},
 {'professorName': 'Prof. David Thompson',
  'course': 'Organic Chemistry',
  'rating': 4,
  'review': 'Tough course, but Prof. Thompson makes it manageable. His practice problems are especially helpful for exam preparation.'},
 {'professorName': 'Dr. Lisa Patel',
 

In [9]:
processed_data = []

for review in data["reviews"]:
    # generate embedding
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=review['review']
    )
    embedding = response['embedding']
    
    # add to the processed data
    processed_data.append({
        "values": embedding,
        "id": review["professorName"],
        "metadata": {
            "review": review["review"],
            "course": review["course"],
            "rating": review["rating"]
        }
    })

In [10]:
len(processed_data[0]['values'])
processed_data[19]

{'values': [0.08200847,
  0.012068306,
  -0.027874943,
  0.010626679,
  -0.03337673,
  -0.006424036,
  -0.031818718,
  0.02154729,
  0.018877499,
  0.029667746,
  0.05627522,
  0.022060944,
  0.02643932,
  0.0037484397,
  -0.0008335538,
  -0.014432067,
  0.029483806,
  0.0006963744,
  -0.04219324,
  0.011367902,
  0.032126654,
  -0.015777241,
  0.015773168,
  -0.022171846,
  0.014752186,
  -0.013767421,
  -0.0064005475,
  -0.018016031,
  0.033082124,
  -0.028021075,
  0.048203643,
  0.04166721,
  -0.009751196,
  -0.07281184,
  -0.03652561,
  0.0023495015,
  -0.00940853,
  -0.0147644915,
  -0.030199973,
  -0.07008442,
  -0.016028775,
  0.01991008,
  -0.028893145,
  0.026235146,
  -0.04129293,
  0.036412135,
  -0.028693464,
  0.08735985,
  -0.04599868,
  0.091042556,
  0.014021267,
  0.07767947,
  -0.04563229,
  0.024540069,
  -0.036789283,
  -0.011906124,
  -0.020014297,
  -0.080087036,
  0.024822526,
  -0.021503178,
  -0.017287845,
  -0.04080307,
  -0.026207043,
  -0.0582928,
  0.09116

In [11]:
index = pc.Index('rag')  # index is a collection in firebase
index.upsert(
    vectors=processed_data,
    namespace="ns1"  # namespace is a document in firebase
)

{'upserted_count': 20}

In [12]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}