<a href="https://colab.research.google.com/github/aipractices/genai-application-collab-excercise/blob/main/RAG_using_Embeddings_and_Similarity_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting Started

Here's how to get started:
- Sign up for [Google Cloud](https://console.cloud.google.com/) (if you haven't already) and enable the Gemini API.
- Head over to [Google AI Studio](https://aistudio.google.com/apikey) and create an API key for your project.

# Understanding Embeddings

Embeddings are vector representations of text, used to measure how closely a query relates to a piece of data. In the example below, each piece of content has a title; using embeddings and similarity search, we can find the closest match to the prompt.

In [9]:
!pip install requests
!pip install beautifulsoup4 pandas numpy

import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import google.generativeai as genai
from google.colab import userdata

# Configure the Gemini client with the API key read through google.colab's
# userdata under the name 'GOOGLE_API_KEY'.
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

# Name of the embedding model; read as a global by pageEmbeddings and
# findBestPage below.
model = 'models/embedding-001'

def pageEmbeddings(title, text):
  """Return the embedding of one knowledge-base document.

  Args:
    title: Title of the document, forwarded as the `title` of the request.
    text: Content to embed.

  Returns:
    The value stored under the "embedding" key of the API response.
  """
  response = genai.embed_content(
      model=model,
      content=text,
      title=title,
      task_type="retrieval_document",
  )
  return response["embedding"]


def knowledgeBase():
  """Build the table of known pages together with their embeddings.

  Returns:
    A DataFrame with one row per page and the columns 'Title', 'Url' and
    'Embeddings'. Each embedding is computed by pageEmbeddings from the
    page title and the URL string itself (the page is not downloaded here).
  """
  pages = [
    {"title": "Open-sources and Community Applications",
     "content": "https://thinkuldeep.com/about/open-sources/"},
    {"title": "Live Streaming, Webinars",
     "content": "https://thinkuldeep.com/about/streaming/"},
    {"title": "Patents Granted",
     "content": "https://thinkuldeep.com/about/patents/"},
    {"title": "Awards received and public coverage",
     "content": "https://thinkuldeep.com/about/recognitions/"},
    {"title": "Me, my family and some moments, travel and trips",
     "content": "https://thinkuldeep.com/about/moments/"},
    {"title": "Book foreword, reviewed and authored",
     "content": "https://thinkuldeep.com/about/books/"},
  ]

  def embed_row(row):
    # The URL is what gets embedded as the document text.
    return pageEmbeddings(row['Title'], row['Url'])

  catalogue = pd.DataFrame(pages).rename(
      columns={"title": 'Title', "content": 'Url'})
  catalogue['Embeddings'] = catalogue.apply(embed_row, axis=1)
  return catalogue


def findBestPage(query, dataframe):
  """Return the URL of the page whose embedding best matches the query.

  Args:
    query: Text to search for.
    dataframe: Table with an 'Embeddings' column and a 'Url' column, as
      produced by knowledgeBase.

  Returns:
    The 'Url' value of the row with the largest dot product between its
    embedding and the query embedding.
  """
  response = genai.embed_content(
      model=model,
      content=query,
      task_type="retrieval_query",
  )
  query_vector = response["embedding"]

  # Stack the per-row embeddings into one matrix so a single dot product
  # scores every page at once.
  page_matrix = np.stack(dataframe['Embeddings'])
  scores = np.dot(page_matrix, query_vector)

  best_row = np.argmax(scores)
  return dataframe.iloc[best_row]['Url']


# Example query used to demonstrate the similarity search.
prompt = "Tell me in brief Kuldeep's streaming"

# Build the knowledge base and pick the page that scores highest for the query.
bestPage = findBestPage(prompt, knowledgeBase())

# Show the URL that was selected.
print(bestPage)

https://thinkuldeep.com/about/streaming/


# Implementing RAG using Similarity Search on Embeddings

Let's build a chatbot that gives better results based on the best match and additional context. Ask specific questions about Kuldeep.

In [None]:
!pip install requests
!pip install beautifulsoup4 pandas numpy
from google.colab import userdata

import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import google.generativeai as genai


# Configure the Gemini client with the API key read through google.colab's
# userdata under the name 'GOOGLE_API_KEY'.
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

# Generative model that produces the answers. It has its own name so that it
# is not confused with the embedding model name stored in `model` below.
chat_model = genai.GenerativeModel("models/gemini-2.0-flash")

# Single chat session shared by every call to generation().
chat = chat_model.start_chat()

# Name of the embedding model; read as a global by pageEmbeddings and
# findBestPage.
model = 'models/embedding-001'
def pageEmbeddings(title, text):
  """Return the embedding of one knowledge-base document.

  Args:
    title: Title of the document, forwarded as the `title` of the request.
    text: Content to embed.

  Returns:
    The value stored under the "embedding" key of the API response.
  """
  response = genai.embed_content(
      model=model,
      content=text,
      title=title,
      task_type="retrieval_document",
  )
  return response["embedding"]
def knowledgeBase():
  """Build the table of known pages together with their embeddings.

  Returns:
    A DataFrame with one row per page and the columns 'Title', 'Url' and
    'Embeddings'. Each embedding is computed by pageEmbeddings from the
    page title and the URL string itself (the page is not downloaded here).
  """
  pages = [
    {"title": "Open-sources and Community Applications",
     "content": "https://thinkuldeep.com/about/open-sources/"},
    {"title": "Live Streaming, Webinars",
     "content": "https://thinkuldeep.com/about/streaming/"},
    {"title": "Patents Granted",
     "content": "https://thinkuldeep.com/about/patents/"},
    {"title": "Awards received and public coverage",
     "content": "https://thinkuldeep.com/about/recognitions/"},
    {"title": "Me, my family and some moments, travel and trips",
     "content": "https://thinkuldeep.com/about/moments/"},
    {"title": "Book foreword, reviewed and authored",
     "content": "https://thinkuldeep.com/about/books/"},
  ]

  def embed_row(row):
    # The URL is what gets embedded as the document text.
    return pageEmbeddings(row['Title'], row['Url'])

  catalogue = pd.DataFrame(pages).rename(
      columns={"title": 'Title', "content": 'Url'})
  catalogue['Embeddings'] = catalogue.apply(embed_row, axis=1)
  return catalogue


def findBestPage(query, dataframe):
  """Return the URL of the page whose embedding best matches the query.

  Args:
    query: Text to search for.
    dataframe: Table with an 'Embeddings' column and a 'Url' column, as
      produced by knowledgeBase.

  Returns:
    The 'Url' value of the row with the largest dot product between its
    embedding and the query embedding.
  """
  response = genai.embed_content(
      model=model,
      content=query,
      task_type="retrieval_query",
  )
  query_vector = response["embedding"]

  # Stack the per-row embeddings into one matrix so a single dot product
  # scores every page at once.
  page_matrix = np.stack(dataframe['Embeddings'])
  scores = np.dot(page_matrix, query_vector)

  best_row = np.argmax(scores)
  return dataframe.iloc[best_row]['Url']

def retrive(url, timeout=10):
  """Download a web page and return the text of its paragraphs and headings.

  The text of every <p> element comes first, followed by all <h2>, then
  <h3>, then <h4> elements, each terminated by a newline.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block the chatbot indefinitely.

  Returns:
    The extracted text. On failure no exception is raised; instead a
    message starting with "Error fetching URL:" (network or HTTP error)
    or "An error occurred:" (anything else) is returned.
  """
  try:
    response = requests.get(url, timeout=timeout)
    # Treat 4xx/5xx responses as failures instead of parsing the error page
    # as if it were real content.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    lines = []
    for tag in ("p", "h2", "h3", "h4"):
      for element in soup.find_all(tag):
        lines.append(element.get_text(strip=True) + "\n")
    return "".join(lines)
  except requests.exceptions.RequestException as e:
    return f"Error fetching URL: {e}"
  except Exception as e:
    # Best effort: the caller always receives a string to use as context.
    return f"An error occurred: {e}"

def augment(prompt, context):
  """Combine a user prompt with retrieved context into one message.

  Args:
    prompt: The user's question. May be empty when only context is sent
      (for example to prime the chat before the first question).
    context: Additional text to append after the prompt.

  Returns:
    "<prompt>. Here is more context: <context>", or just
    "Here is more context: <context>" when the prompt is None, empty or
    only whitespace, so the message never starts with a stray ". ".
  """
  if prompt is None or not str(prompt).strip():
    return f"Here is more context: {context}"
  return f"{prompt}. Here is more context: {context}"

def generation(prompt):
  """Send the prompt to the shared chat session and return its response."""
  reply = chat.send_message(prompt)
  return reply

def run_chatbot():
    """Run an interactive question/answer loop about Kuldeep Singh.

    The chat session is first primed with the text of the general "about"
    page. Then, for every question, the best-matching page is looked up in
    the knowledge base, its text is retrieved and sent along with the
    question, and the answer is printed. The loop ends when the user types
    'quit' (case-insensitive) or when input is closed or interrupted.
    """
    print("Hello! ask me about Kuldeep Singh! Type 'quit' to exit.")

    # Prime the conversation with general background before any question.
    generation(augment("", retrive("https://thinkuldeep.com/about/")))

    # Embed the known pages once; the table is reused for every question.
    df = knowledgeBase()

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session cleanly
            # instead of surfacing a traceback.
            print("\nthinkuldeep: Goodbye!")
            break

        if not user_input:
            # Nothing to embed or answer; ask again.
            continue
        if user_input.lower() == 'quit':
            print("thinkuldeep: Goodbye!")
            break

        bestPageMatching = findBestPage(user_input, df)
        response = generation(augment(user_input, retrive(bestPageMatching)))
        print(f"thinkuldeep: {response.text} \n\n")

# Start the interactive chat loop.
run_chatbot()

Hello! ask me about Kuldeep Singh! Type 'quit' to exit.
You: what his contribution to society
thinkuldeep: Okay, incorporating the additional context, here's an updated view of Kuldeep's contributions to society, with a focus on showcasing their impact:

**VIII. Societal Contributions & Community Engagement**

Kuldeep is deeply committed to contributing to society through knowledge sharing, mentoring, and support for diverse communities. His impact extends beyond his professional work, influencing the lives of individuals and fostering innovation.

*   **Knowledge Sharing & Education:**
    *   **Events & Articles:** Shared knowledge and experience with 50,000+ people through 60+ events and 130+ articles on technology, leadership, and motivation.
    *   **Publications:** Authored "Exploring the Metaverse" to promote responsible technology adoption.
    *   **Guest Lectures:** Frequent guest lecturer at universities such as NIT Calicut and DesignersX, inspiring and educating the next g