In [16]:
import os
import sys

from dotenv import load_dotenv, find_dotenv
import numpy as np

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage


# new import
from langchain_google_genai import GoogleGenerativeAIEmbeddings # embedding from Google
from langchain_community.vectorstores import Chroma # access chroma from LangChain

In [5]:
# Load variables from the nearest .env file into the process environment.
# The variable name must match exactly what the provider library expects
# (GOOGLE_API_KEY for Google; other LLM providers use their own names).
# The result is assigned to `_` so it is not echoed as cell output.
_ = load_dotenv(find_dotenv())
# Alternatively, read the key into a variable and pass it explicitly when
# creating the model:
# google_api_key = os.getenv('GOOGLE_API_KEY')

# RAG Concept

# Embedding

At the heart of every LLM is the embedding, which converts a string into a vector representation. The idea is that, with vector representations, we can compare the similarity of two sentences based on the distance between their vectors.

In [7]:
# Embedding client reused by every cell below.
# NOTE(review): presumably authenticates with the GOOGLE_API_KEY loaded from
# the environment earlier in the notebook — confirm.
EMBEDDING_MODEL = GoogleGenerativeAIEmbeddings(model='models/text-embedding-004')

In [11]:
# Embed one sample string to inspect what the embedding model returns.
vec_query = EMBEDDING_MODEL.embed_query("Hello World")

In [17]:
# Convert the embedding to a NumPy array so its shape can be inspected and
# vector arithmetic can be applied to it.
vec_query = np.array(vec_query)

In [18]:
# Google's text-embedding-004 produces 768-dimensional vectors; other embedding
# models use different sizes.
vec_query.shape

(768,)

In [44]:
# Tiny Indonesian corpus covering three unrelated topics, used as the
# "documents" that the query is compared against.
contents = [
    "Cuacanya sangat dingin hari ini",  # "The weather is very cold today"
    "Buku terbaru yang ada di toko sangat bagus",  # "The newest book in the store is very good"
    "Taman itu berada di daerah Jakarta Kota",  # "That park is located in the Jakarta Kota area"
]

# Embed each sentence individually with the query endpoint and stack the
# resulting vectors into one 2-D array (one row per sentence).
# NOTE(review): `embed_documents` may be the intended batch API for corpus
# texts — confirm before switching, since it could change the vectors.
vec_contents = np.array(list(map(EMBEDDING_MODEL.embed_query, contents)))

In [25]:
# One row per sentence in `contents`, one column per embedding dimension.
vec_contents.shape

(3, 768)

In [49]:
# Query sentence (Indonesian): "Nearby there is a nice, lush green area".
# It shares no keywords with the park sentence in `contents`, so any match
# has to come from meaning rather than word overlap.
query = "Di dekat sini terdapat area hijau yang bagus dan rimbun"
# Embed the query and convert it to a NumPy array for the distance computation.
vec_query = np.array(EMBEDDING_MODEL.embed_query(query))

In [50]:
# Reference: https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
# Euclidean (L2) distance between the query vector and each content vector:
# broadcasting subtracts every row of `vec_contents` from the 1-D `vec_query`,
# and `axis=1` takes the norm per row, giving one distance per sentence.
# A lower distance means the sentence is more similar to the query.
np.linalg.norm(vec_query-vec_contents, axis=1)

array([0.96018157, 0.95823276, 0.78226486])