In [24]:
import os
import sys
from typing import Optional, Any

from dotenv import load_dotenv, find_dotenv
import numpy as np

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage


# new import
from langchain_google_genai import GoogleGenerativeAIEmbeddings # embedding from Google
from langchain_community.vectorstores import Chroma # access chroma from LangChain

In [10]:
# Load variables from the nearest .env file into the process environment.
# The file must define GOOGLE_API_KEY (or the equivalent variable for another
# LLM provider); the variable name must match exactly.
_ = load_dotenv(find_dotenv())
# Alternatively, read the key into a variable and pass it explicitly to the model:
# google_api_key = os.getenv('GOOGLE_API_KEY') # pass this to the model creation

# RAG Concept

# Embedding

At the heart of every LLM is the embedding, which converts a string into a vector representation. The idea is that, with vector representations, we can compare the similarity of two sentences based on the distance between their vectors.

In [11]:
# Shared embedding client used by every cell below.
# Presumably picks up GOOGLE_API_KEY from the environment loaded earlier — confirm.
EMBEDDING_MODEL = GoogleGenerativeAIEmbeddings(model='models/text-embedding-004')

In [12]:
# Embed a single string to see what an embedding looks like.
vec_query = EMBEDDING_MODEL.embed_query("Hello World")

In [5]:
# Wrap the embedding in a NumPy array so .shape and vector arithmetic are available.
vec_query = np.array(vec_query)

In [7]:
# Display the raw embedding values.
# NOTE(review): this prints the whole vector; a slice such as vec_query[:5] would keep the output short.
vec_query

array([ 1.41921565e-02, -9.10687726e-03, -4.07615900e-02,  3.78295407e-03,
       -5.71815018e-03, -5.98200085e-03,  5.99906370e-02,  2.03863699e-02,
        2.55044959e-02,  4.73048165e-02, -4.99246120e-02, -1.69505260e-03,
        3.48231532e-02, -1.07618682e-02, -2.28640344e-02, -1.78383943e-02,
       -8.82386509e-03,  1.31277665e-02, -1.09334208e-01,  1.41559495e-02,
        6.40330696e-03,  7.62484781e-03, -2.09325720e-02, -4.91950065e-02,
       -1.12417219e-02, -4.25484381e-04,  1.31445769e-02,  2.33667046e-02,
        1.38423229e-02,  4.45795283e-02, -4.18479554e-02,  5.61341196e-02,
       -2.10907334e-03, -2.36702487e-02,  6.81589311e-03,  6.86355727e-03,
       -5.09277508e-02,  2.82000564e-02,  7.36088492e-03, -5.37831187e-02,
       -2.64719054e-02,  3.88621613e-02,  7.20694754e-03,  1.37184178e-02,
        5.10124974e-02, -2.18461026e-02,  2.15280335e-02, -2.06419057e-03,
        1.32721965e-03,  5.67283598e-04,  4.12434302e-02,  1.10045355e-02,
       -9.87707302e-02,  

In [6]:
vec_query.shape # text-embedding-004 produces 768-dimensional vectors; other models may use a different size

(768,)

In [8]:
# Three short Indonesian sentences (about the weather, a book, and a park)
# that serve as the tiny "corpus" for the similarity demo.
contents = [
    "Cuacanya sangat dingin hari ini",
    "Buku terbaru yang ada di toko sangat bagus",
    "Taman itu berada di daerah Jakarta Kota"
]
# Embed each sentence separately and stack the vectors row-wise (one row per sentence).
vec_contents = np.array(list(map(EMBEDDING_MODEL.embed_query, contents)))

In [9]:
# One row per sentence in `contents`, one column per embedding dimension.
vec_contents.shape

(3, 768)

In [10]:
# Indonesian query, roughly: "Nearby there is a nice, lush green area".
query = "Di dekat sini terdapat area hijau yang bagus dan rimbun"
# Embed the query with the same model so it can be compared against vec_contents.
vec_query = np.array(EMBEDDING_MODEL.embed_query(query))

In [11]:
# Reference: https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
# Euclidean distance between the query and every content vector; a lower value means more similar.
# vec_query is broadcast against each row of vec_contents, and axis=1 reduces each row to one distance.
np.linalg.norm(vec_query-vec_contents, axis=1)

array([0.96018157, 0.95823276, 0.78226486])

# Vector Database

A vector database is a database specifically tailored to store and query vector data. 
Vector databases also natively support similarity search against a query vector, with several selection methods such as taking the top-k results.

In [17]:
# Relative path passed to Chroma as its persistence directory.
VECTORDB_DIRECTORY = '../data/chromadb/'
# Vector store handle; EMBEDDING_MODEL is supplied as the embedding function.
VECTORDB = Chroma(persist_directory=VECTORDB_DIRECTORY, embedding_function=EMBEDDING_MODEL)

In [20]:
# Prepare the content for insertion into the database.
# The vector store works with Document objects, so wrap each sentence in one
# and record its position in `contents` as metadata.
from langchain_core.documents import Document
lc_documents = [
    Document(page_content=sentence, metadata={'content_id': position})
    for position, sentence in enumerate(contents)
]

In [16]:
# Inspect the Document objects built in the previous cell.
lc_documents

[Document(metadata={'content_id': 0}, page_content='Cuacanya sangat dingin hari ini'),
 Document(metadata={'content_id': 1}, page_content='Buku terbaru yang ada di toko sangat bagus'),
 Document(metadata={'content_id': 2}, page_content='Taman itu berada di daerah Jakarta Kota')]

In [17]:
# Insert the documents into the vector store; the cell output is the list of ids returned.
# NOTE(review): the ids look auto-generated, so re-running this cell presumably stores duplicates — confirm.
VECTORDB.add_documents(lc_documents)

['40d79ad2-f0bc-4c86-91a6-97fdea5880aa',
 '6d2b0acd-7215-4a07-bddc-aed1e4f81f72',
 'b491453a-96d7-4b8b-8a42-d95b78f1252f']

In [21]:
# Retrieve data based on the query.
# Uses the "similarity_score_threshold" search type with k=1 and score_threshold=0.1;
# presumably this returns at most one document whose relevance score reaches 0.1 — confirm against the LangChain docs.
retriever = VECTORDB.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 1, "score_threshold": 0.1})
retriever.invoke(query)

[Document(metadata={'content_id': 2}, page_content='Taman itu berada di daerah Jakarta Kota')]

# RAG

In [3]:
# data loading
import json

# Read the article dump used as the RAG knowledge base.
# The encoding is given explicitly: JSON text is UTF-8, while open() otherwise
# falls back to the platform's locale encoding, which can mis-decode or fail on
# non-ASCII characters on some systems.
with open('../data/rag/data.json', 'r', encoding='utf-8') as f_:
    data = json.load(f_)

In [7]:
# Pretty-print the first record to inspect the structure of the loaded data.
from pprint import pprint
pprint(data[0])

{'content': 'KOMPAS.com - Shin Tae-yong telah diberhentikan dari tugas melatih '
            'timnas Indonesia. Kendati demikian, STY tetap ingin memajukan '
            'sepak bola Indonesia melalui akademinya\n'
            '\n'
            'Lembar perjalanan Shin Tae-yong sebagai pelatih timnas Indonesia '
            'resmi berakhir.\n'
            '\n'
            'Ketua Umum PSSI, Erick Thohir, menyampaikan langsung keputusan '
            'pemberhentian STY dalam sebuah sesi konferensi pers di Menara '
            'Danareksa, Jakarta, Senin (6/1/2025)\n'
            '\n'
            'Shin Tae-yong pun mesti mengakhiri pengabdian bersama timnas '
            'Indonesia lebih cepat, meski sejatinya ia masih terikat kontrak '
            'sampai 2027.\n'
            '\n'
            'Baca juga: Shin Tae-yong, Timnas Indonesia, dan Hierarki di '
            'Budaya Korea Selatan\n'
            '\n'
            'Kepergian STY dari kursi pelatih timnas Indonesia sempat '
            '

In [22]:
# Rebuild the database: drop the existing collection, then create a new
# store handle pointing at the same persistence directory.
VECTORDB.delete_collection()
VECTORDB = Chroma(persist_directory=VECTORDB_DIRECTORY, embedding_function=EMBEDDING_MODEL)

# One Document per loaded record: the 'content' field becomes the searchable
# text, while 'title' and 'url' are kept as metadata.
lc_documents = [
    Document(
        page_content=article['content'],
        metadata={'title': article['title'], 'url': article['url']},
    )
    for article in data
]

VECTORDB.add_documents(lc_documents)

['e9e3c89f-0a2c-4373-ad23-deb05a499450', '5354a170-1bd5-405e-a702-0d8a657c1f17', '775b2158-4e20-4a2b-a376-335bf8711b0f', '75655d7a-217a-4766-934d-9d7aef1fe072', '5f51da24-bf38-42bc-b30d-0d6b39ecc28a', 'de501ba9-0387-4c6c-b77c-5667ba36adbb', '95599344-dde5-4a2e-92e3-9460d4094d3e', '4c9e64d0-1246-4edb-8287-b3e04acb2a1b', '5033125a-d94b-4458-905f-8b28d6af8542', 'f75e9fcf-5c09-4275-8436-d76c3156e8b1', '1f0e54b6-bacb-4fdd-9a82-d24d4639c22e', 'dcc14ccc-f31f-44e8-9f83-b411b25a0ba0', 'd9d546a6-11e5-4879-83c5-9d9a176ab204', '9e91b1db-dd29-4578-9501-28ca132a9bea', '226ac700-694f-448d-9877-99359ff2d2bf', '84d4ba0e-f22e-45ea-92b9-f1689412dacb', 'dc2a084e-6d41-4044-aae4-721724dc8aaa', '078eab02-3fa3-4a23-92aa-adfb64be6f9d', 'aa8af347-2c8e-4fdc-b4fd-e5e7760b9ee4', '7eed7d45-527b-4b8f-b640-f01481e8b06b', 'b783125f-8acb-49a2-9231-e14b28e9d5ff']

In [41]:
def find_similar_documents(q: str, k: Optional[int]=1) -> list[str]:
    """Return the text of the stored documents that best match a query.

    Args:
        q: Query text used to search the module-level ``VECTORDB`` store.
        k: Value forwarded to the retriever as its ``k`` search option
            (defaults to 1).

    Returns:
        A list holding the ``page_content`` of each document the retriever
        returned, in the retriever's order. The list is empty when the
        retriever returns nothing.
    """
    retriever = VECTORDB.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": k, "score_threshold": 0.1},
    )
    # Build a real list rather than a lazy ``map`` object: a map iterator can
    # be consumed only once (a second pass silently yields nothing), supports
    # neither len() nor indexing, and contradicts the declared return type.
    return [doc.page_content for doc in retriever.invoke(q)]

In [54]:
def combine_query_with_documents(q: str, docs: list[str]) -> str:
    """Build the prompt sent to the LLM from a question and retrieved texts.

    The documents are joined with newlines and placed between two dashed
    separator lines, followed by an Indonesian instruction telling the model
    to answer only from that context (and to decline when the answer is not
    in it), and finally the question itself.

    Args:
        q: The user's question.
        docs: Texts of the retrieved documents used as context.

    Returns:
        The full prompt string.
    """
    context_str = str.join("\n", docs)
    return f"""
Konteks tambahan yang digunakan adalah berikut.
---------------------
{context_str}
---------------------
Dengan mempertimbangkan hanya informasi konteks dan bukan pengetahuan sebelumnya, \
jawab pertanyaan dibawah dengan benar dan tolak jika jawabannya tidak ditemukan \
pada konteks di atas.
Pertanyaan: {q}
    """

In [55]:
# Question (Indonesian), roughly: "Why was STY replaced?"
q = 'Kenapa STY diganti?'
# Retrieve the closest stored article text (k defaults to 1) ...
similar_docs = find_similar_documents(q)
# ... and embed it, together with the question, into the prompt template.
q_with_context = combine_query_with_documents(q, similar_docs)

In [56]:
# Show the final prompt; print() is used so the embedded newlines render as line breaks.
print(q_with_context)


Konteks tambahan yang digunakan adalah berikut.
---------------------
KOMPAS.com - Shin Tae-yong telah diberhentikan dari tugas melatih timnas Indonesia. Kendati demikian, STY tetap ingin memajukan sepak bola Indonesia melalui akademinya

Lembar perjalanan Shin Tae-yong sebagai pelatih timnas Indonesia resmi berakhir.

Ketua Umum PSSI, Erick Thohir, menyampaikan langsung keputusan pemberhentian STY dalam sebuah sesi konferensi pers di Menara Danareksa, Jakarta, Senin (6/1/2025)

Shin Tae-yong pun mesti mengakhiri pengabdian bersama timnas Indonesia lebih cepat, meski sejatinya ia masih terikat kontrak sampai 2027.

Baca juga: Shin Tae-yong, Timnas Indonesia, dan Hierarki di Budaya Korea Selatan

Kepergian STY dari kursi pelatih timnas Indonesia sempat menimbulkan pertanyaan tentang nasib akademi sang pelatih asal Korea Selatan di Tanah Air.

Seperti diketahui, pada November 2024 silam, Shin TaeYong Football Academy, mulai beroperasi di Indonesia.

View this post on Instagram A post sh

In [57]:
# Chat model that answers the question from the retrieved context.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
# Send the context-augmented prompt; the reply text is read from response.content below.
response = llm.invoke(q_with_context)

In [58]:
# Text of the model's answer.
response.content

'Teks tersebut tidak menjelaskan alasan spesifik mengapa Shin Tae-yong (STY) diganti sebagai pelatih timnas Indonesia.  Hanya disebutkan bahwa keputusannya disampaikan oleh Ketua Umum PSSI, Erick Thohir, dan bahwa STY masih terikat kontrak sampai 2027, namun  pengabdiannya berakhir lebih cepat.\n'

In [63]:
# Same retrieve -> build prompt -> ask pipeline as before, with a different question
# (roughly: "When does the free meal programme start?").
q = 'Kapan Makan Gratis Dimulai?'
similar_docs = find_similar_documents(q)
q_with_context = combine_query_with_documents(q, similar_docs)
response = llm.invoke(q_with_context)
print(response.content)

Makan siang gratis dimulai pada Senin, 6 Januari 2025.

