In [5]:
from google import genai
from pydantic import BaseModel
import os
import json

from dotenv import load_dotenv

load_dotenv()

# Structured-output schema: passed as "response_schema" in the generate_content
# call of this cell so the reply can be decoded as JSON with these two keys.
# (Documented with comments rather than a docstring so the model class itself
# is left exactly as it was.)
class City(BaseModel):
    capital: str  # name of the capital city
    area: float  # area value returned by the model; the prompt does not specify a unit

# Gemini client. The API key is read from the GEMINI_API_KEY environment
# variable (load_dotenv() was called earlier in this cell).
llm = genai.Client(
    api_key=os.getenv("GEMINI_API_KEY"),
)

# Request a JSON reply and hand the City model to the SDK as the response schema.
response = llm.models.generate_content(
    model="gemini-2.5-flash",
    contents="Apa ibukota saat ini di Indonesia dan luasnya?",
    config={
        "response_mime_type": "application/json",
        "response_schema": City,
    },
)

# Raw JSON text exactly as returned by the model.
print(response.text)

# Decode the payload once and reuse it, instead of re-parsing the same string
# for every field that is printed.
city_data = json.loads(response.text)
print(f"The capital of Indonesia now is {city_data['capital']} with area {city_data['area']}")

{"capital": "Jakarta", "area": 1904569}
The capital of Indonesia now is Jakarta with area 1904569


In [6]:
from markitdown import MarkItDown
from chonkie import RecursiveChunker, Visualizer

# Converter used to turn the source document into Markdown text.
md = MarkItDown()

# Input document, given as a path relative to the notebook's working directory.
source_file = "sample pdf.pdf"
result = md.convert(source_file)

# Text content of the conversion result; this is what gets chunked below.
markdown = result.text_content

# Recursive chunker built from chonkie's "markdown" recipe for English text.
chunker = RecursiveChunker.from_recipe("markdown", lang="en")
chunks = chunker.chunk(markdown)

# Save an HTML visualization of the chunks to chonkie.html for inspection.
viz = Visualizer()
viz.save("chonkie.html", chunks)

  from .autonotebook import tqdm as notebook_tqdm


HTML visualization saved to: file://d:\DOWNLOAD\training_rag\chonkie.html


In [10]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the PDF with PyPDFLoader; `pages` is the list returned by .load().
load_docs = PyPDFLoader("sample pdf.pdf")
pages = load_docs.load()

# Character-based splitter: chunks of at most 500 characters (measured with
# len) that overlap by 100, trying the separators in the order listed.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_size=500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(pages)

# Quick sanity check: number of pages, number of chunks, then the first
# chunk's text followed by its metadata (one item per line).
first_chunk = chunks[0]
print(len(pages), len(chunks), sep="\n")
print(first_chunk.page_content, first_chunk.metadata, sep="\n")

3
11
THE HIDDEN DANGERS OF ALCOHOL INTOXICATION 
Understanding the Short-Term and Long-Term Risks to Your Health and Safety 
1. Introduction 
While alcohol is socially accepted in many cultures, "getting drunk" (intoxication) 
places the body under immense stress. Alcohol is a central nervous system depressant 
that rapidly impairs brain function, physical coordination, and judgment. This document 
outlines the critical dangers associated with excessive alcohol consumption, ranging
{'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2025-11-17T00:17:12+07:00', 'author': 'Arvin Melvillo', 'moddate': '2025-11-17T00:17:12+07:00', 'title': 'Microsoft Word - Document1', 'source': 'sample pdf.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}


In [12]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

# Open the on-disk Chroma database once and reuse the same client for every
# step in this cell. It is created outside the try block so that a failure to
# open the database is reported immediately instead of being swallowed.
client = chromadb.PersistentClient('db/workshop_1')

# Best-effort reset so the cell can be re-run: drop any previous "gemini_demo"
# collection. `except Exception` (rather than a bare `except`) lets
# KeyboardInterrupt/SystemExit propagate, and the reason is printed so the
# skip is visible. If the delete failed for a real reason, create_collection
# below is expected to raise because the collection still exists.
try:
    client.delete_collection(name="gemini_demo")
except Exception as exc:
    print(f"Skipping delete of 'gemini_demo': {exc}")

# Embedding function backed by the Gemini embedding model; the API key is read
# from the GEMINI_API_KEY environment variable.
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=os.getenv("GEMINI_API_KEY"),
    model_name="gemini-embedding-001"
)

collection = client.create_collection(
    name="gemini_demo",
    embedding_function=google_ef
)

# Store every chunk from the splitter cell: a positional id ("chunk_0",
# "chunk_1", ...), the chunk text, and the loader metadata for that chunk.
collection.add(
    ids=[f"chunk_{i}" for i in range(len(chunks))],
    documents=[doc.page_content for doc in chunks],
    metadatas=[doc.metadata for doc in chunks]
)

In [21]:
# Re-open the collection with the same embedding function that was used when
# the chunks were added.
collection = client.get_collection("gemini_demo", embedding_function=google_ef)

query = "Di binus ada makanan apa aja?"

# Ask for up to five matching chunks for the single query text.
results = collection.query(
    query_texts=[query],
    n_results=5
)

# results['documents'] holds one list of documents per query text; only one
# query was sent, so take the first list. Print its last entry via [-1]
# instead of the hard-coded [4], so the cell no longer raises IndexError when
# fewer than five documents are returned.
retrieved_docs = results['documents'][0]
print(retrieved_docs[-1])

 Financial Ruin: The cost of alcohol, combined with potential job loss due to 
poor performance or legal fees (DUI), can lead to bankruptcy. 
 Legal Consequences: Arrests for public intoxication, disorderly conduct, or 
driving under the inﬂuence (DUI) leave permanent criminal records. 
 
5. Summary of Statistics 
 Fatalities: Alcohol contributes to more than 3 million deaths globally each year 
(WHO). 
 Youth Risk: Alcohol is a leading factor in death for people aged 15–49.


In [22]:
# Concatenate the retrieved chunks for the query into one context string.
context = "\n".join(results['documents'][0])

# Grounded prompt: the model is told to answer only from the context and to
# reply with a fixed refusal sentence when the context does not contain the answer.
prompt = f"""Use the following context to answer the question. If you cannot answer the question based on the context, say "I cannot answer this based on the provided context."

Context:
{context}

Question: {query}"""

# Gemini client; the API key is read from the GEMINI_API_KEY environment variable.
llm = genai.Client(
    api_key=os.getenv("GEMINI_API_KEY")
)

response = llm.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config={
        "system_instruction": "You are helpful assistant that answer the question based on provided context",
        # Temperature is a number, not a string. 0.0 is used here on the
        # assumption that lower values make output less random - confirm the
        # allowed range against the SDK docs.
        "temperature": 0.0
    }
)

print(response.text)

I cannot answer this based on the provided context.
