# Multilingual RAG + LLM Integration with Gemini API

## 1. Setup and Installation

In [31]:
# Install required packages
!pip install google-generativeai langchain langchain-google-genai chromadb sentence-transformers python-dotenv langdetect

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 981.5/981.5 kB 4.9 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: langdetect
  Building wheel for langdetect (pyproject.toml) ... done
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993332 sha256=a18dc31cd16914e5797dfbb45e6d9585cd5967f3664efbc7e84ecd64f20a6718
  Stored in directory: /home/duyle/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [32]:
import os
import json
import pickle
from typing import List, Dict, Any
from pathlib import Path
import langdetect
# RAG components
import chromadb
from sentence_transformers import SentenceTransformer

# Langchain components
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

# Environment variables
from dotenv import load_dotenv
# Load secrets from a .env file. Prefer one discovered from the working directory
# upward (portable across machines); fall back to the original absolute location
# so the existing setup keeps working. `load_dotenv` returns False when it loaded nothing.
load_dotenv() or load_dotenv('/home/duyle/Documents/Case-Study/.env')

True

## 2. API Key Setup

In [19]:
# (Re)load the .env file before reading the API key. Safe to run repeatedly:
# variables that are already set are not overridden by default.
# Discovery from the working directory is tried first; the original absolute
# path is kept only as a fallback for the machine this notebook was written on.
load_dotenv() or load_dotenv('/home/duyle/Documents/Case-Study/.env')

True

In [20]:
# Set up your Gemini API key
# Option 1: Set in environment variable
# os.environ["GOOGLE_API_KEY"] = "your-api-key-here"

# Option 2: Load from .env file (recommended)
# Create a .env file with: GOOGLE_API_KEY=your-api-key-here
#
# The documented variable name uses underscores. The hyphenated spelling that
# this cell used to read is kept only as a fallback for .env files written
# against that earlier lookup.
api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GOOGLE-API-KEY")
if not api_key:
    # Surface the problem here instead of as an opaque client error later on.
    print("WARNING: GOOGLE_API_KEY is not set - add it to your .env file or environment.")


## 3. Load Existing RAG Data

In [21]:
# Restore the pre-chunked documents from the pickle written by the preprocessing step.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
chunked_documents = pickle.loads(Path('data/processed/chunked_documents.pkl').read_bytes())


## 4. Initialize Vector Store

In [None]:
# Load the multilingual E5 checkpoint directly through sentence-transformers.
# NOTE(review): nothing else in the visible notebook references `embedding_model`;
# the vector store is built with the separate `embeddings` wrapper, which is given
# the same model name. Confirm it is unused before removing this load.
embedding_model = SentenceTransformer("intfloat/multilingual-e5-large")


In [None]:
# LangChain embedding wrapper for the multilingual E5 checkpoint; this is the
# object handed to Chroma when the vector store is opened or built.
# `model_kwargs` is passed explicitly as an empty dict.
embeddings = SentenceTransformerEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs={})

In [None]:
# Directory holding the persisted Chroma collection.
persist_directory = "./chroma_db_langchain_e5"

if not os.path.exists(persist_directory) or not os.listdir(persist_directory):
    # Nothing usable on disk yet: index every chunk from scratch.
    texts = [chunk.page_content for chunk in chunked_documents]
    metadatas = [chunk.metadata for chunk in chunked_documents]
    # Each indexed text carries the "passage: " task prefix.
    texts_with_prefix = ["passage: {}".format(body) for body in texts]

    vectorstore = Chroma.from_texts(
        texts=texts_with_prefix,
        embedding=embeddings,
        metadatas=metadatas,
        persist_directory=persist_directory,
    )
else:
    # A non-empty directory is treated as an existing collection and reopened as-is.
    vectorstore = Chroma(
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )



✅ Created vector store with 257 documents


## 5. Initialize Gemini LLM

In [25]:
# Initialize the Gemini chat model used for answer generation.
# `api_key` comes from the API-key cell; temperature is kept low (0.1).
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.1,
    convert_system_message_to_human=True,
    google_api_key = api_key)

# Status glyph restored from its mis-encoded form.
print("✅ Gemini LLM initialized")

✅ Gemini LLM initialized


## 6. Create Custom RAG Chain

In [None]:
def detect_language(text):
    """Classify a query as Vietnamese ('vi') or English ('en').

    Mixed Vietnamese-English queries count as Vietnamese: a single Vietnamese
    diacritic is enough. Only when no such character is present is the
    statistical detector from ``langdetect`` consulted.

    Args:
        text: The user query. ``None``, empty and whitespace-only values are
            accepted.

    Returns:
        'vi' or 'en'. Falls back to 'en' when the text is blank or when
        ``langdetect`` cannot extract any features from it (for example
        digits or punctuation only).
    """
    import unicodedata

    if not text or not text.strip():
        return 'en'

    # Lower-case Vietnamese letters that carry diacritics, plus "đ".
    vietnamese_chars = set('àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ')

    # Compose first so text typed with combining marks (NFD) still matches the
    # precomposed letters in the set above.
    normalized = unicodedata.normalize('NFC', text).lower()
    if any(char in vietnamese_chars for char in normalized):
        return 'vi'

    # Imported lazily: the statistical detector is only needed for text
    # without diacritics.
    from langdetect import DetectorFactory, LangDetectException, detect

    # langdetect is non-deterministic unless seeded, which matters for short queries.
    DetectorFactory.seed = 0
    try:
        detected = detect(text)
    except LangDetectException:
        # Raised when the text contains nothing the detector can use.
        return 'en'

    return 'vi' if detected == 'vi' else 'en'


In [None]:

def get_language_specific_prompt(language):
    """Return the RAG prompt template for the given language code.

    Args:
        language: 'vi' selects the Vietnamese prompt; any other value selects
            the English prompt.

    Returns:
        A ``PromptTemplate`` with the input variables ``context`` and
        ``question``.
    """
    if language == 'vi':
        # Vietnamese text restored from its mis-encoded form.
        template = """Bạn là trợ lý AI chuyên về APEC 2025 Korea.

        Hướng dẫn:
        - Sử dụng thông tin được cung cấp để trả lời câu hỏi một cách chính xác
        - Khi trình bày dữ liệu bảng, định dạng rõ ràng và dễ đọc
        - Bao gồm các chi tiết cụ thể như ngày tháng, địa điểm và tên sự kiện
        - Nếu ngữ cảnh chứa nhiều mục liên quan, hãy liệt kê tất cả
        - Nếu không tìm thấy câu trả lời trong ngữ cảnh, hãy nói rõ ràng
        - Trả lời bằng tiếng Việt

        Ngữ cảnh:
        {context}

        Câu hỏi: {question}

        Trả lời:"""
    else:
        template = """You are an expert AI assistant specializing in APEC 2025 Korea information.

        Instructions:
        - Use the provided context to answer questions accurately
        - When presenting table data, format it clearly and readably
        - Include specific details like dates, venues, and event names
        - If the context contains multiple relevant items, list them all
        - If you cannot find the answer in the context, say so clearly
        - Maintain accuracy of all factual information

        Context:
        {context}

        Question: {question}

        Answer:"""

    # Both languages use the same two placeholders, so the template object is
    # built in one place.
    return PromptTemplate(
        template=template,
        input_variables=["context", "question"]
    )



In [37]:
# Create the default RAG chain.
# `PROMPT` was referenced here without being defined anywhere in the notebook,
# which raises NameError on a fresh kernel. The English template is used as the
# default because it is what get_language_specific_prompt() returns for every
# non-Vietnamese query.
PROMPT = get_language_specific_prompt('en')

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 5,
        }
    ),
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

# Status glyph restored from its mis-encoded form.
print("✅ RAG chain created successfully")

✅ RAG chain created successfully


## 7. Enhanced RAG Functions

In [None]:
def enhanced_rag_query(question, show_sources= True, k=5):
    """Answer a question with retrieval-augmented generation in the user's language.

    Steps: detect the query language, retrieve the ``k`` most similar chunks,
    fill the language-specific prompt with them, and ask the LLM.

    Args:
        question: The user's question (Vietnamese, English, or a mix).
        show_sources: When True, print the answer and a summary of the sources.
        k: Number of chunks to retrieve. Defaults to 5.

    Returns:
        dict with the keys ``answer`` (str), ``sources`` (list of dicts with
        ``title``, ``url``, ``contains_table`` and ``chunk_length``),
        ``num_sources`` (int) and ``detected_language`` ('vi' or 'en').

    Raises:
        ValueError: If ``question`` is empty or whitespace-only.
    """
    if not question or not question.strip():
        raise ValueError("question must be a non-empty string")

    # 1. Detect the language and pick the matching prompt.
    detected_language = detect_language(question)
    language_prompt = get_language_specific_prompt(detected_language)

    # 2. Retrieve. Only the retrieval query carries the E5 "query: " task
    #    prefix; the prompt further down receives the question as the user wrote it.
    source_documents = vectorstore.similarity_search(f"query: {question}", k=k)

    # 3. Build the context. Chunks were indexed as "passage: <text>", so that
    #    prefix is removed before the text is shown to the LLM.
    passage_prefix = "passage: "
    context = "\n\n".join(
        doc.page_content[len(passage_prefix):]
        if doc.page_content.startswith(passage_prefix)
        else doc.page_content
        for doc in source_documents
    )

    # 4. Generate the answer.
    prompt_text = language_prompt.format(context=context, question=question)
    answer = llm.invoke(prompt_text).content

    # 5. Summarise where the answer came from.
    sources = [
        {
            "title": doc.metadata.get("title", "Unknown"),
            "url": doc.metadata.get("url", ""),
            "contains_table": doc.metadata.get("contains_table", False),
            "chunk_length": doc.metadata.get("chunk_length", 0)
        }
        for doc in source_documents
    ]

    response = {
        "answer": answer,
        "sources": sources,
        "num_sources": len(sources),
        "detected_language": detected_language
    }

    if show_sources:
        lang_flag = "vi" if detected_language == 'vi' else "en"
        print(f"{lang_flag} Answer based on {len(sources)} sources:")
        print(f"Tables included: {sum(1 for s in sources if s['contains_table'])}")
        print("\n" + "="*60)
        print(response["answer"])
        print("\n" + "="*60)
        print("Sources:")
        for i, source in enumerate(sources, 1):
            table_indicator = "Table used" if source["contains_table"] else "No table used"
            print(f"{i}. {table_indicator} {source['title']}")

    return response

## 8. Test the RAG System

In [None]:
# Test 1: General question, asked in Vietnamese ("What is APEC").
# Query text restored from its mis-encoded form.
print("Test 1: General APEC question")
response1 = enhanced_rag_query("APEC là gì")

Test 1: General APEC question




vi Answer based on 5 sources:
Tables included: 0

APEC (Asia-Pacific Economic Cooperation) là một diễn đàn kinh tế khu vực được thành lập vào năm 1989 để tận dụng sự phụ thuộc lẫn nhau ngày càng tăng của khu vực Châu Á - Thái Bình Dương. APEC có 21 thành viên, với mục tiêu tạo ra sự thịnh vượng lớn hơn cho người dân trong khu vực bằng cách thúc đẩy tăng trưởng cân bằng, toàn diện, bền vững, đổi mới và an toàn, đồng thời đẩy nhanh hội nhập kinh tế khu vực. APEC đảm bảo hàng hóa, dịch vụ, đầu tư và con người di chuyển trơn tru qua biên giới. Các thành viên tạo điều kiện thuận lợi cho thương mại thông qua việc hợp lý hóa các thủ tục hải quan tại biên giới; thúc đẩy môi trường kinh doanh thuận lợi hơn; và điều phối các quy định và tiêu chuẩn trong khu vực. Đây là diễn đàn liên

In [None]:
# Schedule question mixing Vietnamese and English ("APEC 2025 meetings schedule").
# Query text restored from its mis-encoded form.
# NOTE: this rebinds `response1`, so the result of Test 1 is no longer reachable afterwards.
response1 = enhanced_rag_query("Lịch meetings APEC 2025")



vi Answer based on 5 sources:
Tables included: 3

Dưới đây là lịch các cuộc họp APEC 2025:

| No. | Tên sự kiện                                                                 | Ngày                | Địa điểm           |
|-----|------------------------------------------------------------------------------|---------------------|--------------------|
| 1   | Informal Senior Officials’ Meeting (ISOM)                                  | 9 - 11 tháng 12, 2024 | Seoul              |
| 2   | 1st APEC Business Advisory Council Meeting (ABAC)                           | 23 – 25 tháng 2, 2025 | Brisbane, Australia |
| 3   | First Senior Officials’ Meeting and Related Meetings (SOM1)                  | 24 tháng 2 - 9 tháng 3, 2025 | Gyeongju           |
| 4   | Finance and Central Bank Deputies’ Meeting (FCBDM)                          | 6 - 7 tháng 3, 2025   | Gyeongju           |
| 5   | 2nd APEC Business Advisory Council Meeting (ABAC)                      

In [None]:
# Test 2: question that should be answered from table data
# ("Tell me about the APEC meetings scheduled for May 2025").
# Emoji and query text restored from their mis-encoded form.
print("\n🔍 Test 2: Table-specific question")
response2 = enhanced_rag_query("Cho tôi biết về các cuộc họp APEC được xếp vào tháng 5 2025")


🔍 Test 2: Table-specific question




vi Answer based on 5 sources:
Tables included: 2

Dưới đây là danh sách các cuộc họp APEC diễn ra vào tháng 5 năm 2025:

| No. | Tên sự kiện                                                      | Ngày          | Địa điểm |
|-----|-------------------------------------------------------------------|---------------|---------|
| 7   | Second Senior Officials’ Meeting and Related Meetings (SOM2)       | 3 - 16/5/2025 | Jeju    |
| 8   | Human Resource Development Ministerial Meeting (HRDMM)            | 11 - 13/5/2025| Jeju    |
| 9   | APEC Education Ministerial Meeting(AEMM)                          | 13 - 15/5/2025| Jeju    |
| 10  | Ministers Responsible for Trade (MRT)                             | 15 - 16/5/2025| Jeju    |

Ngoài ra, có một số sự kiện bên lề diễn ra vào tháng 5 năm 2025:

| No. | Tên sự kiện                                            | Ngày       | Địa điểm |
|-----|------------------------------------------

In [None]:
# Test 3: venue-specific question in mixed, partly unaccented Vietnamese/English
# ("Are there any meetings in May at Jeju?").
# Query text restored from its mis-encoded form; the informal spelling is intentional test input.
print("\nTest 3: Venue-specific question")
response3 = enhanced_rag_query("Có cuộc họp nào vào tháng 5 at JeJu khong")


Test 3: Venue-specific question




vi Answer based on 5 sources:
Tables included: 1

Có, có nhiều cuộc họp diễn ra tại Jeju vào tháng 5 trong khuôn khổ APEC 2025 Korea. Dưới đây là danh sách các sự kiện được liệt kê:

*   **International Forum on Disability Employment:** 6 tháng 5, 2025
*   **APEC Future Education Forum (AFEF):** 6 tháng 5, 2025
*   **HRDMM : Policy Experience Booths:** 6 - 12 tháng 5, 2025
*   **APEC Sustainable Social Entrepreneurship Training (ASSET):** 10 tháng 5, 2025
*   **Global Education Reform Conference:** 13 tháng 5, 2025
*   **APEC University Leader’s Forum (AULF):** 13 tháng 5, 2025
*   **HRDMM : Field Trip:** 13 tháng 5, 2025
*   **Educational Innovation Achievement Sharing Exhibition Booth:** 13 - 14 tháng 5, 2025
*   **APEC Second Senior Officials’ Meeting (SOM2):** 3 - 16 tháng 5, 2025
*   **School Visit and Field Trip:** 15 tháng 5, 2025
*   **Bilateral talks with trade representatives:** 14-16 tháng 5, 2025

Sources:
1. Table used Sid