# Getting started with LLMs and RAG

Note: Before running this notebook, create the filtered dataset by running `filter-dataset.ipynb`.

In [None]:
import sys
import os
import pandas as pd
import json
from dotenv import load_dotenv
import tiktoken
import pickle

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.rate_limiters import InMemoryRateLimiter

from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Put the parent directory on the import path so that the project-local
# `src` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.llm import get_azure_embeddings_client, get_llm_client, get_gemini_llm_client

# Stop right away when load_dotenv() reports failure; the error message below
# tells the user to put valid keys into the .env file.
dotenv_loaded = load_dotenv()
if not dotenv_loaded:
    raise Exception('Error loading .env file. Make sure to place valid keys in the .env file.')

In [None]:
# Input and output locations used by this notebook, relative to the working directory.
ARTICLES_CLEAN_DIR = os.path.join("..", "data", "articles_clean")
FILTERED_METADATA_PATH = os.path.join("..", "data", "filtered_metadata.csv")
DB_PATH = os.path.join("..", "data", "db", "sample.db")

# Make sure the directory that will hold the serialized vector store exists.
# exist_ok=True already makes this a no-op when the directory is present, so
# no separate existence check on the DB file is needed.
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

In [None]:
# Load the filtered article metadata and preview its first five rows.
filtered_metadata = pd.read_csv(FILTERED_METADATA_PATH)
filtered_metadata.head()

Create a simple vector database

In [None]:
def get_documents_from_path(filenames: list[str]) -> list[Document]:
    """Load cleaned article JSON files and wrap each one in a ``Document``.

    Args:
        filenames: Names of JSON files located inside ``ARTICLES_CLEAN_DIR``.

    Returns:
        One ``Document`` per file name, in input order. The article's "text"
        field becomes the page content; "title", "author", "published_at" and
        "id" are copied into the metadata. Any field missing from the JSON is
        replaced by an empty string.

    Raises:
        OSError: If a file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    documents = []

    for file_name in filenames:
        file_path = os.path.join(ARTICLES_CLEAN_DIR, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            # Keep the parsed payload under its own name instead of rebinding
            # the file handle.
            article = json.load(file)

        documents.append(Document(
            page_content=article.get("text", ""),
            metadata={
                "title": article.get("title", ""),
                "author": article.get("author", ""),
                "published_at": article.get("published_at", ""),
                "id": article.get("id", ""),
            },
        ))

    return documents

In [None]:
# Read every article listed in the "filename" column of the metadata.
documents = get_documents_from_path(filtered_metadata["filename"])
article_count = len(documents)
print(f"Number of articles: {article_count}")

In [None]:
# Configure the splitter: chunk_size=1024, trying blank-line breaks first and
# single line breaks second.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    separators=["\n\n", "\n"],
)

# Split the articles into chunks; these chunks are what gets embedded later.
texts = text_splitter.split_documents(documents)

In [None]:
# chunk_size: number of documents' chunks processed in parallel;
# decrease it if you hit rate limits.
embeddings = get_azure_embeddings_client(
    show_progress_bar=True,
    chunk_size=512,
)

In [None]:
# Embed every chunk and build the FAISS vector store from the results.
db = FAISS.from_documents(documents=texts, embedding=embeddings)

In [None]:
# Count the tokens contained in all chunks that were embedded to build the index.
# NOTE(review): assumes the embedding model tokenizes with cl100k_base — confirm.
tokenizer = tiktoken.get_encoding("cl100k_base")
build_token_count = sum(
    len(tokenizer.encode(chunk.page_content)) for chunk in texts
)
print(f"Token count: {build_token_count}")

In [None]:
# Persist the vector store: serialize it to bytes and pickle those bytes to DB_PATH.
with open(DB_PATH, mode="wb") as db_file:
    pickle.dump(db.serialize_to_bytes(), db_file)

Create a simple RAG pipeline

In [None]:
# CHECKPOINT: restore the vector store saved at DB_PATH instead of rebuilding it.
# The `embeddings` client must already be defined when this cell runs.
# Only load a file you created yourself: unpickling untrusted data can execute
# arbitrary code.
with open(DB_PATH, mode="rb") as db_file:
    serialized_data = pickle.load(db_file)

# Rebuild the FAISS database from the serialized bytes.
db = FAISS.deserialize_from_bytes(
    serialized_data,
    embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Alternative (FYI): free-tier Gemini LLM. Uncomment both statements below to
# use it instead of the default client.
# rate_limiter = InMemoryRateLimiter(
#     requests_per_second=0.5,  # <-- Gemini Free Tier
#     check_every_n_seconds=0.1,
# )
#
# llm = get_gemini_llm_client(
#     max_tokens=1024,
#     temperature=0.2,
#     rate_limiter=rate_limiter,
# )

# Default go-to OpenRouter LLM - check the README for other available models.
# Both keyword arguments are configurable.
llm = get_llm_client(
    temperature=0.2,
    max_tokens=1024,
)

In [None]:
# Prompt text sent to the LLM. {context} and {question} are the two template
# variables declared on the PromptTemplate below.
system_prompt = """
You are an expert assistant. Use only the following retrieved context to answer the question accurately and concisely. 
If nothing is mentioned in the context, say "I don't know".
Context: {context}
Question: {question}
"""

prompt_template = PromptTemplate(
    template=system_prompt,
    input_variables=["context", "question"],
)

# Retrieval-augmented QA chain: uses the vector store's retriever together
# with the prompt above, and also returns the source documents.
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [None]:
def ask_question(query):
    """Run ``query`` through ``retrieval_chain`` and print the outcome.

    Prints the question, the chain's "result" entry, and the metadata of
    every entry in "source_documents".

    Args:
        query: The question passed to the chain under the "query" key.

    Returns:
        The response object returned by ``retrieval_chain.invoke``.
    """
    chain_output = retrieval_chain.invoke({"query": query})
    answer = chain_output["result"]

    print(f"Question: {query}\nAnswer: {answer}")
    print("\nSources: \n")
    for source_doc in chain_output["source_documents"]:
        print(source_doc.metadata)

    return chain_output

In [None]:
# Example query against the RAG chain.
response = ask_question(query="What are the current economic threats in Austria?")