### Install required libraries

In [None]:
!pip install -qqq ipykernel==6.29.5 --progress-bar off
!pip install -qqq pandas==2.2.3 --progress-bar off
!pip install -qqq pdfminer --progress-bar off            
!pip install -qqq pdfminer.six --progress-bar off       
!pip install -qqq sqlite-vec==0.1.1 --progress-bar off   
!pip install -qqq fastembed==0.3.4 --progress-bar off    
!pip install -qqq rank-bm25==0.2.2 --progress-bar off  

Import libraries

In [None]:
import json
import os
import pprint
import requests
import sqlite3
import sqlite_vec
from fastembed import TextEmbedding
from pdfminer.high_level import extract_text
from rank_bm25 import BM25Okapi
import re
import numpy as np

Define constants and global variables

In [None]:
# SQLite database file name (without the ".db" extension) and paragraph text table
DB_NAME, TABLE_NAME = "metu_academic", "metu_academic_rules"

# Folder holding the original documents and folder receiving their plain-text versions
ORIG_DOCS_FOLDER = "./docs"
TEXT_DOCS_FOLDER = "./docs_text"

# In-memory paragraph store, filled later: paragraph id -> (source file name, paragraph text)
all_paragraphs = dict()

Create SQLite database file

In [None]:
# Open (or create) the SQLite file named after DB_NAME
db = sqlite3.connect(f"{DB_NAME}.db")

# Extension loading is enabled only while sqlite_vec is loaded into the
# connection, then disabled again.
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

Create document metadata and embeddings tables

In [None]:
# Start from a clean slate: remove both tables if a previous run left them behind
for stale_table in (TABLE_NAME, "document_embeddings"):
    db.execute(f"DROP TABLE IF EXISTS {stale_table};")

# Metadata table: one row per paragraph, holding its text
create_metadata_sql = f"""
        CREATE TABLE {TABLE_NAME} (
            id INTEGER PRIMARY KEY,
            text TEXT NOT NULL
        );
    """
db.execute(create_metadata_sql)

# Embedding table (sqlite-vec virtual table); the default vector length for fastembed is 384
create_embeddings_sql = f"""
        CREATE VIRTUAL TABLE document_embeddings USING vec0(
        id INTEGER PRIMARY KEY,
        embedding FLOAT[384]
        );
    """
db.execute(create_embeddings_sql)

Convert unprocessed PDF documents to .txt files

In [None]:
def pdf_to_text(pdf_path, txt_path):
    """Extract the text of the document at ``pdf_path`` and save it to ``txt_path``.

    The text is obtained with ``extract_text`` and written as UTF-8, replacing
    any existing file at ``txt_path``. Nothing is returned.
    """
    # Extraction happens before the output file is opened, so a failing
    # extraction does not leave an empty text file behind.
    extracted = extract_text(pdf_path)
    with open(txt_path, mode="w", encoding="utf-8") as out_handle:
        out_handle.write(extracted)

# Make sure the output folder exists; listing or writing into it would fail otherwise.
os.makedirs(TEXT_DOCS_FOLDER, exist_ok=True)

# Read the list of already converted files once, instead of re-listing the
# folder for every candidate document.
converted_text_files = set(os.listdir(TEXT_DOCS_FOLDER))

# A source document still needs conversion when no "<name>.txt" exists for it.
# Sub-directories are skipped because only files can be converted.
not_processed_files = [
    fname
    for fname in os.listdir(ORIG_DOCS_FOLDER)
    if os.path.isfile(os.path.join(ORIG_DOCS_FOLDER, fname))
    and f"{fname}.txt" not in converted_text_files
]
print("These files will be processed:")
pprint.pprint(not_processed_files)

for fname in not_processed_files:
    pdf_to_text(
        os.path.join(ORIG_DOCS_FOLDER, fname),
        os.path.join(TEXT_DOCS_FOLDER, f"{fname}.txt"),
    )

Add new paragraphs and their embeddings to the database

In [None]:
# Source documents whose paragraphs are already held in all_paragraphs
already_ingested = {source_name for source_name, _ in all_paragraphs.values()}

# Ingest every converted document that is not in memory yet. The tables and
# all_paragraphs are rebuilt on each fresh run, so limiting ingestion to the
# files converted during this run would leave them empty whenever the text
# files already exist. Sorting keeps paragraph IDs stable between runs.
files_to_ingest = []
for fname in sorted(os.listdir(ORIG_DOCS_FOLDER)):
    if fname in already_ingested:
        continue
    if not os.path.isfile(os.path.join(TEXT_DOCS_FOLDER, f"{fname}.txt")):
        print(f"Skipping {fname}: no converted text file found")
        continue
    files_to_ingest.append(fname)

new_paragraph_ids = []
# New IDs continue after the highest ID in use (0 when nothing is stored yet)
last_paragraph_id = max(all_paragraphs, default=0)

print("Last paragraph ID:", last_paragraph_id)

for fname in files_to_ingest:
    text_filename = f"{fname}.txt"
    print(f"Processing: {text_filename}")

    # The text files are written as UTF-8, so read them back with the same
    # encoding; the with-block also closes the file handle.
    with open(os.path.join(TEXT_DOCS_FOLDER, text_filename), "r", encoding="utf-8") as content_file:
        full_content = content_file.read()

    # For this demo, paragraphs are the unit of retrieval.
    # Paragraphs are separated by double newline characters.
    # A block counts as a paragraph when it is longer than 100 characters,
    # contains a dot character and consists of more than one space-separated word.
    new_paragraphs_text = [
        text_block
        for text_block in full_content.split("\n\n")
        if text_block.strip() != ""
        and len(text_block.split(" ")) > 1
        and "." in text_block
        and len(text_block) > 100
    ]

    for i, paragraph in enumerate(new_paragraphs_text):
        new_paragraph_id = last_paragraph_id + i + 1
        all_paragraphs[new_paragraph_id] = (fname, paragraph)
        new_paragraph_ids.append(new_paragraph_id)

    last_paragraph_id += len(new_paragraphs_text)
    print("Last paragraph ID:", last_paragraph_id)

# Embed the new paragraphs before touching the database, so a failure here
# cannot leave text rows without matching embedding rows.
model = TextEmbedding()
document_embeddings = list(model.embed([all_paragraphs[paragraph_id][1] for paragraph_id in new_paragraph_ids]))

# Using the connection as a context manager commits both inserts together on
# success and rolls them back if an exception is raised.
with db:
    db.executemany(
        f"""INSERT INTO {TABLE_NAME}(id, text) VALUES (?, ?)""",
        [(paragraph_id, all_paragraphs[paragraph_id][1]) for paragraph_id in new_paragraph_ids],
    )
    db.executemany(
        """INSERT INTO document_embeddings(id, embedding) VALUES (?, ?)""",
        [
            (paragraph_id, sqlite_vec.serialize_float32(embedding))
            for paragraph_id, embedding in zip(new_paragraph_ids, document_embeddings)
        ],
    )

In [None]:
# Display the shape of the first paragraph embedding
# (the document_embeddings table is declared with FLOAT[384] vectors).
document_embeddings[0].shape

In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it into word tokens.

    A token is a maximal run of word characters (letters, digits, underscore)
    and apostrophes, so a word such as "master's" stays in one piece.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens in order of appearance; empty when
        ``text`` contains no word characters.
    """
    # In a raw string the class must be written [\w']: with a doubled backslash
    # it would match only backslashes, the letter "w" and apostrophes.
    return re.findall(r"[\w']+", text.lower())

# Paragraph ids in insertion order; position i in this list corresponds to
# document i of the BM25 corpus built below.
doc_ids = list(all_paragraphs)

# BM25 corpus: the tokenized text of every stored paragraph, in doc_ids order
corpus = [tokenize(all_paragraphs[paragraph_id][1]) for paragraph_id in doc_ids]
bm25 = BM25Okapi(corpus)

In [None]:
# --- BM25 retrieval of related paragraphs ---
def get_bm25_paragraphs(query, top_k=5):
    """Return the texts of the ``top_k`` paragraphs with the highest BM25 score.

    Args:
        query: Free-text query; it is tokenized with ``tokenize``.
        top_k: Maximum number of paragraphs to return.

    Returns:
        A list of paragraph texts ordered from highest to lowest score.
    """
    query_tokens = tokenize(query)
    doc_scores = bm25.get_scores(query_tokens)

    # argsort is ascending, so reverse it to rank the best-scoring documents first
    ranked_positions = np.argsort(doc_scores)[::-1]

    best_paragraphs = []
    for position in ranked_positions[:top_k]:
        # positions index the BM25 corpus; doc_ids maps them back to paragraph ids
        paragraph_id = doc_ids[position]
        best_paragraphs.append(all_paragraphs[paragraph_id][1])
    return best_paragraphs

Search similar documents and generate enriched LLM prompt

In [None]:
def get_enriched_llm_prompt(basic_prompt, number_of_examples, rag_option="both"):
    """Append retrieved paragraphs to ``basic_prompt`` as example information.

    Args:
        basic_prompt: The user question.
        number_of_examples: Number of paragraphs requested from each retriever.
        rag_option: Which retriever(s) to use: "vector" (nearest neighbours in
            the document_embeddings table), "bm25" (keyword ranking) or "both"
            (vector results followed by BM25 results, without duplicates).

    Returns:
        The prompt text: the question, a blank line, a heading line and one
        "=> "-prefixed line per selected paragraph.

    Raises:
        ValueError: If ``rag_option`` is not one of the supported values.
    """
    valid_options = ("both", "bm25", "vector")
    if rag_option not in valid_options:
        # An unknown option used to produce a prompt without any examples.
        raise ValueError(
            f"Unknown rag_option {rag_option!r}; expected one of {valid_options}"
        )

    # Run only the retriever(s) that were asked for.
    related_paragraphs = []
    if rag_option in ("both", "vector"):
        query_embedding = list(model.embed(basic_prompt))[0]

        results = db.execute(
            f"""
        SELECT
            document_embeddings.id,
            distance,
            {TABLE_NAME}.text
        FROM document_embeddings
        LEFT JOIN {TABLE_NAME} ON {TABLE_NAME}.id = document_embeddings.id
        WHERE document_embeddings.embedding MATCH ?
            AND k = ?
        ORDER BY distance
        """,
            [sqlite_vec.serialize_float32(query_embedding), number_of_examples],
        ).fetchall()

        # third selected column is the paragraph text
        related_paragraphs = [item[2] for item in results]

    bm25_paragraphs = []
    if rag_option in ("both", "bm25"):
        bm25_paragraphs = get_bm25_paragraphs(basic_prompt, top_k=number_of_examples)

    if rag_option == "both":
        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # prompt is reproducible between runs and keeps the retrieval ranking
        # (a set would return the paragraphs in arbitrary order).
        selected_paragraphs = list(dict.fromkeys(related_paragraphs + bm25_paragraphs))
    elif rag_option == "bm25":
        selected_paragraphs = bm25_paragraphs
    else:
        selected_paragraphs = related_paragraphs

    rag_prompt = basic_prompt + "\n" + "\n"
    rag_prompt += "Example information that can be used while answering the question:\n"
    rag_prompt += "\n".join(["=> " + p for p in selected_paragraphs])
    return rag_prompt

LLM inference with enriched prompt

In [None]:
def call_inference_endpoint(url, model_id, prompt, timeout=None):
    """Send ``prompt`` to the inference endpoint and print the model's answer.

    The prompt is prefixed with an instruction to answer according to the
    provided paragraphs. Errors are reported with ``print`` instead of being
    raised, and nothing is returned.

    Args:
        url: Full URL of the inference endpoint.
        model_id: Model identifier sent as "model_id" in the request body.
        prompt: Question (optionally enriched with paragraphs) to send.
        timeout: Optional timeout in seconds passed to ``requests.post``;
            ``None`` (the default) waits without a limit, as before.
    """
    # Step 1: HTTP request. Transport problems and non-2xx statuses end here.
    try:
        print(f"Sending request to {url}")
        response = requests.post(
            url,
            json={
                "prompt": "Answer the following question according to the paragraphs provided to you.\n" + prompt,
                "model_id": model_id,
                "engine": "huggingface",
            },
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return

    # Step 2: decode the body. JSON decode errors are ValueError subclasses.
    # On failure no parsed result exists, so the raw body text is shown.
    try:
        result = response.json()
    except ValueError as e:
        print(f"Error decoding response: {e}")
        print(f"Raw response: {response.text}")
        return

    # Step 3: show the answer, or the whole payload if it has no "response" entry.
    try:
        answer = result["response"]
    except (KeyError, TypeError):
        print(f"Unexpected response format: {result!r}")
        return

    print("Response received:")
    pprint.pprint(answer)

Number of RAG examples

In [None]:
# Passed as number_of_examples to get_enriched_llm_prompt in the cells below
NUM_OF_EXAMPLES = 5

Model selection

In [None]:
# Sent as "model_id" in the request body by call_inference_endpoint
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"

Sample questions

In [None]:
# Example user questions; one of them is selected by index in the next cell
sample_questions = [
    "i am a graduate student and i got an incomplete from one of my courses. when will the final letter grade announced?",
    "i haven't paid my dormitory payment, can i still apply for being an exchange student in northern cyprus?",
    "i would like to apply for a graduate program in metu, what are the requirements for candidates?",
    "when is the last time for finding a thesis supervisor for a master's program?",
    "what is the lowest grade for scholarship cancellation?",
]

Prepare user prompt

In [None]:
# Inference endpoint; the host part is a placeholder that has to be filled in
url = "http://<your host URL>:8080/api/inference"

# Question used for the runs below
user_prompt = sample_questions[4]

print("Zero shot prompt:\n---")
pprint.pprint(user_prompt)

# Enrich the question with paragraphs from both retrievers
rag_prompt = get_enriched_llm_prompt(user_prompt, NUM_OF_EXAMPLES, rag_option="both")

print("Enriched prompt:\n--")
pprint.pprint(rag_prompt)

Run inference with enriched prompt

In [None]:
# Send the prompt built in the previous cell (rag_option="both") to the endpoint
call_inference_endpoint(url, model_id=MODEL_ID, prompt=rag_prompt)

In [None]:
# Same question, enriched with vector-search paragraphs only
rag_prompt = get_enriched_llm_prompt(user_prompt, NUM_OF_EXAMPLES, rag_option="vector")

print("Enriched prompt:\n--")
pprint.pprint(rag_prompt)

call_inference_endpoint(url, MODEL_ID, rag_prompt)

In [None]:
# Same question, enriched with BM25 paragraphs only
rag_prompt = get_enriched_llm_prompt(user_prompt, NUM_OF_EXAMPLES, rag_option="bm25")

print("Enriched prompt:\n--")
pprint.pprint(rag_prompt)

call_inference_endpoint(url, MODEL_ID, rag_prompt)