### Install required libraries

In [None]:
!pip install -qqq ipykernel==6.29.5 --progress-bar off
!pip install -qqq pandas==2.2.3 --progress-bar off
!pip install -qqq pdfminer --progress-bar off            # convert pdf to text
!pip install -qqq pdfminer.six --progress-bar off        # convert pdf to text
!pip install -qqq sqlite-vec==0.1.1 --progress-bar off   # vector db
!pip install -qqq fastembed==0.3.4 --progress-bar off    # embeddings

Import libraries

In [None]:
import json
import os
import pprint
import requests
import sqlite3
import sqlite_vec
from fastembed import TextEmbedding
from pdfminer.high_level import extract_text


Constants

In [None]:
# Base name of the SQLite database file; ".db" is appended when connecting.
DB_NAME = "metu_academic"
# Name of the table that stores the paragraph texts keyed by an integer id.
TABLE_NAME = "metu_academic_rules"
# Folder holding the original documents that get converted to text.
ORIG_DOCS_FOLDER = "./docs"
# Folder where the extracted .txt files are written and later read back.
TEXT_DOCS_FOLDER = "./docs_text"

Convert PDF documents to .txt documents

In [None]:
def pdf_to_text(pdf_path, txt_path):
    """Extract the text of a PDF and save it as a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF document to read.
        txt_path: Path of the text file to create or overwrite.
    """
    # Extraction runs before the output file is opened, so a failed
    # extraction never creates or truncates the target file.
    extracted = extract_text(pdf_path)
    with open(txt_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(extracted)


# open(..., "w") does not create missing directories, so make sure the output
# folder exists before the first file is written.
os.makedirs(TEXT_DOCS_FOLDER, exist_ok=True)

# sorted() makes the processing order reproducible; os.listdir order is arbitrary.
for fname in sorted(os.listdir(ORIG_DOCS_FOLDER)):
    # Skip anything that is not a PDF (hidden files, checkpoints, sub-folders).
    if not fname.lower().endswith(".pdf"):
        continue
    # The output keeps the full original name and appends ".txt"
    # (e.g. "rules.pdf" -> "rules.pdf.txt").
    pdf_to_text(
        os.path.join(ORIG_DOCS_FOLDER, fname),
        os.path.join(TEXT_DOCS_FOLDER, f"{fname}.txt"),
    )

Create SQLite database file

In [None]:
# Open (or create) the SQLite database file "<DB_NAME>.db".
db = sqlite3.connect(DB_NAME + ".db")
# Extension loading is switched on only for the sqlite_vec.load() call and
# switched off again right after it.
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

Add metadata table to the database

In [None]:
# Drop the table left over from a previous run, if any, so this cell can be
# re-run from scratch.
db.execute(f"DROP TABLE IF EXISTS {TABLE_NAME};")

# Create the table that holds one row per paragraph: an integer id and the
# paragraph text.
db.execute(f"""
        CREATE TABLE {TABLE_NAME} (
            id INTEGER PRIMARY KEY,
            text TEXT NOT NULL
        );
    """)


# all_paragraphs maps the global 1-based paragraph id to (source file, text).
# filenames and paragraph_texts are parallel lists in the same order, so the
# paragraph at list index n has id n + 1.
all_paragraphs = {}
filenames = []
paragraph_texts = []

# sorted() keeps paragraph ids stable between runs; os.listdir order is arbitrary.
for text_filename in sorted(os.listdir(f"./{TEXT_DOCS_FOLDER}")):
    # Only .txt files are expected here; the [:-4] slice below relies on it.
    if not text_filename.endswith(".txt"):
        continue
    print(f"Processing: {text_filename}")

    # The files were written as UTF-8, so read them back as UTF-8 instead of
    # the platform default, and close the handle once the content is read.
    with open(f"./{TEXT_DOCS_FOLDER}/{text_filename}", "r", encoding="utf-8") as content_file:
        full_content = content_file.read()

    # A paragraph is a blank-line separated block that is non-empty, has more
    # than one space-separated token, contains a period and is longer than
    # 100 characters.
    paragraphs = [
        text_block
        for text_block in full_content.split("\n\n")
        if text_block.strip() != ""
        and len(text_block.split(" ")) > 1
        and "." in text_block
        and len(text_block) > 100
    ]

    orig_fname = text_filename[:-4]  # remove .txt at the end
    for paragraph in paragraphs:
        filenames.append(orig_fname)
        paragraph_texts.append(paragraph)
        # Key by the global paragraph id (its 1-based position in
        # paragraph_texts). A per-file index would make every file overwrite
        # the entries of the previous one.
        all_paragraphs[len(paragraph_texts)] = (orig_fname, paragraph)


# Insert every paragraph with a 1-based id matching its position in
# paragraph_texts.
db.executemany(
    f"INSERT INTO {TABLE_NAME}(id, text) VALUES (?, ?)",
    enumerate(paragraph_texts, start=1),
)
# sqlite3 opens a transaction implicitly on INSERT; without a commit the rows
# never reach the database file.
db.commit()

Add embeddings table to the database

In [None]:
# Embedding model created with the library defaults (no model name is passed).
model = TextEmbedding()

In [None]:
# Convert every paragraph to an embedding vector (same order as paragraph_texts).
document_embeddings = list(model.embed(paragraph_texts))

# The vector dimension is read from the first embedding, so an empty corpus
# must be reported clearly instead of failing with an opaque IndexError.
if not document_embeddings:
    raise ValueError(
        f"No paragraphs to embed; check that {TEXT_DOCS_FOLDER} contains extracted text files."
    )
embedding_dim = len(document_embeddings[0])

# Drop the embeddings table left over from a previous run, if any.
db.execute("DROP TABLE IF EXISTS document_embeddings;")

# vec0 virtual table: one fixed-size float vector per paragraph id.
db.execute(
    f"""
        CREATE VIRTUAL TABLE document_embeddings USING vec0(
        id INTEGER PRIMARY KEY,
        embedding FLOAT[{embedding_dim}]
        );
    """
)

# Ids are 1-based positions, matching the ids used for the paragraph texts.
db.executemany(
    "INSERT INTO document_embeddings(id, embedding) VALUES (?, ?)",
    (
        (row_id, sqlite_vec.serialize_float32(vector))
        for row_id, vector in enumerate(document_embeddings, start=1)
    ),
)
# Persist the inserted vectors; sqlite3 does not commit INSERTs automatically.
db.commit()

Sample questions

In [None]:
# Example user questions; one of them is picked by index as the user prompt.
sample_questions = [
    "i am a graduate student and i got an incomplete from one of my courses. when will the final letter grade announced?",
    "i haven't paid my dormitory payment, can i still apply for being an exchange student in northern cyprus?",
    "i would like to apply for a graduate program in metu, what are the requirements for candidates?",
    "when is the last time for finding a thesis supervisor for a master's program?",
    "what is the lowest grade for scholarship cancellation?",
]

Prepare user prompt

In [None]:
# Question to ask; index 4 selects the fifth entry of sample_questions.
user_prompt = sample_questions[4]

# Show the bare question (no retrieved context) under a short header.
print("Zero shot prompt:", "---", user_prompt, sep="\n")

Number of paragraphs to retrieve for the RAG prompt

In [None]:
# Number of nearest paragraphs requested from the vector search (bound to `k` in the query).
NUM_OF_EXAMPLES = 5

Search similar documents and generate enriched LLM prompt

In [None]:
# Embed the question with the same model used for the stored paragraphs.
# It is passed as a one-element list, the same call shape used for the corpus.
query_embedding = list(model.embed([user_prompt]))[0]

# Nearest-neighbour search in the vec0 table, joined with the text table to
# fetch the paragraph text for each of the k closest vectors.
results = db.execute(
    f"""
    SELECT
        document_embeddings.id,
        distance,
        {TABLE_NAME}.text
    FROM document_embeddings
    LEFT JOIN {TABLE_NAME} ON {TABLE_NAME}.id = document_embeddings.id
    WHERE document_embeddings.embedding MATCH ?
        AND k = ?
    ORDER BY distance
    """,
    [sqlite_vec.serialize_float32(query_embedding), NUM_OF_EXAMPLES],
).fetchall()

# Each result row is (id, distance, text); only the text goes into the prompt.
related_paragraphs = [text for _row_id, _distance, text in results]
prompt = user_prompt + "\n" + "\n"
prompt += "Example information that can be used while answering the question:\n"
prompt += "\n".join(["=> " + p for p in related_paragraphs])

# The instruction line is built outside the f-string: a backslash inside an
# f-string expression is a SyntaxError on Python versions before 3.12.
instruction = "Answer the following question according to the paragraphs provided to you.\n"
print(f"Generated prompt:\n---\n{instruction + prompt}\n")


LLM inference with enriched prompt

In [None]:
def call_inference_endpoint(
    url,
    prompt,
    model_id="Qwen/Qwen2.5-Coder-7B-Instruct",
    engine="huggingface",
    timeout=120,
):
    """Send a prompt to the text-generation endpoint and print the answer.

    The prompt is prefixed with a fixed instruction line before it is sent.

    Args:
        url: Full URL of the inference endpoint.
        prompt: Question, optionally followed by retrieved paragraphs.
        model_id: Model identifier forwarded to the endpoint.
        engine: Engine name forwarded to the endpoint.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The value of the "response" field of the JSON reply, or None when the
        request fails, the reply is not valid JSON, or the field is missing.
        Errors are printed rather than raised.
    """
    payload = {
        "prompt": "Answer the following question according to the paragraphs provided to you.\n" + prompt,
        "model_id": model_id,
        "engine": engine,
    }

    # Text Generation
    print(f"Sending request to {url}")
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None

    # JSON decoding is handled separately and reports response.text: the
    # parsed result does not exist when decoding fails, so it cannot be used
    # in the error message. Catching ValueError covers both json's and
    # requests' JSONDecodeError.
    try:
        result = response.json()
    except ValueError as e:
        print(f"Error decoding response: {e}")
        print(f"Raw response: {response.text}")
        return None

    try:
        answer = result["response"]
    except (KeyError, TypeError):
        print(f"Unexpected response format: {result}")
        return None

    print(f"Response received: {answer}")
    return answer

In [None]:
# Replace the placeholder with the host of your inference service before running.
url = "http://<your host URL>:8080/api/inference"

# Send the question together with the retrieved paragraphs: the endpoint is
# instructed to answer "according to the paragraphs provided", so it needs the
# enriched prompt rather than a bare literal question.
call_inference_endpoint(url, prompt=prompt)