## Set the device to GPU

In [1]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load multiple documents

In [2]:
import os
from langchain.document_loaders import PyPDFLoader

def load_documents():
    documents = []
    files = ['docs/linkedin_profile.pdf', 'docs/biography.pdf']
    
    for file in files:
        if os.path.exists(file):
            loader = PyPDFLoader(file)
            documents.extend(loader.load())
        else:
            print(f'Warning: {file} not found, skipping...')
    return documents

## Chunking documents for better retrieval

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=200
    )
    return text_splitter.split_documents(documents)

#  Generate Embeddings and Create FAISS Index

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def create_faiss_index(docs, embedding_model='all-MiniLM-L6-v2'):
    try:
        print('create_faiss_index: downloading embedding model if not available...')
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        vector_db = FAISS.from_documents(docs, embeddings)
        vector_db.save_local('vector-store/tiny_llama_faiss_index')
        
        print('FAISS index created successfully.')
        return vector_db
    except Exception as e:
        print(f'unable to create FAISS index: {e}')
        return None

## Load TinyLlama 1.1B (Q4_K_M)

In [5]:
from llama_cpp import Llama

def load_llm(model_name='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf'):
    model_path = os.path.join('models', model_name)
    if not os.path.exists(model_path):
        return None
    
    try:
        return Llama(model_path=model_path, n_gpu_layers=30, n_threads=8, n_ctx=1024)
    except Exception as e:
        print(f'Error loading TinyLlama model: {e}')
        return None

## Implement Retrieval-Augmented Generation (RAG) with improved prompting

In [6]:
def rag_pipeline(query, vector_db, llm):
    if vector_db is None:
        return 'No vector database available.'
    
    retrieved_docs = vector_db.similarity_search(query, k=4)
    source_documents = [doc.metadata for doc in retrieved_docs]
    context = '\n'.join([doc.page_content for doc in retrieved_docs])
    
    prompt = f'''
    You are an AI assistant answering questions about Kaung SiThu. 
    Be concise and informative. If unsure, say you don't know.
    
    Context:
    {context}
    
    User Question: {query}
    '''
    
    response = llm(prompt)
    return {
        'answer': response['choices'][0]['text'].strip(),
        'sources': source_documents
    }

## Evaluate retrieval and generation models

In [7]:
def evaluate_models():
    print('evaluate_model: evaluating retriever model (FAISS)...')
    try:
        faiss_index = FAISS.load_local(
            'vector-store/tiny_llama_faiss_index', 
            HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'), 
            allow_dangerous_deserialization=True
            )
        print(f'evaluate_model: FAISS index successfully loaded and functional.\n {faiss_index}')
    except Exception as e:
        print(f'evaluate_model: failed to load FAISS index: {e}')
    
    print('evaluate_model: evaluating generator model (TinyLLaMA)...')
    try:
        llm = load_llm()    
        response = llm('evaluate_model: test query: What is machine learning?')
        print(f'evaluate_model: {response}.')
    except Exception as e:
        print(f'evaluate_model: error with TinyLLaMA model inference: {e}')

## Executing the pipeline

In [8]:
print('loading documents')
documents = load_documents()
docs = chunk_documents(documents)
docs

loading documents


[Document(metadata={'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="Contact\n+959779056197 (Mobile)\nneucleyon@gmail.com\nwww.linkedin.com/in/kaung-\nsithu-634ab2160 (LinkedIn)\nwww.facebook.com/archx64/\n(Personal)\nTop Skills\nData Science\nArtificial Intelligence (AI)\nNeural Networks\nCertifications\nMachine Learning by Stanford\nUniversity & DeepLearning.AI on\nCoursera\nSpring Framework\nSupervised Machine Learning:\nRegression and Classification \nAdvanced Learning Algorithms\nUnsupervised Learning,\nRecommenders, Reinforcement\nLearning\nKaung SiThu\nData-Driven Problem Solver\nYangon, Myanmar\nSummary\nI'm just an ordinary guy who is enthusiastic on technology and\nscience\nExperience\nEngineerforce\nPython Developer\nMay 2023\xa0-\xa0Nove

In [9]:
print(f'size of documents: {len(docs)}')

size of documents: 2


In [10]:
print('Creating FAISS index...')
vector_db = create_faiss_index(docs)
vector_db

Creating FAISS index...
create_faiss_index: downloading embedding model if not available...
FAISS index created successfully.


<langchain_community.vectorstores.faiss.FAISS at 0x24ef623c6e0>

In [11]:
llm = load_llm()
evaluate_models()

llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from models\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 64
llama_model_loader: - kv   7:              

evaluate_model: evaluating retriever model (FAISS)...


llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from models\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 64
llama_model_loader: - kv   7:              

evaluate_model: FAISS index successfully loaded and functional.
 <langchain_community.vectorstores.faiss.FAISS object at 0x0000024EC80E9DF0>
evaluate_model: evaluating generator model (TinyLLaMA)...


load_tensors:   CPU_Mapped model buffer size =   636.18 MiB
....................................................................................
llama_init_from_model: n_seq_max     = 1
llama_init_from_model: n_ctx         = 1024
llama_init_from_model: n_ctx_per_seq = 1024
llama_init_from_model: n_batch       = 512
llama_init_from_model: n_ubatch      = 512
llama_init_from_model: flash_attn    = 0
llama_init_from_model: freq_base     = 10000.0
llama_init_from_model: freq_scale    = 1
llama_init_from_model: n_ctx_per_seq (1024) < n_ctx_train (2048) -- the full capacity of the model will not be utilized
llama_kv_cache_init: kv_size = 1024, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 22, can_shift = 1
llama_kv_cache_init: layer 0: n_embd_k_gqa = 256, n_embd_v_gqa = 256
llama_kv_cache_init: layer 1: n_embd_k_gqa = 256, n_embd_v_gqa = 256
llama_kv_cache_init: layer 2: n_embd_k_gqa = 256, n_embd_v_gqa = 256
llama_kv_cache_init: layer 3: n_embd_k_gqa = 256, n_embd_v_gqa = 256
llama

evaluate_model: {'id': 'cmpl-2b2f18b0-29cf-4c86-b7d5-ec53e3a59f27', 'object': 'text_completion', 'created': 1741886691, 'model': 'models\\tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf', 'choices': [{'text': ' How does it work? Why is it important? Answer: Machine learning is a', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 13, 'completion_tokens': 16, 'total_tokens': 29}}.


In [12]:
questions = [   
    'How old is Kaung Sithu?',
    'What is Kaung Sithu\'s highest level of education?',
    'What major or field of study did Kaung pursue during your education?',
    'How many years of work experience does Kaung have in software development?',
    'What type of work or industry has Kaung been involved in?',
    'Can you describe Kaung\'s current role or job responsibilities?',
    'What are Kaung\'s core beliefs regarding the role of technology in shaping society?',
    'How does Kaung think cultural values should influence technological advancements?',
    'As a master’s student, what is the most challenging aspect of his studies so far?',
    'What specific research interests or academic goals does Kaung hope to achieve during your time as a master’s student?'
    ]
answers = []
for q in questions:
    result = rag_pipeline(q, vector_db, llm)
    answers.append({'question': q, 'answer': result['answer'], 'sources': result['sources']})
    print(f'Q: {q}\nResponse: {result['answer']}\nSources: {result['sources']}\n')

llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =    8177.68 ms /   842 tokens (    9.71 ms per token,   102.96 tokens per second)
llama_perf_context_print:        eval time =     440.25 ms /    14 runs   (   31.45 ms per token,    31.80 tokens per second)
llama_perf_context_print:       total time =    8626.46 ms /   856 tokens
Llama.generate: 832 prefix-match hit, remaining 15 prompt tokens to eval


Q: How old is Kaung Sithu?
Response: Reply: Kaung Sithu is 25 years old.
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     179.08 ms /    15 tokens (   11.94 ms per token,    83.76 tokens per second)
llama_perf_context_print:        eval time =     476.11 ms /    15 runs   (   31.74 ms per token,    31.51 tokens per second)
llama_perf_context_print:       total time =     664.39 ms /    30 tokens
Llama.generate: 833 prefix-match hit, remaining 16 prompt tokens to eval


Q: What is Kaung Sithu's highest level of education?
Response: AI Assistant:
     Yes, Kaung Sithu holds a Bach
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     195.73 ms /    16 tokens (   12.23 ms per token,    81.74 tokens per second)
llama_perf_context_print:        eval time =     466.67 ms /    15 runs   (   31.11 ms per token,    32.14 tokens per second)
llama_perf_context_print:       total time =     671.19 ms /    31 tokens
Llama.generate: 832 prefix-match hit, remaining 16 prompt tokens to eval


Q: What major or field of study did Kaung pursue during your education?
Response: Answer:

     My interest in technology and science led me to pursue
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     192.69 ms /    16 tokens (   12.04 ms per token,    83.03 tokens per second)
llama_perf_context_print:        eval time =     460.73 ms /    15 runs   (   30.72 ms per token,    32.56 tokens per second)
llama_perf_context_print:       total time =     662.42 ms /    31 tokens
Llama.generate: 832 prefix-match hit, remaining 15 prompt tokens to eval


Q: How many years of work experience does Kaung have in software development?
Response: Answer: Yes, Kaung has over five years of experience in software development.
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     182.51 ms /    15 tokens (   12.17 ms per token,    82.19 tokens per second)
llama_perf_context_print:        eval time =     456.31 ms /    15 runs   (   30.42 ms per token,    32.87 tokens per second)
llama_perf_context_print:       total time =     647.61 ms /    30 tokens
Llama.generate: 832 prefix-match hit, remaining 17 prompt tokens to eval


Q: What type of work or industry has Kaung been involved in?
Response: Answer: Kaung has been involved in software development, specifically in the Python programming
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     200.77 ms /    17 tokens (   11.81 ms per token,    84.67 tokens per second)
llama_perf_context_print:        eval time =     462.05 ms /    15 runs   (   30.80 ms per token,    32.46 tokens per second)
llama_perf_context_print:       total time =     672.42 ms /    32 tokens
Llama.generate: 832 prefix-match hit, remaining 21 prompt tokens to eval


Q: Can you describe Kaung's current role or job responsibilities?
Response: Answer:
     Kaung SiThu is currently a Senior Devlopr
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     252.50 ms /    21 tokens (   12.02 ms per token,    83.17 tokens per second)
llama_perf_context_print:        eval time =     459.19 ms /    15 runs   (   30.61 ms per token,    32.67 tokens per second)
llama_perf_context_print:       total time =     721.02 ms /    36 tokens
Llama.generate: 832 prefix-match hit, remaining 16 prompt tokens to eval


Q: What are Kaung's core beliefs regarding the role of technology in shaping society?
Response: User Answer: Kaung's core beliefs regarding the role of technology in
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     196.63 ms /    16 tokens (   12.29 ms per token,    81.37 tokens per second)
llama_perf_context_print:        eval time =     460.67 ms /    15 runs   (   30.71 ms per token,    32.56 tokens per second)
llama_perf_context_print:       total time =     666.21 ms /    31 tokens
Llama.generate: 832 prefix-match hit, remaining 24 prompt tokens to eval


Q: How does Kaung think cultural values should influence technological advancements?
Response: Solution:
     Kaung believes that cultural values play a crucial role
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     282.15 ms /    24 tokens (   11.76 ms per token,    85.06 tokens per second)
llama_perf_context_print:        eval time =     462.24 ms /    15 runs   (   30.82 ms per token,    32.45 tokens per second)
llama_perf_context_print:       total time =     753.17 ms /    39 tokens
Llama.generate: 832 prefix-match hit, remaining 25 prompt tokens to eval


Q: As a master’s student, what is the most challenging aspect of his studies so far?
Response: Reply: The most challenging aspect of my Master’s studies is the
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]



llama_perf_context_print:        load time =    8178.16 ms
llama_perf_context_print: prompt eval time =     332.44 ms /    25 tokens (   13.30 ms per token,    75.20 tokens per second)
llama_perf_context_print:        eval time =     465.07 ms /    15 runs   (   31.00 ms per token,    32.25 tokens per second)
llama_perf_context_print:       total time =     807.24 ms /    40 tokens


Q: What specific research interests or academic goals does Kaung hope to achieve during your time as a master’s student?
Response: Answer: 
     I hope to achieve academic excellence by acquiring a
Sources: [{'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-03-12T23:10:36+07:00', 'author': 'Kaung Sithu', 'moddate': '2025-03-12T23:10:36+07:00', 'source': 'docs/biography.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, {'producer': 'Apache FOP Version 2.2', 'creator': 'PyPDF', 'creationdate': '2025-03-10T15:50:07+00:00', 'title': 'Resume', 'author': 'LinkedIn', 'subject': 'Resume generated from profile', 'source': 'docs/linkedin_profile.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}]

