In [None]:
!pip install transformers torch langchain langchain-openai
!pip install PyMuPDF --quiet


In [2]:
import os
import fitz  # PyMuPDF
from google.colab import files

# Create documents folder if it doesn't exist
os.makedirs("documents", exist_ok=True)

# Upload any number of PDFs or .txt files
uploaded_files = files.upload()

for file_name in uploaded_files.keys():
    # Split on the real extension so that "REPORT.PDF" or "a.pdf.notes.txt" are
    # handled correctly (a plain str.replace(".pdf", ...) is case-sensitive and
    # would also rewrite ".pdf" occurring in the middle of the name).
    stem, ext = os.path.splitext(file_name)
    ext = ext.lower()
    dest_path = os.path.join("documents", stem + ".txt")

    if ext == ".pdf":
        # Convert PDF to text; the context manager closes the document afterwards
        with fitz.open(file_name) as doc:
            text = "".join(page.get_text() for page in doc)
        with open(dest_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"Converted {file_name} → {dest_path}")

    elif ext == ".txt":
        # Move .txt directly to documents folder
        os.rename(file_name, dest_path)
        print(f"Uploaded {file_name} → {dest_path}")

    else:
        print(f"Skipped {file_name}: unsupported file type")


Saving toddlers_behavior.txt to toddlers_behavior.txt
Uploaded toddlers_behavior.txt → documents/toddlers_behavior.txt


In [3]:
import os
import re
import uuid
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Ensure both working folders are present (no error if they already exist)
for folder_name in ("documents", "database"):
    os.makedirs(folder_name, exist_ok=True)


In [4]:
def _split_paragraph(paragraph, tokenizer, chunk_size, separator):
    """Greedily pack the words of one paragraph into chunks of at most `chunk_size` tokens.

    A single word that alone exceeds `chunk_size` tokens is still emitted as its
    own (oversized) chunk rather than being dropped.
    """
    chunks = []
    current = ""
    for word in paragraph.split(separator):
        candidate = current + separator + word if current else word
        if len(tokenizer.tokenize(candidate)) <= chunk_size:
            current = candidate
        else:
            # Adding this word would overflow: close the running chunk and start a new one.
            if current:
                chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks


def chunking(directory_path, tokenizer, chunk_size, para_seperator="\n\n", separator=" "):
    """Read every file in `directory_path` and split it into token-bounded chunks.

    Args:
        directory_path: Folder whose regular files are read as UTF-8 text.
        tokenizer: Object exposing `tokenize(str)`; the length of its result is
            used as the token count of a candidate chunk.
        chunk_size: Maximum number of tokens per chunk.
        para_seperator: Pattern passed to `re.split` to cut the text into paragraphs.
        separator: String used to split a paragraph into words and to re-join them.

    Returns:
        dict mapping a generated document id to that document's own chunks, where
        chunks is a dict of generated chunk id ->
        {"text": <chunk text>, "metadata": {"file_name": <file name without extension>}}.
    """
    documents = {}
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        # Skip sub-directories and other non-files up front; they have no doc id.
        if not os.path.isfile(file_path):
            continue
        print("Processing:", filename)
        sku = os.path.splitext(filename)[0]
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # A fresh dict per file: each document id must map only to its own chunks,
        # otherwise every chunk is stored (and later embedded/scored) once per file.
        doc_chunks = {}
        for paragraph in re.split(para_seperator, text):
            for chunk_text in _split_paragraph(paragraph, tokenizer, chunk_size, separator):
                chunk_id = str(uuid.uuid4())
                doc_chunks[chunk_id] = {"text": chunk_text, "metadata": {"file_name": sku}}

        doc_id = str(uuid.uuid4())
        documents[doc_id] = doc_chunks
    return documents

def map_document_embeddings(documents, tokenizer, model):
    """Compute one embedding per chunk, keeping the document -> chunk nesting.

    Args:
        documents: dict of document id -> dict of chunk id -> record; each record's
            "text" entry is what gets embedded.
        tokenizer: Callable invoked as
            `tokenizer(text, return_tensors="pt", padding=True, truncation=True)`.
        model: Callable invoked with the tokenizer output as keyword arguments; its
            result must expose `last_hidden_state`.

    Returns:
        dict of document id -> dict of chunk id -> embedding (plain Python list),
        obtained by averaging `last_hidden_state` over dimension 1.
    """
    def _embed(chunk_text):
        encoded = tokenizer(chunk_text, return_tensors="pt", padding=True, truncation=True)
        # Inference only: no gradients are needed for building the index.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            return hidden_states.mean(dim=1).squeeze().tolist()

    return {
        doc_key: {
            chunk_key: _embed(record.get("text"))
            for chunk_key, record in chunk_records.items()
        }
        for doc_key, chunk_records in documents.items()
    }

def compute_embeddings(query, tokenizer, model):
    """Embed a query the same way document chunks are embedded.

    Args:
        query: Text passed to the tokenizer.
        tokenizer: Callable invoked as
            `tokenizer(query, return_tensors="pt", padding=True, truncation=True)`.
        model: Callable invoked with the tokenizer output as keyword arguments; its
            result must expose `last_hidden_state`.

    Returns:
        The mean of `last_hidden_state` over dimension 1, squeezed and converted
        to a plain Python list.
    """
    query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    # Inference only: disable autograd so no graph is built for every query,
    # matching how chunk embeddings are computed.
    with torch.no_grad():
        query_embeddings = model(**query_inputs).last_hidden_state.mean(dim=1).squeeze()
    return query_embeddings.tolist()

def calculate_cosine_similarity_score(query_embeddings, chunk_embeddings):
    """Return the cosine similarity between two vectors.

    If either vector has a zero norm the similarity is undefined, and 0 is
    returned instead of dividing by zero.
    """
    query_norm = np.linalg.norm(query_embeddings)
    chunk_norm = np.linalg.norm(chunk_embeddings)
    if query_norm != 0 and chunk_norm != 0:
        return np.dot(chunk_embeddings, query_embeddings) / (chunk_norm * query_norm)
    return 0

def retrieve_top_k_scores(query_embeddings, mapped_document_db, top_k):
    """Score every stored chunk against the query and keep the best `top_k`.

    Args:
        query_embeddings: Query vector.
        mapped_document_db: dict of document id -> dict of chunk id -> embedding.
        top_k: Number of highest-scoring entries to return.

    Returns:
        List of `((doc_id, chunk_id), score)` pairs, highest score first.
    """
    similarity_by_chunk = {
        (doc_key, chunk_key): calculate_cosine_similarity_score(
            query_embeddings, np.array(vector)
        )
        for doc_key, chunk_vectors in mapped_document_db.items()
        for chunk_key, vector in chunk_vectors.items()
    }
    ranked = sorted(similarity_by_chunk.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

def retrieve_top_results(sorted_scores):
    """Flatten `((doc_id, chunk_id), score)` pairs into `(doc_id, chunk_id, score)` tuples."""
    return [
        (doc_key, chunk_key, similarity)
        for (doc_key, chunk_key), similarity in sorted_scores
    ]

def save_json(path, data):
    """Serialize `data` as JSON (4-space indent) into the file at `path`, overwriting it."""
    with open(path, mode='w') as out_file:
        json.dump(data, out_file, indent=4)

def read_json(path):
    """Load and return the JSON content of the file at `path`."""
    with open(path, mode='r') as in_file:
        return json.load(in_file)

def retrieve_text(top_results, document_data):
    """Look up the stored record for the first (best-ranked) entry of `top_results`.

    Args:
        top_results: Sequence of `(doc_id, chunk_id, ...)` entries; only the first
            one is used.
        document_data: dict of document id -> dict of chunk id -> record.

    Returns:
        The record stored under the first entry's document id and chunk id.
    """
    best_hit = top_results[0]
    doc_key, chunk_key = best_hit[0], best_hit[1]
    return document_data[doc_key][chunk_key]

def generate_llm_response(openai_model, query, relavent_text):
    """Ask the chat model to answer `query` using the retrieved chunk as context.

    Args:
        openai_model: Object piped after the prompt template to form the chain.
        query: The user's question.
        relavent_text: Retrieved record; its "text" entry is used as the context.

    Returns:
        Whatever the chain's `invoke` returns, or, if anything in the process
        raises, a string starting with "LLM response skipped due to error:".
    """
    try:
        prompt_text = """
        You are an intelligent search engine. You will be provided with some retrieved context, as well as the users query.

        Your job is to understand the request, and answer based on the retrieved context.
        Here is context:

        <context>
        {context}
        </context>

        Question: {question}
        """
        chat_prompt = ChatPromptTemplate.from_template(template=prompt_text)
        pipeline = chat_prompt | openai_model
        return pipeline.invoke(
            {"context": relavent_text["text"], "question": query}
        )
    except Exception as err:
        # Best-effort: report the failure as text instead of raising to the caller.
        return f"LLM response skipped due to error: {str(err)}"


In [6]:

# --- Retrieval settings ---
directory_path = "documents"
chunk_size = 200
para_seperator = "\n\n"
separator = " "
top_k = 2

# --- Embedding model (tokenizer and encoder loaded from the same checkpoint) ---
model_name = "BAAI/bge-small-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# --- Optional chat model: only created when an OpenAI key is present ---
api_key_exists = "OPENAI_API_KEY" in os.environ
openai_model = ChatOpenAI(model="gpt-3.5-turbo") if api_key_exists else None
if not api_key_exists:
    print("⚠️ OpenAI API key not found. LLM responses will be skipped.")

# --- Build the index: chunk the documents, then embed every chunk ---
documents = chunking(directory_path, tokenizer, chunk_size, para_seperator, separator)
mapped_document_db = map_document_embeddings(documents, tokenizer, model)

# --- Persist both stores, then reload the chunk texts used for lookups ---
save_json('database/doc_store.json', documents)
save_json('database/vector_store.json', mapped_document_db)
document_data = read_json("database/doc_store.json")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing: Black and White Simple Infographic Resume-2 (3).txt
Processing: toddlers_behavior.txt
Processing: Black and White Simple Infographic Resume-2 (5).txt
Processing: Black and White Simple Infographic Resume-2 (4).txt
Processing: Black and White Simple Infographic Resume-2 (2).txt
Processing: Black and White Simple Infographic Resume-2 (1).txt
Processing: Black and White Simple Infographic Resume-2.txt


In [7]:
!pip install gradio --quiet
import gradio as gr

def answer_query(query):
    """Answer a question from the indexed documents.

    Embeds the query, ranks the stored chunks by cosine similarity, and returns
    the text of the best-ranked chunk. When a chat model is configured, its
    answer (based on that chunk) is appended.

    Args:
        query: The question typed by the user.

    Returns:
        A display string with the retrieved text and, optionally, the LLM answer.
    """
    query_embeddings = compute_embeddings(query, tokenizer, model)
    sorted_scores = retrieve_top_k_scores(query_embeddings, mapped_document_db, top_k)
    top_results = retrieve_top_results(sorted_scores)

    # With an empty index there is nothing to look up; say so instead of
    # failing on the first-result lookup.
    if not top_results:
        return "No indexed document chunks are available to search."

    relavent_text = retrieve_text(top_results, document_data)

    result = f"🔎 Retrieved Text:\n{relavent_text['text']}"

    if openai_model:
        llm_response = generate_llm_response(openai_model, query, relavent_text)
        # The chain returns a message object on success and a plain string on
        # failure; show only the message text rather than the object's repr.
        answer_text = getattr(llm_response, "content", llm_response)
        result += f"\n\n🤖 LLM Response:\n{answer_text}"

    return result

# Input: a small two-line box for the question
question_box = gr.Textbox(lines=2, placeholder="Type your question here...")
# Output: a taller box, labelled to show it holds the retrieved text and the optional LLM answer
answer_box = gr.Textbox(lines=15, label="Retrieved & Optional LLM Answer")

interface = gr.Interface(
    fn=answer_query,
    inputs=question_box,
    outputs=answer_box,
    title="Document Q&A Assistant",
    description="Ask anything about your uploaded documents. Works for any topic.",
)

interface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7ae1af39385a426eab.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


