In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Required Installations

In [None]:
!pip install sentence-transformers transformers scikit-learn --quiet


Load & Preprocess .txt Files

In [1]:
import os

# Define the path to your folder
folder_path = '/content/drive/My Drive/q&a_task'

all_files = []
for root, dirs, files in os.walk(folder_path):
    for file in files:
        full_path = os.path.join(root, file)
        all_files.append(full_path)

# Step 4: Print the list of all files
print(f"Found {len(all_files)} files.")
for f in all_files:
    print(f)


Found 12 files.
/content/drive/My Drive/q&a_task/api.txt
/content/drive/My Drive/q&a_task/authentication.txt
/content/drive/My Drive/q&a_task/configuration.txt
/content/drive/My Drive/q&a_task/error_codes.txt
/content/drive/My Drive/q&a_task/export_logs.txt
/content/drive/My Drive/q&a_task/faq.txt
/content/drive/My Drive/q&a_task/installation.txt
/content/drive/My Drive/q&a_task/intro.txt
/content/drive/My Drive/q&a_task/reset_password.txt
/content/drive/My Drive/q&a_task/usage.txt
/content/drive/My Drive/q&a_task/user_roles.txt
/content/drive/My Drive/q&a_task/webhooks.txt


In [2]:
import os

# Load all text files from the specified directory
def load_text_files(directory):
    text_data = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text_data.append(file.read())
    return text_data

directory = '/content/drive/My Drive/q&a_task'
texts = load_text_files(directory)
texts


['The HTTP API supports three endpoints:\nPOST /api/index   → build or rebuild the index\nPOST /api/query   → payload: { question, top_k }\nGET  /api/status  → returns { ready: true } when the index is ready\n',
 'Before running, set your API keys as environment variables.\nUse OPENAI_API_KEY for OpenAI models and LLAMA_API_KEY for hosted Llama endpoints.\nTo use a local embedding model, adjust settings in config/settings.py.\nNever commit your API keys to version control.\n',
 'All settings live in config/settings.py:\nEMBEDDING_MODEL, LLM_MODEL, VECTOR_STORE, CHUNK_SIZE, TOP_K.\nTo override defaults, export the corresponding environment variables.\nRestart the application after making any changes.\n',
 '1001: Configuration file not found\n1002: Missing API key\n2001: Vector store unreachable\n3001: LLM generation timed out\n',
 'To export past queries to CSV, run:\n  python export_logs.py --output logs.csv\nThe CSV includes timestamp, question, answer, and sources.\nCustomize fields 

In [3]:
import re

# Preprocess text by removing unnecessary whitespace, special characters, and normalizing line breaks
def preprocess_text(text):
    # Remove leading/trailing spaces
    text = text.strip()

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Optional: Remove non-text characters (e.g., special symbols or punctuation) if needed
    text = re.sub(r'[^\w\s]', '', text)  # Uncomment if you want to remove punctuation

    # Normalize line breaks (convert any \n or \r\n to a single space)
    text = re.sub(r'[\n\r]+', ' ', text)

    return text

# Load and preprocess all text files in the directory
def load_and_preprocess_text_files(directory):
    text_data = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                raw_text = file.read()
                cleaned_text = preprocess_text(raw_text)
                text_data.append(cleaned_text)
    return text_data

# Load and preprocess the text files
directory = '/content/drive/My Drive/q&a_task' # replace with your directory path
texts = load_and_preprocess_text_files(directory)

# Check the cleaned data (first 5 texts as example)
for i, text in enumerate(texts):
    print(f"Cleaned Text {i+1}: {text[:300]}...")  # Display first 300 characters of each cleaned file


Cleaned Text 1: The HTTP API supports three endpoints POST apiindex  build or rebuild the index POST apiquery  payload  question top_k  GET apistatus  returns  ready true  when the index is ready...
Cleaned Text 2: Before running set your API keys as environment variables Use OPENAI_API_KEY for OpenAI models and LLAMA_API_KEY for hosted Llama endpoints To use a local embedding model adjust settings in configsettingspy Never commit your API keys to version control...
Cleaned Text 3: All settings live in configsettingspy EMBEDDING_MODEL LLM_MODEL VECTOR_STORE CHUNK_SIZE TOP_K To override defaults export the corresponding environment variables Restart the application after making any changes...
Cleaned Text 4: 1001 Configuration file not found 1002 Missing API key 2001 Vector store unreachable 3001 LLM generation timed out...
Cleaned Text 5: To export past queries to CSV run python export_logspy output logscsv The CSV includes timestamp question answer and sources Customize fields by edit

2. Split Texts into Chunks

Chunks are created with the minimum word count found in any file. This ensures even small files get their own chunk.

In [4]:
def calculate_min_file_word_count(texts):
    # Calculate the word count for each file and return the minimum word count
    min_word_count = float('inf')  # Start with a very large number

    for text in texts:
        word_count = len(text.split())
        if word_count < min_word_count:
            min_word_count = word_count

    return min_word_count

def split_text_into_chunks_based_on_min_file(texts):
    # Calculate the minimum word count based on the smallest file
    min_words_per_chunk = calculate_min_file_word_count(texts)
    print(f"Minimum Words per Chunk: {min_words_per_chunk}")  # Debugging line to check min count

    chunks = []
    current_chunk = []

    for text in texts:
        words = text.split()
        for word in words:
            current_chunk.append(word)

            # When current chunk reaches the minimum word count, finalize the chunk and start a new one
            if len(current_chunk) >= min_words_per_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []  # Start a new chunk

    return chunks

# Apply the chunking logic based on the smallest file word count
chunks = split_text_into_chunks_based_on_min_file(texts)

# Check the result
print(f"Total Chunks: {len(chunks)}")
for idx, chunk in enumerate(chunks):  # Display the first 5 chunks for inspection
    print(f"Chunk {idx+1}: {chunk[:300]}...")  # Display the first 300 characters of each chunk


Minimum Words per Chunk: 18
Total Chunks: 22
Chunk 1: The HTTP API supports three endpoints POST apiindex build or rebuild the index POST apiquery payload question top_k...
Chunk 2: GET apistatus returns ready true when the index is ready Before running set your API keys as environment...
Chunk 3: variables Use OPENAI_API_KEY for OpenAI models and LLAMA_API_KEY for hosted Llama endpoints To use a local embedding model...
Chunk 4: adjust settings in configsettingspy Never commit your API keys to version control All settings live in configsettingspy EMBEDDING_MODEL...
Chunk 5: LLM_MODEL VECTOR_STORE CHUNK_SIZE TOP_K To override defaults export the corresponding environment variables Restart the application after making any...
Chunk 6: changes 1001 Configuration file not found 1002 Missing API key 2001 Vector store unreachable 3001 LLM generation timed...
Chunk 7: out To export past queries to CSV run python export_logspy output logscsv The CSV includes timestamp question answer...
Chunk 

3. Generate Embeddings with Sentence-BERT

Use the all-MiniLM-L6-v2 model — small and fast.

In [24]:
from sentence_transformers import SentenceTransformer

# Load the Sentence-BERT model
e_model = SentenceTransformer('all-MiniLM-L6-v2')

# Check if the model is loaded properly
print("Sentence-BERT Model Loaded!")


Sentence-BERT Model Loaded!


In [25]:
import numpy as np

embeddings = e_model.encode(chunks, convert_to_tensor=True)


print(f"Generated Embeddings: {embeddings.shape}")


Generated Embeddings: torch.Size([22, 384])


4. Retrieve Relevant Chunks with Cosine Similarity

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_top_k_chunks_cosine(query, model, embeddings, chunks, k=3):
    query_embedding = model.encode([query], convert_to_tensor=True)
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    top_k_indices = similarities.argsort()[-k:][::-1]
    top_chunks = [chunks[i] for i in top_k_indices]
    return top_chunks, similarities[top_k_indices]


5. Generate Answer with Falcon-1B(LLM)

---



Falcon is light and works on free Colab without needing an API.

In [32]:
def generate_answer_directly(query, top_chunks):
    context = "\n\n".join(top_chunks)
    prompt = f"""Use the following context to answer the question:

Context:
{context}

Question: {query}
Answer:"""
    result = generator(prompt)[0]["generated_text"]
    return result.split("Answer:")[-1].strip()


Final Integration: Query, Retrieve, and Generate Answer

In [34]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "tiiuae/falcon-rw-1b"  # Light model, free and usable on CPU

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = 0 if torch.cuda.is_available() else -1

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=256,
    temperature=0.7,
    do_sample=True,
)

def answer_question(query):
    context = "\n\n".join(retrieve_chunks(query))
    prompt = f"""
Use the following context to answer the question:

Context:
{context}

Question: {query}
Answer:"""
    result = generator(prompt)[0]["generated_text"]
    return result.split("Answer:")[-1].strip()


KeyboardInterrupt: 

In [38]:
query = "How do I reset my password?"
top_chunks, _ = retrieve_top_k_chunks_cosine(query, e_model, embeddings, chunks, k=2)

print("Most relevant answer:", top_chunks[0])
print("Second relevant answer:", top_chunks[1])

answer = generate_answer_directly(query, top_chunks)

print("🔍 Question:", query)
print("🧠 Answer:", answer)



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Most relevant answer: To reset your password open Settings and choose Security Click Forgot Password and enter your registered email address
Second relevant answer: Check your inbox for the reset link and follow the instructions If no email arrives within five minutes
🔍 Question: How do I reset my password?
🧠 Answer: Changing your password will ensure you are the only person who can access your account, and will help improve your security..
This post originally appeared on the author’s blog
