# 📂 IRS Folder RAG Pipeline with Sentence Chunking + Gradio UI

In [None]:
# # ✅ Fix for Google Colab to avoid punkt_tab errors
# !pip install --upgrade --force-reinstall nltk

# import nltk
# nltk.download("punkt", download_dir="/content/nltk_data")
# nltk.data.path.append("/content/nltk_data")

# # Sanity check
# from nltk.tokenize import sent_tokenize
# print(sent_tokenize("This is a test. Here's another sentence."))


In [2]:
# If running in Colab, uncomment:
!pip install PyMuPDF tqdm sentence-transformers faiss-cpu pandas transformers gradio nltk

Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting gradio
  Downloading gradio-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading

In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import re
import fitz
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize

pdf_folder = "/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/Rag model data/irs_eng_pdfs"

In [7]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    return text.replace("\n", " ").strip()

def split_list(input_list: list, slice_size: int) -> list:
    """Splits a list into chunks of a given size."""
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

num_sentence_chunk_size = 10
pages_and_chunks = []

for filename in tqdm(os.listdir(pdf_folder)):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_folder, filename)
        file_type = (
            "form" if filename.lower().startswith("f") else
            "instruction" if filename.lower().startswith("i") else
            "publication" if filename.lower().startswith("p") else
            "unknown"
        )
        try:
            doc = fitz.open(file_path)
            for page_number, page in enumerate(doc):
                raw_text = page.get_text()
                formatted_text = text_formatter(raw_text)
                sentences = sent_tokenize(formatted_text)
                sentence_chunks = split_list(sentences, num_sentence_chunk_size)
                for chunk in sentence_chunks:
                    joined_chunk = " ".join(chunk).replace("  ", " ").strip()
                    joined_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_chunk)
                    pages_and_chunks.append({
                        "file": filename,
                        "file_type": file_type,
                        "page_number": page_number + 1,
                        "sentence_chunk": joined_chunk,
                        "chunk_char_count": len(joined_chunk),
                        "chunk_word_count": len(joined_chunk.split(" ")),
                        "chunk_token_count": len(joined_chunk) / 4
                    })
        except Exception as e:
            print(f"Error reading {filename}: {e}")

df = pd.DataFrame(pages_and_chunks)
df.to_csv("/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/sentence_chunks_df.csv", index=False)
df.head()


  0%|          | 0/2214 [00:00<?, ?it/s]

MuPDF error: library error: FT_New_Memory_Face(USYDFC+SourceHanSansSC-Bold): invalid argument



Unnamed: 0,file,file_type,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,p5633.pdf,publication,1,OVER-THE-PHONE INTERPRETER (OPI) Languages Rep...,1039,126,259.75
1,f14824.pdf,form,1,Form 14824 (Rev. 10-2022) Catalog Number 69954...,2258,401,564.5
2,f14824.pdf,form,1,"• In either case, to show where you lived, you...",1956,332,489.0
3,f14824.pdf,form,2,Form 14824 (Rev. 10-2022) Catalog Number 69954...,501,81,125.25
4,i109495c.pdf,instruction,1,2024 Instructions for Forms 1094-C and 1095-C ...,1094,149,273.5


# **Embedding our text chunks**

In [9]:
from sentence_transformers import SentenceTransformer
import torch

In [10]:
# Check for GPU and set appropriate precision
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32  # Use FP16 on GPU

# Load the optimized Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2",
                            device=device,
                            trust_remote_code=True)

# Increase batch size for efficient GPU utilization
BATCH_SIZE = 128 if device == "cuda" else 32

# Generate embeddings in batches
text_chunks = df["sentence_chunk"].tolist()
embeddings = model.encode(text_chunks,
                          batch_size=BATCH_SIZE,
                          convert_to_tensor=True,
                          device=device)  # Keeps data on GPU for faster computation

# Convert embeddings to NumPy array (if needed)
df["embedding"] = [embedding.cpu().numpy() for embedding in embeddings]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
import faiss

# Convert embeddings to NumPy array
embedding_matrix = np.array(df["embedding"].tolist()).astype('float32')

# Create FAISS index (for L2/Euclidean search)
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Save FAISS index
faiss.write_index(index, "/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/faiss_index.bin")

df.to_csv("/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/sentence_chunks_df.csv", index=False)

print("✅ Processing Completed! FAISS Index & CSV Saved.")


✅ Processing Completed! FAISS Index & CSV Saved.


In [12]:
# 🔎 **Function for Similarity Search**
def search_similar_text(query_text, top_k=3):
    query_embedding = model.encode([query_text], convert_to_tensor=True).cpu().numpy()
    D, I = index.search(query_embedding, k=top_k)
    similar_chunks = df.iloc[I[0]]["sentence_chunk"].tolist()
    return similar_chunks

# ✅ Example Usage
query = "How do I file my tax returns?"
print("🔎 Similar Chunks Found:\n", search_similar_text(query))

🔎 Similar Chunks Found:
 ['Preparing and filing your tax return. After receiving all your wage and earnings state- ments (Forms W-2, W-2G, 1099-R, 1099-MISC, 1099-NEC, etc. ); unemployment compensation statements (by mail or in a digital format) or other government payment statements (Form 1099-G); and interest, dividend, and retirement statements from banks and investment firms (Forms 1099), you have several options to choose from to prepare and file your tax return. You can prepare the tax return yourself, see if you qualify for free tax preparation, or hire a tax professional to prepare your return. Free options for tax preparation. Go to IRS.gov to see your options for preparing and filing your return online or in your local commun- ity, if you qualify, which include the following. • Direct File. Direct File is a permanent op- tion to file individual federal tax returns on- line—for free—directly and securely with the iRS. Direct File is an option for taxpay- ers in participating s

In [None]:
query_text = "How do I file my tax returns?"
query_embedding = model.encode([query_text]).astype('float32')

# Search in FAISS
D, I = index.search(query_embedding, k=3)  # k=3 means top 3 results

# Retrieve matching text chunks
similar_chunks = df.iloc[I[0]]["sentence_chunk"].tolist()
print(similar_chunks)


In [None]:
from sentence_transformers import SentenceTransformer
import faiss

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df["sentence_chunk"].tolist(), show_progress_bar=True)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))


In [None]:
def retrieve(query, top_k=5, file_type_filter=None):
    query_vec = model.encode([query])
    D, I = index.search(np.array(query_vec), top_k * 2)
    results = df.iloc[I[0]]
    if file_type_filter:
        results = results[results["file_type"] == file_type_filter]
    return results.head(top_k)

In [None]:
from transformers import pipeline
qa_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1", device_map="auto")

def generate_answer(prompt):
    result = qa_model(prompt, max_new_tokens=256, do_sample=True)
    return result[0]["generated_text"]


In [None]:
query = "What are the tax filing deadlines for 2024?"
results = retrieve(query, top_k=5)
context = "\n\n".join(results["sentence_chunk"].tolist())
prompt = f"Answer the question based on the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
response = generate_answer(prompt)
print(response)


In [None]:
import gradio as gr

def rag_chat_interface(query):
    results = retrieve(query, top_k=5)
    context = "\n\n".join(results["sentence_chunk"].tolist())
    prompt = f"Answer the question based on the context below.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"
    response = generate_answer(prompt)
    return response

gr.Interface(fn=rag_chat_interface, inputs="text", outputs="text", title="IRS Tax Assistant (Folder Mode)").launch()
