In [1]:
!pip install PyMuPDF tqdm sentence-transformers faiss-cpu pandas transformers nltk
!pip install -U accelerate bitsandbytes transformers

Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==

In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import re
import fitz
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize

pdf_folder = "/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/Rag model data/irs_eng_pdfs"

In [5]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    return text.replace("\n", " ").strip()

def split_list(input_list: list, slice_size: int) -> list:
    """Splits a list into chunks of a given size."""
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

num_sentence_chunk_size = 10
pages_and_chunks = []

for filename in tqdm(os.listdir(pdf_folder)):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_folder, filename)
        file_type = (
            "form" if filename.lower().startswith("f") else
            "instruction" if filename.lower().startswith("i") else
            "publication" if filename.lower().startswith("p") else
            "unknown"
        )
        try:
            doc = fitz.open(file_path)
            for page_number, page in enumerate(doc):
                raw_text = page.get_text()
                formatted_text = text_formatter(raw_text)
                sentences = sent_tokenize(formatted_text)
                sentence_chunks = split_list(sentences, num_sentence_chunk_size)
                for chunk in sentence_chunks:
                    joined_chunk = " ".join(chunk).replace("  ", " ").strip()
                    joined_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_chunk)
                    pages_and_chunks.append({
                        "file": filename,
                        "file_type": file_type,
                        "page_number": page_number + 1,
                        "sentence_chunk": joined_chunk,
                        "chunk_char_count": len(joined_chunk),
                        "chunk_word_count": len(joined_chunk.split(" ")),
                        "chunk_token_count": len(joined_chunk) / 4
                    })
        except Exception as e:
            print(f"Error reading {filename}: {e}")

df = pd.DataFrame(pages_and_chunks)

df.head()


  0%|          | 0/2214 [00:00<?, ?it/s]

MuPDF error: library error: FT_New_Memory_Face(USYDFC+SourceHanSansSC-Bold): invalid argument



Unnamed: 0,file,file_type,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,p5633.pdf,publication,1,OVER-THE-PHONE INTERPRETER (OPI) Languages Rep...,1039,126,259.75
1,f14824.pdf,form,1,Form 14824 (Rev. 10-2022) Catalog Number 69954...,2258,401,564.5
2,f14824.pdf,form,1,"• In either case, to show where you lived, you...",1956,332,489.0
3,f14824.pdf,form,2,Form 14824 (Rev. 10-2022) Catalog Number 69954...,501,81,125.25
4,i109495c.pdf,instruction,1,2024 Instructions for Forms 1094-C and 1095-C ...,1094,149,273.5


In [6]:
csv_save_path = "/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/RAGTaxAssistant/sentence_chunks_df.csv"
df.to_csv(csv_save_path, index=False)

In [7]:
# Loading saved csv
csv_save_path = "/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/RAGTaxAssistant/sentence_chunks_df.csv"
df = pd.read_csv(csv_save_path)
df.head()

Unnamed: 0,file,file_type,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,p5633.pdf,publication,1,OVER-THE-PHONE INTERPRETER (OPI) Languages Rep...,1039,126,259.75
1,f14824.pdf,form,1,Form 14824 (Rev. 10-2022) Catalog Number 69954...,2258,401,564.5
2,f14824.pdf,form,1,"• In either case, to show where you lived, you...",1956,332,489.0
3,f14824.pdf,form,2,Form 14824 (Rev. 10-2022) Catalog Number 69954...,501,81,125.25
4,i109495c.pdf,instruction,1,2024 Instructions for Forms 1094-C and 1095-C ...,1094,149,273.5


In [8]:
from sentence_transformers import SentenceTransformer
import torch

In [9]:
# Check for GPU and set appropriate precision
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32  # Use FP16 on GPU

# Load a Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2",
                            device=device,
                            trust_remote_code=True)

# Increase batch size for efficient GPU utilization
BATCH_SIZE = 128 if device == "cuda" else 32

# Generate embeddings in batches
text_chunks = df["sentence_chunk"].tolist()
embeddings = model.encode(text_chunks,
                          batch_size=BATCH_SIZE,
                          convert_to_tensor=True,
                          device=device)  # Keeps data on GPU for faster computation

# Convert embeddings to NumPy array (if needed)
df["embedding"] = [embedding.cpu().numpy() for embedding in embeddings]


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
print("device: ", device)
print("dtype: ", dtype)
print("batch size: ", BATCH_SIZE)

device:  cuda
dtype:  torch.float16
batch size:  128


In [11]:
import faiss

# Convert embeddings to NumPy array
embedding_matrix = np.array(df["embedding"].tolist()).astype('float32')

# Create FAISS index (for L2/Euclidean search)
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Save FAISS index
faiss_index_save_path = "/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/RAGTaxAssistant/faiss_index.bin"
faiss.write_index(index, faiss_index_save_path)

df.to_csv(csv_save_path, index=False)

print("Processing Completed! FAISS Index & CSV Saved.")

Processing Completed! FAISS Index & CSV Saved.


In [12]:
import faiss

# Load the FAISS index
faiss_index_save_path = "/content/drive/MyDrive/UTD Coursework/Sem 4/AI Agents project/Tax project/RAGTaxAssistant/faiss_index.bin"
index = faiss.read_index(faiss_index_save_path)

In [13]:
# Function for Similarity Search
def search_similar_text(query_text, top_k=3):
    query_embedding = model.encode([query_text], convert_to_tensor=True).cpu().numpy()
    D, I = index.search(query_embedding, k=top_k)
    similar_chunks = df.iloc[I[0]]["sentence_chunk"].tolist()
    return similar_chunks

# Example Usage
query = "How do I file my tax returns?"
print("🔎 Similar Chunks Found:\n", search_similar_text(query))

🔎 Similar Chunks Found:
 ['Preparing and filing your tax return. After receiving all your wage and earnings state- ments (Forms W-2, W-2G, 1099-R, 1099-MISC, 1099-NEC, etc. ); unemployment compensation statements (by mail or in a digital format) or other government payment statements (Form 1099-G); and interest, dividend, and retirement statements from banks and investment firms (Forms 1099), you have several options to choose from to prepare and file your tax return. You can prepare the tax return yourself, see if you qualify for free tax preparation, or hire a tax professional to prepare your return. Free options for tax preparation. Go to IRS.gov to see your options for preparing and filing your return online or in your local commun- ity, if you qualify, which include the following. • Direct File. Direct File is a permanent op- tion to file individual federal tax returns on- line—for free—directly and securely with the iRS. Direct File is an option for taxpay- ers in participating s

In [14]:
# Get GPU available memory
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 40 GB


In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
use_quantization_config = False
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)

llm_model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.float16,
                                             quantization_config=quantization_config if use_quantization_config else None,
                                             low_cpu_mem_usage=False,
                                             attn_implementation=attn_implementation)
if not use_quantization_config:
    llm_model.to("cuda")

[INFO] Using attention implementation: sdpa


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [27]:
def augment_prompt(query_text, context_list):
  context = "- " + "\n- ".join([item for item in context_list])

  base_prompt = """Based on the following context items, please answer the query.
  Give yourself room to think by extracting relevant passages from the context before answering the query.
  Don't return the thinking, only return the answer.
  Make sure your answers are as explanatory as possible.
  \nNow use the following context items to answer the user query:
  {context}
  \nRelevant passages: <extract relevant passages from the context here>
  User query: {query}
  Answer:"""


  base_prompt = base_prompt.format(context=context, query=query_text)
  dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]
  prompt = tokenizer.apply_chat_template(dialogue_template, tokenize=False, add_generation_prompt=True)
  return prompt

In [28]:
input_text = "Who should file form 943?"

similar_chunks = search_similar_text(input_text, 5)
augmented_prompt = augment_prompt(input_text, similar_chunks)

In [29]:
print(augmented_prompt)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Based on the following context items, please answer the query.
  Give yourself room to think by extracting relevant passages from the context before answering the query.
  Don't return the thinking, only return the answer.
  Make sure your answers are as explanatory as possible.
  
Now use the following context items to answer the user query:
  - . . . . 4 Who Must File Form 943? . . . . .
- . . . . 3 Who Must File Form 8962 . . . . . .
- . . 6 Where Should You File Form 943-X? . . . . . . .
- . . . 5 When Should You File Form 943-X? . . . . . .
- . . . 2 Who Needs To File Form W-2 and Form W-3? . . . . . .
  
Relevant passages: <extract relevant passages from the context here>
  User query: Who should file form 943?
  Answer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [34]:
inputs = tokenizer(augmented_prompt, return_tensors="pt").to('cuda')
output_prompt = llm_model.generate(**inputs, temperature=0.7, do_sample=True, max_new_tokens=512)
outputs_decoded = tokenizer.decode(output_prompt[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Model output (decoded):
<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

Based on the following context items, please answer the query.
  Give yourself room to think by extracting relevant passages from the context before answering the query.
  Don't return the thinking, only return the answer.
  Make sure your answers are as explanatory as possible.
  
Now use the following context items to answer the user query:
  -.... 4 Who Must File Form 943?.....
-.... 3 Who Must File Form 8962......
-.. 6 Where Should You File Form 943-X?.......
-... 5 When Should You File Form 943-X?......
-... 2 Who Needs To File Form W-2 and Form W-3?......
  
Relevant passages: <extract relevant passages from the context here>
  User query: Who should file form 943?
  Answer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Based on the context, the relevant passage is:

".... 4 Who Must File Form 943?....."

According to this passage, the answer is:

"Employers who pay wage