# RAG with PDF files and OCR

This demo provides an example where retrieval is performed on PDF files.

Note: To run the code with your own data, set the `folder_path` variable (see "Set paths" below) to the folder that contains your PDF files.

In [None]:
# Check if the notebook is running on Google Colab, and if so, install the dependencies
try:
    # Mount Google Drive to access files and directories
    from google.colab import drive
    drive.mount('/content/drive')
    %cd ./drive/MyDrive/

    # Clone Git repository
    !git clone --quiet https://github.com/antoninomariarizzo/rag.git
    %cd ./rag/
    !python -m pip install -e .

    # Install dependencies
    !apt-get install -y tesseract-ocr
    %pip install -r requirements.txt
    
    print("Running on Google Colab")
except ImportError:
    print("Not running on Google Colab")

In [None]:
# Login to Hugging Face
import os
from dotenv import load_dotenv
from huggingface_hub import login, whoami

# Read HF_TOKEN from the environment, after load_dotenv() has pulled in a local .env file
load_dotenv()
token = os.environ.get("HF_TOKEN")

# Report whether a token was found before attempting the login
token_status = "Token loaded successfully!" if token else "Token not found in the .env file"
print(token_status)

# The login is attempted either way, with whatever value was read (possibly None)
login(token=token)

# Check if the login was successful: query the account and report any failure
try:
    user_info = whoami()
    print(f"Logged in successfully as: {user_info['name']}")
except Exception as e:
    print(f"Login failed: {str(e)}")

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. the local `src` package) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

### Import libraries

In [None]:
import os
from src.PDFProcessor import PDFProcessor
from src.LargeLanguageModel import LargeLanguageModel
from src.IndexManager import IndexManager

### Set paths

In [None]:
# Folder that is scanned for the input PDF files
folder_path = "pdf_files/"
# Location of the embedding index: loaded from here if the file exists, created otherwise
index_path = "resources/faiss_embeddings.index"

### Read PDF files from folder  
Extract text from PDF files in the specified folder (e.g., `folder_path = "pdf_files/"`)

In [None]:
# Returns two values: the extracted texts and their accompanying `info` entries
# (presumably one "<file name> <page number>" string per text — TODO confirm)
texts, info = PDFProcessor.extract_text_from_pdfs_in_folder(folder_path)

### Split into smaller chunks for better retrieval

In [None]:
# NOTE(review): `info` is read and then rebound to the value returned alongside the
# chunks, so this cell is not idempotent — re-run the extraction cell before
# re-running this one, otherwise the already-rebound `info` is passed in again.
chunks, info = IndexManager.chunk_texts_and_info(texts, info)

### Create or Load the Embedding Index

Prepare a dataset of embeddings from the text for retrieval, and build an index for efficient similarity search.

Find relevant text and then use an LLM to generate a response based on this text.

In [None]:
# Initialize LLM and IndexManager
# The model id is handed to LargeLanguageModel as-is (presumably resolved on the
# Hugging Face Hub, which would be why the notebook logs in first — TODO confirm)
llm = LargeLanguageModel(model_id="meta-llama/Llama-3.2-3B-Instruct")
# The manager is given the chunked texts, their `info` entries and the LLM instance
index_manager = IndexManager(chunks, info, llm)

In [None]:
# Reuse the index stored at `index_path` when the file exists; otherwise build it.
# NOTE: an existing file is loaded as-is, so delete it after changing the PDFs in
# `folder_path`, or the index from the previous run will be reused.
if os.path.isfile(index_path):
    print("Load embedding index")
    index = index_manager.load_index(index_path)
else:
    print("Create embedding index")
    # Index the chunks — the same list `index_manager` was constructed with —
    # rather than the unchunked `texts`, so that positions in the index line up
    # with the chunk/info entries held by the manager.
    # NOTE(review): assumes create_index embeds the list it is given — confirm.
    index = index_manager.create_index(chunks,
                                       index_path)

### Query PDF documents

##### Retrieval

Retrieve the top-k most relevant references and construct a prompt for the LLM.

In [None]:
# Question asked of the indexed documents
query = "What does this document say about attention mechanism?"
# Each reference is a pair whose first item is the retrieved text
references = index_manager.query(text=query, top_k=5)

# Join the retrieved texts, one per line, and wrap them in the prompt template
retrieved_texts = (ref_text for ref_text, _ in references)
context = "\n".join(retrieved_texts)
prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

##### Generate answer

In [None]:
# Generate the answer for the prompt built in the retrieval cell
answer = llm.decode(prompt)

print("Answer:")
print(answer)
print("\nReferences:")
for text_cur, info_cur in references:
    # Each info entry is unpacked as "<file name> <page number>". Splitting only once,
    # from the right, keeps file names that contain spaces in one piece; a plain
    # split() would yield more than two fields and fail the unpacking.
    fname, page_num = info_cur.strip().rsplit(maxsplit=1)
    # Show only the first 20 characters of the retrieved text as a preview
    print(f"- {fname}, page {page_num}: {text_cur[:20]}")