# IMPORTS

In [1]:
import fitz
import tabula
from PIL import Image
from io import BytesIO
import base64
import requests
from bs4 import BeautifulSoup
import pandas as pd

# LLM
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# RAG
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

from langchain.chains import RetrievalQA

# EXTRACT

In [2]:
def extract_from_pdf(pdf_path):

    # Open the PDF file
    doc = fitz.open(pdf_path)
    
    
    # Extract text from each page
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text() + "\n"


    # Extract tables from the PDF file
    tables = tabula.read_pdf(pdf_path, pages="all", multiple_tables=True)
    
    # Extract images from the PDF
    extracted_images = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        image_list = page.get_images(full=True)

        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert image to JPEG
            img_pil = Image.open(BytesIO(image_bytes))
            img_pil = img_pil.convert("RGB")  # Ensure compatibility for JPEG
            img_buffer = BytesIO()
            img_pil.save(img_buffer, format="JPEG")

            # Encode image bytes as base64
            encoded_image = base64.b64encode(img_buffer.getvalue()).decode('utf-8')

            # Store image in list with metadata
            extracted_images.append({
                "page": page_num + 1,
                "index": image_index + 1,
                "format": "jpeg",  # JPEG format
                "image_base64": encoded_image,  # Base64-encoded image
                "width": img_pil.width,
                "height": img_pil.height
            })

    return {
        'text': text,
        'tables': tables,
        'images': extracted_images
    }

In [3]:

def extract_content_from_website(url):
    # Fetch webpage content
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text (all text in the body)
    text = soup.get_text()

    # Extract tables and convert them to pandas DataFrame
    tables = []
    for table in soup.find_all('table'):
        table_data = []
        headers = [header.get_text(strip=True) for header in table.find_all('th')]
        rows = table.find_all('tr')
        
        for row in rows:
            cells = row.find_all('td')
            row_data = [cell.get_text(strip=True) for cell in cells]
            if row_data:
                table_data.append(row_data)
        
        if table_data:
            df = pd.DataFrame(table_data, columns=headers)
            tables.append(df)

    # Extract images and encode them in base64
    images = []
    for img_tag in soup.find_all('img'):
        img_url = img_tag.get('src')
        if img_url:
            # Handle relative image URLs by converting them to absolute URLs
            if not img_url.startswith('http'):
                img_url = requests.compat.urljoin(url, img_url)
            try:
                img_response = requests.get(img_url)
                img = Image.open(BytesIO(img_response.content))
                img_format = img.format  # JPEG, PNG, etc.
                
                # Convert image to base64
                img_buffer = BytesIO()
                img.save(img_buffer, format=img_format)
                encoded_image = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
                
                # Store image data in metadata
                images.append({
                    'url': img_url,
                    'base64': encoded_image,
                    'format': img_format,
                    'width': img.width,
                    'height': img.height
                })
            except Exception as e:
                pass

    return {
        'text': text,
        'tables': tables,
        'images': images
    }

# ADD TO RAG

In [None]:
groq_api_key = "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"

# Initialize Groq LLM (using Mixtral for best performance)
Table_Checker = ChatGroq(temperature=0, groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768", max_tokens=1)
Table_Extractor = ChatGroq(temperature=0, groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768")
Chat = ChatGroq(temperature=0, groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768")

# Initialize embedding model
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Initialize ChromaDB
vector_db = Chroma(embedding_function=embeddings)

  from .autonotebook import tqdm as notebook_tqdm
  vector_db = Chroma(embedding_function=embeddings)


In [13]:
# Function to add text to RAG
def add_text_to_rag(text, custom_id):
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    
    # Split the text into chunks
    texts = text_splitter.split_text(text)
    
    # Create Document objects with the provided custom ID
    documents = []
    for text_chunk in texts:
        doc = Document(page_content=text_chunk, metadata={'source': custom_id})
        documents.append(doc)
    
    # Add documents to ChromaDB with the custom ID
    vector_db.add_documents(documents)

    '''documents = [
        Document(page_content="Document content 1", metadata={"source": "pdf_123"}),
        Document(page_content="Document content 2", metadata={"source": "pdf_123"}),
        Document(page_content="Document content 3", metadata={"source": "pdf_456"})
    ]

    # Corresponding IDs for the documents
    ids = ["doc_1", "doc_2", "doc_3"]

    # Add documents to ChromaDB
    vector_db.add_documents(documents=documents, ids=ids)'''

# Function to filter and extract information from tables
def extract_info_from_table(table, custom_id):

    prompt_template = ChatPromptTemplate(
        input_variables=["table"],
        messages=[
            {"role": "user", "content": "{table}\n\nAnalyze the above table and tell me whether it is a proper table or no. Just return 'True' or 'False' based on you decision. I want no other output."}
        ]
    )

    llm_chain = LLMChain(llm=Table_Checker, prompt=prompt_template)
    extracted_text = llm_chain.run(table=table)

    good_table = True if 'True' in extracted_text else False

    if not good_table:
        return

    # Define a prompt template for table summarization
    prompt_template = ChatPromptTemplate(
        input_variables=["table"],
        messages=[
            {"role": "user", "content": "{table}\n\nRead the above table and get each of its records in proper serialized format."}
        ]
    )

    # Run LLM on the table data
    llm_chain = LLMChain(llm=Table_Extractor, prompt=prompt_template)
    extracted_text = llm_chain.run(table=table)

    add_text_to_rag(extracted_text, custom_id)


def delete_documents_by_custom_id(custom_id):
    vector_db.delete(where={'source': {'$eq': custom_id}})

In [6]:
# Example usage
pdf_path = "1706.03762v7.pdf"
pdf_content = extract_from_pdf(pdf_path)
ID = 100

add_text_to_rag(pdf_content['text'], ID)
for table in pdf_content['tables']:
    extract_info_from_table(table, ID)

# Example usage
url = 'https://builtin.com/artificial-intelligence/deepseek-r1'  # Replace with the URL of the website you want to scrape
website_content = extract_content_from_website(url)
ID = 200

add_text_to_rag(website_content['text'], ID)
for table in website_content['tables']:
    extract_info_from_table(table, ID)

  llm_chain = LLMChain(llm=Table_Checker, prompt=prompt_template)
  extracted_text = llm_chain.run(table=table)


# CHAT

In [7]:
# Function to run RAG-based QA
def ask_question(question):
    retriever = vector_db.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(llm=Chat, retriever=retriever)
    response = qa_chain.run(question)
    return response

In [18]:
print(ask_question("write the mathematic for attention."))

Number of requested results 4 is greater than number of elements in index 0, updating n_results = 0


I'm sorry, I'm not sure I understand your question. Are you asking for a mathematical formula related to attention or focus? If so, I'm afraid I don't have any specific mathematical formulas to share. Attention and focus are complex cognitive processes that are studied in psychology and neuroscience, and while there may be mathematical models used to describe certain aspects of these processes, there is no simple formula that can capture their full complexity. If you could provide more context or clarify your question, I would be happy to try and help further.
