# This code is generated by Grok

In [None]:
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings 
from langchain.prompts import PromptTemplate
import fitz  # PyMuPDF for PDF handling
from base64 import b64encode
from typing import Literal, List, Dict, Tuple, NoReturn


In [69]:

# Load environment variables from .env file
load_dotenv()


True

In [70]:
def load_azure_client() -> AzureOpenAI:
    """Create an Azure OpenAI client configured from the environment.

    Calls ``load_dotenv()`` so values from a local ``.env`` file are loaded
    before the client is built, then constructs ``AzureOpenAI()`` with no
    explicit arguments. No deployment/model name is bound here; the callers
    in this file pass ``model=...`` on every request.

    Returns:
        AzureOpenAI: A newly constructed client instance.
    """
    # NOTE(review): presumably the SDK picks up endpoint, key and API version
    # from environment variables -- confirm the expected variable names.
    load_dotenv()

    return AzureOpenAI()



In [71]:
# Module-level client; query_system() further down reads this global directly.
client = load_azure_client()

In [72]:

# Embedding model shared by the cells below: it embeds each generated image
# summary and is handed to FAISS both when the index is built and when it is
# loaded again for querying.
# NOTE(review): no endpoint/key arguments are passed, so presumably they are
# taken from environment variables -- confirm.
embedding_model = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002"
)


In [73]:

# Step 1-2: Given a PDF, extract its embedded images into images/, one file per doc_id
def process_pdf(pdf_path, output_dir="images"):
    """Extract every embedded image of a PDF into ``output_dir``.

    Each image is written to ``<output_dir>/doc_<page>_<index>.png`` where
    ``<page>`` is the zero-based page number and ``<index>`` is the image's
    zero-based position in that page's image list.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Folder that receives the image files; it is created when
            missing. Defaults to ``"images"``, the folder the later steps in
            this file read from.

    Returns:
        bool: Always ``True`` once every image has been written.
    """
    # open(..., "wb") does not create missing parent folders, so make sure the
    # target folder exists before writing the first image.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first field of an image entry is its xref
                base_image = doc.extract_image(xref)
                doc_id = f"doc_{page_num}_{img_index}"
                # NOTE(review): the bytes are written exactly as extracted, so
                # the ".png" suffix may not match the real image format
                # (presumably reported in base_image["ext"]). The suffix is
                # kept because later steps look up "<doc_id>.png".
                target_path = os.path.join(output_dir, f"{doc_id}.png")
                with open(target_path, "wb") as f:
                    f.write(base_image["image"])
    return True


In [74]:
process_pdf("content/attention.pdf")

True

In [75]:

# Sample remote image URL; the commented-out generate_image_summary(...) call
# further down uses it as input.
online_transformer_image_path = "https://media.geeksforgeeks.org/wp-content/uploads/20250529164119039854/Screenshot-2024-05-24-112357-768.webp"

# Step 4: Generate summaries for each image using GPT-4o-mini
def generate_image_summary(image_path: str):
    """Ask the ``gpt-4o-mini`` deployment to describe a single image.

    Args:
        image_path (str): Value placed in the request's ``image_url`` field,
            for example an ``https://`` URL of the image.

    Returns:
        The complete chat-completion response object (not just the text).
        The generated description is available at
        ``response.choices[0].message.content``.
    """
    client = load_azure_client()

    messages = [
        {
            "role": "system",
            "content": "You are an expert vision assistant. Generate a summary for the given image only."
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail."},
                {"type": "image_url", "image_url": {"url": image_path}}
            ]
        }
    ]

    # The full response is returned on purpose (the existing contract); the
    # previously computed-but-unused `summary` local has been dropped.
    return client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=4096,
        temperature=1.0,
        top_p=1.0
    )


In [76]:
# generate_image_summary(online_transformer_image_path)

In [None]:
# ===  Images summary generation ===
def generate_summary_of_each_image(
        extracted_images_folder_path: str,
        verbose: bool = True

        ) -> Tuple[object, Dict[str, str], Dict[str, List[float]]]:

    """ Summarise every image file in a folder with the LLM and embed each summary.

        Each regular file in the folder is read, base64-encoded into a
        ``data:image/png`` URL and sent to ``gpt-4o-mini``. The returned summary
        text is then embedded with the module-level ``embedding_model``.

    Args:
        extracted_images_folder_path (str): Path of folder that contains extracted images from PDF
        verbose (bool): When True, print one progress line per image.

    Returns:
        Tuple: ``(response, generated_summary, embed_summaries)`` where
            ``response`` is the chat-completion response of the last image
            processed (``None`` when the folder contains no files),
            ``generated_summary`` maps each doc_id to its summary text and
            ``embed_summaries`` maps each doc_id to the embedding of that summary.

    Raises:
        FileNotFoundError: If ``extracted_images_folder_path`` does not exist.
    """

    # checking provided folder path exists or not
    if not os.path.exists(extracted_images_folder_path):
        raise FileNotFoundError(f"Folder doesn't exists: {extracted_images_folder_path}. Try to provide full folder path.")

    ## loading model
    client = load_azure_client()

    embed_summaries: Dict[str, List[float]] = {}     # doc_id -> embedding of the summary
    generated_summary: Dict[str, str] = {}           # doc_id -> summary text
    response = None                                  # stays None when there is nothing to process

    # Keep regular files only (sub-folders cannot be opened as images) and sort
    # them so the processing order does not depend on the file system.
    image_names: List[str] = sorted(
        name for name in os.listdir(extracted_images_folder_path)
        if os.path.isfile(os.path.join(extracted_images_folder_path, name))
    )
    total_images_count = len(image_names)

    ## iteration of each image
    for index, image_name in enumerate(image_names, start=1):
        # doc_id is the file name without its final extension; splitext keeps
        # any earlier dots in the name intact.
        doc_id = os.path.splitext(image_name)[0]

        image_full_path = os.path.join(extracted_images_folder_path, image_name)

        # fetching image data
        with open(image_full_path, "rb") as f:
            image_b64 = b64encode(f.read()).decode("utf-8")

        if verbose:
            print(f"Generating summary of image ({index}/{total_images_count}).... ")

        data_url = f"data:image/png;base64,{image_b64}"

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert vision assistant."
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Generate a detailed summary for the given image only. Making sure you summarize everything what you see in the image."},
                        {"type": "image_url", "image_url": {"url": data_url}}
                    ]
                }
            ],
            max_tokens=4096,
            temperature=1.0,
            top_p=1.0
        )

        summary: str = response.choices[0].message.content      # summary text from the LLM response
        generated_summary[doc_id] = summary
        embed_summaries[doc_id] = embedding_model.embed_query(summary)

    return response, generated_summary, embed_summaries

In [132]:
# _,summaries, embed_summaries = generate_summary_of_each_image('images')

In [133]:
# embed_summaries

In [134]:

# Step 5-6: Generate embeddings and save to FAISS
def save_to_vector_store():
    """Build one FAISS index from all image-summary embeddings and save it.

    Generates the summaries and their embeddings for every image in
    ``images``, stores all ``(doc_id, embedding)`` pairs in a single FAISS
    index and writes that index to the local ``faiss_index`` folder.

    Returns:
        bool: ``True`` after the index has been saved.

    Raises:
        ValueError: If no image summaries were produced, so there is nothing
            to index.
    """
    # getting summaries
    _, _, embed_summaries = generate_summary_of_each_image("images")

    if not embed_summaries:
        raise ValueError("No image summaries were generated; nothing to index.")

    # Build the store once from every pair. Creating it inside a per-item loop
    # would replace the store each time and keep only the last image.
    # The doc_id is stored as the entry's text so a search hit can be mapped
    # back to its image file.
    vector_store = FAISS.from_embeddings(
        list(embed_summaries.items()),
        embedding=embedding_model
    )

    vector_store.save_local("faiss_index")

    return True


In [135]:
save_to_vector_store()

Generating summary of image (1/3).... 
Generating summary of image (2/3).... 
Generating summary of image (3/3).... 


True

In [156]:

# Step 7-9: Handle user query and display response
def query_system(user_query):
    """Answer ``user_query`` using the indexed image that best matches it.

    Loads the FAISS index saved in ``faiss_index``, runs a
    ``similarity_score_threshold`` retriever (threshold 0.6) for the query,
    reads the image file named by the first hit and asks ``gpt-4o-mini`` to
    answer the query in the context of that image.

    Args:
        user_query: The question text.

    Returns:
        The answer text from the model, or ``None`` (after printing a notice)
        when the retriever returns no results.
    """
    # NOTE(review): allow_dangerous_deserialization=True lets the loader
    # unpickle the stored index; only load index folders created locally.
    index = FAISS.load_local(
        folder_path="faiss_index",
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )

    retriever = index.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={'score_threshold': 0.6},
    )
    matches = retriever.invoke(user_query)

    # Guard clause: nothing passed the threshold, so there is no image to use.
    if not matches:
        print("I don't know (No image applicable for you this query)")
        return

    # The stored text of each entry is the doc_id, i.e. the image file stem.
    top_doc_id = matches[0].page_content
    with open(f"images/{top_doc_id}.png", "rb") as image_file:
        image_b64 = b64encode(image_file.read()).decode("utf-8")
    data_url = f"data:image/png;base64,{image_b64}"

    system_message = {
        "role": "system",
        "content": "You are an expert vision assistant. Don't use phrases like 'In the context of image...' or its relevant.",
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": f"Answer the query in the context of provided image. Query: {user_query}"},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    # Uses the module-level `client`.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        max_tokens=4096,
        temperature=1.0,
        top_p=1.0,
    )

    return completion.choices[0].message.content


In [157]:
response = query_system("why v is different than k")
response

'V (values) and K (keys) are two distinct components in the attention mechanism of neural networks. The primary difference lies in their roles within the attention process. \n\n1. **Purpose**: \n   - K (keys) represents information that the model uses to determine relevance when computing attention scores. It serves as the criteria against which queries will be matched.\n   - V (values), on the other hand, contains the actual information to be attended to or the output that will be produced based on the attention weights calculated from the queries and keys.\n\n2. **Computation**:\n   - The attention mechanism calculates similarity scores between queries (Q) and keys (K). These scores dictate how much emphasis should be placed on each value (V).\n   - After computing the attention scores, these scores are used to weight the values (V), determining how much information from V is passed on to subsequent layers.\n\nIn summary, while K is used to assess the relevance and scoring through qu

In [2]:
import os
os.path.exists('images')

True