In [1]:
import base64
from io import BytesIO
from PIL import Image
from langchain_ollama import ChatOllama
import os
from tqdm import tqdm 
import shutil
from pdf2image import convert_from_path
from mm_embedding import mcdse
import pickle 
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser

# Loading multimodal model

In [3]:
# Create the chat model wrapper for 'llama3.2-vision' and send a text-only
# prompt as a quick check that the model responds.
llm = ChatOllama(model='llama3.2-vision')
llm.invoke('what is 1 +1?')

AIMessage(content='The answer to 1 + 1 is:\n\n2!', additional_kwargs={}, response_metadata={'model': 'llama3.2-vision', 'created_at': '2025-01-21T12:33:07.944789567Z', 'done': True, 'done_reason': 'stop', 'total_duration': 2649250798, 'load_duration': 13414148, 'prompt_eval_count': 17, 'prompt_eval_duration': 227000000, 'eval_count': 13, 'eval_duration': 2407000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-19eff4d2-5067-48dd-a41e-195f698ddc95-0', usage_metadata={'input_tokens': 17, 'output_tokens': 13, 'total_tokens': 30})

# Downloading the multimodal embedding model

In [None]:
# from modelscope import snapshot_download
# model_dir = snapshot_download('AI-ModelScope/mcdse-2b-v1', cache_dir ='D:\\Users\\a0105')

# Loading multimodal embedding model

In [8]:
# Instantiate the mcdse embedding wrapper (imported from the local
# mm_embedding module) on the CPU.
mcdse_model = mcdse(device='cpu')

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


In [None]:
# Smoke test: embed a single query string and keep the first (only) row as a
# plain Python list. `a` is not referenced again in the visible notebook.
a = mcdse_model.encode_queries(['what is the revenue of the company']).tolist()[0]

# Create vectorstore

In [None]:
def delete_folder_contents(folder_path):
    """
    Delete every entry (files, symlinks and subfolders) inside a folder.

    The folder itself is kept. A failure on one entry is reported with
    ``print`` and does not stop the remaining entries from being processed.

    Args:
        folder_path: The path to the folder to be emptied.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed (for example when
            it does not exist).
    """
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            # Only recurse into real directories. os.path.isdir() follows
            # symlinks, and shutil.rmtree refuses to run on a symlink, so a
            # link to a directory must be unlinked instead; that also leaves
            # the link target untouched.
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                shutil.rmtree(file_path)
            else:
                # Regular files, symlinks (including broken ones) and any
                # other non-directory entry.
                os.unlink(file_path)
        except Exception as e:
            print(f"Error deleting {file_path}: {e}")

In [None]:
%time
pages = convert_from_path('./NVIDIA-2024-Annual-Report.pdf')
delete_folder_contents('./images')
# Save each page as a JPEG file using Pillow
for i, page in enumerate(pages):
    page.save(f'./images/page_{str(i).zfill(5)}.jpg', 'JPEG')

In [5]:
# Open every file found in ./images as a PIL image (no embedding happens here).
# NOTE(review): os.listdir returns entries in arbitrary order, and a later cell
# pairs the vectors with os.listdir('./images') again purely by position.

img_list = [Image.open('./images/' + page_file) for page_file in os.listdir('./images')]
len(img_list)

187

In [None]:
# Embed the pages one at a time; each call gets a single-image batch and the
# first (only) row of the result is stored as a plain Python list.
vects = []
for page_image in tqdm(img_list):
    page_vector = mcdse_model.encode_documents([page_image]).tolist()[0]
    vects.append(page_vector)

In [None]:
# Cache the page embeddings on disk so the encoding loop does not have to be
# re-run in a later session.
with open("vectors.pkl", "wb") as cache_file:
    pickle.dump(vects, cache_file)

In [4]:
# Reload the cached page embeddings.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load a
# vectors.pkl that this notebook produced.
with open("vectors.pkl", "rb") as cache_file:
    vects = pickle.load(cache_file)
len(vects)

187

In [29]:
# Peek at the first two cached page vectors (the full lists are displayed).
vects[:2]

[[-0.051513671875,
  0.01953125,
  0.0206298828125,
  0.03466796875,
  -0.060302734375,
  0.00335693359375,
  0.045654296875,
  0.05615234375,
  0.1162109375,
  -0.0014190673828125,
  -0.05224609375,
  0.048583984375,
  -0.033935546875,
  -0.049560546875,
  -0.04638671875,
  0.007110595703125,
  0.053955078125,
  -0.01348876953125,
  -0.0185546875,
  0.0703125,
  -0.035400390625,
  0.0257568359375,
  -0.10595703125,
  0.032470703125,
  -0.01043701171875,
  -0.00014019012451171875,
  -0.0244140625,
  -0.056884765625,
  -0.11474609375,
  -0.040771484375,
  0.00628662109375,
  0.03564453125,
  0.021484375,
  0.0751953125,
  -0.0040283203125,
  -0.0703125,
  0.047119140625,
  0.01123046875,
  -0.07275390625,
  0.0390625,
  -0.045654296875,
  0.00183868408203125,
  -0.00982666015625,
  0.0517578125,
  0.06982421875,
  -0.04931640625,
  -0.005157470703125,
  0.027587890625,
  0.014404296875,
  -0.078125,
  -0.1318359375,
  -0.00701904296875,
  -0.026123046875,
  -0.03955078125,
  -0.02294921

In [6]:
from chromadb import Client

client = Client()
# get_or_create_collection keeps this cell re-runnable in the same kernel;
# create_collection raises once "my_collection" already exists.
collection = client.get_or_create_collection(
    name="my_collection",
)

# Filenames and vectors are paired purely by position.
# NOTE(review): this assumes os.listdir('./images') yields the files in the
# same order as when `vects` was computed - confirm, especially when `vects`
# was loaded from vectors.pkl in a different session.
image_names = os.listdir('./images')
if len(image_names) != len(vects):
    raise ValueError(
        f"Found {len(image_names)} files in ./images but {len(vects)} vectors; "
        "regenerate the embeddings before building the collection."
    )

# upsert (instead of add) so re-running the cell overwrites the same ids
# rather than colliding with the entries written by the previous run.
collection.upsert(
    documents=image_names,
    embeddings=vects,
    ids=[str(i).zfill(5) for i in range(len(vects))],
)

# Similarity Search

In [22]:
# Question used by the retrieval and answer-generation cells that follow.
query = 'what is the revenue increment'

In [None]:
%time
query_output = collection.query(
    mcdse_model.encode_queries([
        query
    ]).tolist()[0])

In [11]:
# Page filenames (stored as the collection's documents) returned for the
# first and only query.
query_output['documents'][0]

['page_00023.jpg',
 'page_00122.jpg',
 'page_00135.jpg',
 'page_00138.jpg',
 'page_00139.jpg',
 'page_00032.jpg',
 'page_00149.jpg',
 'page_00137.jpg',
 'page_00123.jpg',
 'page_00178.jpg']

In [12]:
def convert_to_base64(pil_image):
    """
    Encode a PIL image as a Base64 JPEG string.

    The image is not resized. Images whose mode the JPEG encoder cannot write
    (for example ``RGBA`` or palette ``P``) are converted to ``RGB`` first
    instead of raising ``OSError`` on save.

    :param pil_image: PIL image
    :return: Base64-encoded JPEG bytes, decoded to a UTF-8 string
    """
    # Modes Pillow's JPEG writer accepts as-is; anything else is converted.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_modes:
        pil_image = pil_image.convert("RGB")

    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

In [20]:
# Base64-encode the first page returned for the query so it can be embedded
# in the chat message.
image_b64 = convert_to_base64(
    Image.open(f"./images/{query_output['documents'][0][0]}")
)

In [23]:
def prompt_func(data):
    """
    Build the chat input for one image plus one text question.

    :param data: mapping with a "text" entry (the question) and an "image"
        entry (Base64 JPEG data, inserted into a ``data:image/jpeg`` URL)
    :return: a one-element list holding a HumanMessage whose content lists
        the image part first and the text part second
    """
    question = data["text"]
    encoded_image = data["image"]
    content = [
        {
            "type": "image_url",
            "image_url": f"data:image/jpeg;base64,{encoded_image}",
        },
        {"type": "text", "text": question},
    ]
    return [HumanMessage(content=content)]


# Pipeline: build the image+text message, send it to the model, and reduce the
# model's reply to a plain string.
chain = prompt_func | llm | StrOutputParser()

# Despite its name, `query_chain` holds the answer string for `query`.
query_chain = chain.invoke({"text": query, "image": image_b64})

print(query_chain)

The revenue increment is 126% to a record $60.9 billion.


### Wrap as a function and test

In [24]:
def RAG_test(query):
    """
    Answer a question from the single best-matching page image.

    Embeds ``query``, looks up the nearest entries in ``collection``, prints
    up to five of the returned page filenames, then sends the first page
    image together with the question to ``chain`` and prints the answer.

    :param query: the question to ask, as a string
    :return: None; the filenames and the answer are printed
    """
    query_vector = mcdse_model.encode_queries([query]).tolist()[0]
    query_output = collection.query(query_embeddings=query_vector)
    imgs = query_output['documents'][0]
    print(imgs[:5])

    # An empty collection yields no documents; stop instead of failing with
    # IndexError on imgs[0].
    if not imgs:
        print("No matching page found.")
        return

    # Only the first returned page is shown to the model. The context manager
    # closes the image file once it has been encoded.
    with Image.open(f"./images/{imgs[0]}") as top_page:
        image_b64 = convert_to_base64(top_page)

    query_chain = chain.invoke({"text": query, "image": image_b64})
    print(query_chain)

In [25]:
# Check 1: a short factual lookup.
RAG_test("who is the CEO of Nvidia?")

['page_00049.jpg', 'page_00185.jpg', 'page_00103.jpg', 'page_00111.jpg', 'page_00104.jpg']
Jensen Huang.


In [26]:
# Check 2: a numeric lookup.
# NOTE: the query spells the company "Nivida"; it is left as-is so the
# recorded output still corresponds to the exact prompt that produced it.
RAG_test("how much is the revenue of Nivida in 2024?")

['page_00149.jpg', 'page_00154.jpg', 'page_00135.jpg', 'page_00178.jpg', 'page_00151.jpg']
The revenue for 2024 is $60,922 million.


In [27]:
# Check 3: another numeric lookup.
# NOTE: the query spells the company "Nivida"; it is left as-is so the
# recorded output still corresponds to the exact prompt that produced it.
RAG_test("how much is the income of Nivida in 2024?")

['page_00174.jpg', 'page_00149.jpg', 'page_00151.jpg', 'page_00150.jpg', 'page_00135.jpg']
The image presents a financial statement for NVIDIA Corporation, detailing its income and expenses from January 28, 2024, to January 30, 2022. The relevant section for determining NVIDIA's income in 2024 is "Income before income taxes," which lists the total income before taxes as $33,818 million.

**Key Points:**

*   **Year-End Date:** The financial statement covers a period ending on January 28, 2024.
*   **Income Before Taxes:** The company's total income before taxes for this period is $33,818 million.

**Conclusion:**
NVIDIA Corporation's income in 2024 was $33,818 million.


In [28]:
# Check 4: an open-ended "why" question rather than a single-value lookup.
RAG_test("what makes you think the high increment of the revenue of Nvidia in 2024 ? ")

['page_00023.jpg', 'page_00135.jpg', 'page_00149.jpg', 'page_00150.jpg', 'page_00138.jpg']
The high increment of NVIDIA's revenue in 2024 can be attributed to several factors:

1. **Accelerated Computing**: NVIDIA has been at the forefront of accelerated computing, which enables faster processing and improved performance for various applications such as gaming, scientific simulations, and artificial intelligence (AI). The company's leadership in this area contributes significantly to its revenue growth.

2. **Artificial Intelligence (AI) Adoption**: The increasing adoption of AI across industries is driving demand for NVIDIA's products and services. As more companies integrate AI into their operations, they require powerful computing hardware and software from NVIDIA to support these efforts.

3. **Gaming Industry Growth**: The gaming industry continues to experience steady growth, with the global market projected to reach $190 billion by 2025. NVIDIA's graphics processing units (GPUs)