source: https://github.com/jerryjliu/llama_index/blob/main/examples/multimodal/Multimodal.ipynb

In [None]:
# Install the notebook's dependencies into the kernel's environment
# (-U upgrades to the latest release, -qq silences pip's output).
# NOTE(review): versions are unpinned; the imports below use names such as
# GPTSimpleVectorIndex that look tied to a specific llama-index release --
# consider pinning `llama-index==<version>` so Restart & Run All stays reproducible.
%pip install -Uqq llama-index langchain

In [1]:
from pprint import pprint

from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex
from llama_index.readers.file.base import (
    DEFAULT_FILE_EXTRACTOR, 
    ImageParser,
)
from llama_index.response.notebook_utils import (
    display_response, 
    display_image,
)
from llama_index.indices.query.query_transform.base import (
    ImageOutputQueryTransform,
)

In [2]:
# Parser used for image files. parse_text=True presumably turns on text
# extraction from each image -- TODO confirm against the ImageParser docs.
image_parser = ImageParser(parse_text=True)

# Build a *new* extension -> parser mapping rather than calling .update() on
# DEFAULT_FILE_EXTRACTOR. That object is imported from llama_index at module
# level, so mutating it in place would silently change the default parsers for
# every other reader created in this kernel.
file_extractor = {
    **DEFAULT_FILE_EXTRACTOR,
    ".jpg": image_parser,
    ".png": image_parser,
    ".jpeg": image_parser,
}


# NOTE: we add filename as metadata for all documents
def filename_fn(filename):
    """Return the metadata dict attached to a file: ``{'file_name': filename}``."""
    return {'file_name': filename}

In [3]:
# Reader configuration: read from the local `img` directory, route files through
# the extension -> parser mapping built above, and attach the file-name metadata
# produced by `filename_fn`.
reader_options = {
    'input_dir': 'img',
    'file_extractor': file_extractor,
    'file_metadata': filename_fn,
}
img_reader = SimpleDirectoryReader(**reader_options)

# Load the directory contents; `imgs` is indexed and queried in later cells.
imgs = img_reader.load_data()

Downloading (…)rocessor_config.json: 100%|██████████| 362/362 [00:00<00:00, 103kB/s]
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Downloading (…)okenizer_config.json: 100%|██████████| 536/536 [00:00<00:00, 146kB/s]
Downloading (…)ncepiece.bpe.model";: 100%|██████████| 1.30M/1.30M [00:00<00:00, 13.5MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 4.02M/4.02M [00:00<00:00, 8.03MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 1.52k/1.52k [00:00<00:00, 428kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 335/335 [00:00<00:00, 118kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 4.74k/4.74k [00:00<00:00, 1.47MB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 806M/806M [00:21<00:00, 37.1MB/s] 


In [18]:
# Inspect the text of the first loaded document to see what the image parser
# produced. `pprint` is imported in the imports cell at the top of the notebook
# so that all dependencies are declared in one place.
pprint(imgs[0].text)

('<s_menu><s_nm> IFO broadcast</s_nm><s_cnt> A</s_cnt><s_price> B<sep/><s_nm> '
 'ml G</s_nm><s_cnt> 1</s_cnt><s_price> ml</s_price><sep/><s_nm> ml '
 'M2</s_nm><s_cnt> 3</s_cnt><s_price> '
 'm2</s_price></s_menu><s_sub_total><s_subtotal_price> node must be delivered '
 'in the</s_subtotal_price></s_sub_total><s_total><s_total_price> they were '
 'sent.</s_total_price></s_total>')


In [4]:
# Build a vector index over the loaded image documents. Per the log output of
# this cell, building the index uses embedding tokens only (0 LLM tokens).
imgs_index = GPTSimpleVectorIndex(imgs)

INFO:llama_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 184 tokens


In [5]:
# Natural-language question to run against the image index.
fifo_question = 'Explain the messages sent in the given diagram about FIFO broadcast'

# NOTE(review): this transform appears to make the answer include the source
# image as an HTML <img> tag -- the rendered response carries width="400",
# matching the width passed here. Confirm against the llama-index docs.
image_transform = ImageOutputQueryTransform(width=400)

img_response = imgs_index.query(fifo_question, query_transform=image_transform)

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 399 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 14 tokens


In [6]:
# Render the query result in the notebook: the final answer (including the
# embedded image tag) followed by the source node(s) with their similarity score.
display_response(img_response)

**`Final Response:`** The given diagram is an illustration of a FIFO (First In First Out) broadcast. It shows the order in which messages are sent and received. The diagram shows three messages, each with a name, count, and price. The first message is IFO broadcast with a count of A and a price of B. The second message is ml G with a count of 1 and a price of ml. The third message is ml M2 with a count of 3 and a price of m2. The diagram also shows a subtotal and a total, indicating the total cost of the messages sent. 

<img src="img/fifo.png" width="400" />

---

**`Source Node 1/1`**

**Document ID:** 7591b551-3524-4726-b769-98f1e66c10b6<br>**Similarity:** 0.8041960290914032<br>**Text:** file_name: img/fifo.png

<s_menu><s_nm> IFO broadcast</s_nm><s_cnt> A</s_cnt><s_price> B<sep/><s_...<br>