In [1]:
# import os
# # Configure GPUs
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1" 

In [2]:
# At the start of your notebook, add GPU detection and configuration
import torch

def get_optimal_device_config():
    """Choose torch device strings for the retriever and the vision-language model.

    Returns:
        dict: ``{"rag_device": <str>, "vl_device": <str>}``.

        * CUDA unavailable: both entries are ``"cpu"``.
        * Fewer than two GPUs visible: both entries are ``"cuda:0"``.
        * Two or more GPUs: the two GPUs with the largest *total* memory
          (as reported by ``get_device_properties``; current free memory is
          not consulted). The largest goes to ``rag_device``, the runner-up
          to ``vl_device``.
    """
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()

        # Total memory per GPU index.
        capacity = {}
        for idx in range(gpu_count):
            capacity[idx] = torch.cuda.get_device_properties(idx).total_memory

        if gpu_count < 2:
            # Single GPU: both models share it.
            return dict(rag_device="cuda:0", vl_device="cuda:0")

        # Descending by capacity; the sort is stable, so GPUs with equal
        # memory keep their index order.
        ranked = sorted(capacity, key=capacity.get, reverse=True)
        rag_idx, vl_idx = ranked[0], ranked[1]
        return dict(rag_device=f"cuda:{rag_idx}", vl_device=f"cuda:{vl_idx}")

    return dict(rag_device="cpu", vl_device="cpu")

# Resolve device placement once. Later cells read device_config["rag_device"]
# when loading the retriever and device_config["vl_device"] when loading the
# vision-language model and moving its inputs.
device_config = get_optimal_device_config()

In [3]:
# Download the PDF used in this notebook and save it as physics.pdf in the
# current working directory.
# Alternative document (disabled):
# !wget "https://ir.tesla.com/_flysystem/s3/sec/000162828024002390/tsla-20231231-gen.pdf" -O Tesla_2023_Annual_Report.pdf
!wget "https://ikddata.ilmkidunya.com/images/books/11th-physics-book-ptb-ch2.pdf" -O physics.pdf 

--2024-12-31 12:17:12--  https://ikddata.ilmkidunya.com/images/books/11th-physics-book-ptb-ch2.pdf
Resolving ikddata.ilmkidunya.com (ikddata.ilmkidunya.com)... 54.39.132.143
Connecting to ikddata.ilmkidunya.com (ikddata.ilmkidunya.com)|54.39.132.143|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4194810 (4.0M) [application/pdf]
Saving to: ‘physics.pdf’


2024-12-31 12:17:12 (12.6 MB/s) - ‘physics.pdf’ saved [4194810/4194810]



In [4]:
# !pip install --upgrade git+https://github.com/huggingface/transformers.git 
# !pip install -qU byaldi accelerate qwen_vl_utils pdf2image
# !uv pip install flash-attn
# !sudo apt-get install -y poppler-utils 

!pip install --quiet --upgrade git+https://github.com/huggingface/transformers.git
!pip install -qU byaldi qwen_vl_utils pdf2image
!pip install -qU flash-attn
!sudo apt-get install -y poppler-utils > /dev/null 2>&1

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.9/517.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m336.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# !wget https://arxiv.org/pdf/2409.06697 -O 1.pdf

In [6]:
from byaldi import RAGMultiModalModel
# Load the "vidore/colqwen2-v1.0" retrieval model on the device selected for RAG.
# Optionally, you can specify an `index_root`, which is where it'll save the index. It defaults to ".byaldi/".
RAG = RAGMultiModalModel.from_pretrained(
    "vidore/colqwen2-v1.0",
    device=torch.device(device_config["rag_device"])
) 

# Alternatives to consider:
#   - use a smaller model such as ColPali v1, or
#   - use a bitsandbytes-quantized version (see the byaldi GitHub issues), or
#   - load the model from Hugging Face directly.
#     NOTE(review): the original note read "colipali is supported yet" - confirm
#     whether ColPali is (or is not yet) supported there.

Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


adapter_config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/74.0M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [7]:
%%time
RAG.index(
    input_path="/kaggle/working/physics.pdf", 
    index_name="physics", 
    store_collection_with_index=True, # Whether the index should store the base64 encoded documents.
    overwrite=False 
)

Added page 1 of document 0 to index.
Added page 2 of document 0 to index.
Added page 3 of document 0 to index.
Added page 4 of document 0 to index.
Added page 5 of document 0 to index.
Added page 6 of document 0 to index.
Added page 7 of document 0 to index.
Added page 8 of document 0 to index.
Added page 9 of document 0 to index.
Added page 10 of document 0 to index.
Added page 11 of document 0 to index.
Added page 12 of document 0 to index.
Added page 13 of document 0 to index.
Added page 14 of document 0 to index.
Added page 15 of document 0 to index.
Added page 16 of document 0 to index.
Added page 17 of document 0 to index.
Added page 18 of document 0 to index.
Added page 19 of document 0 to index.
Added page 20 of document 0 to index.
Added page 21 of document 0 to index.
Added page 22 of document 0 to index.
Added page 23 of document 0 to index.
Added page 24 of document 0 to index.
Added page 25 of document 0 to index.
Added page 26 of document 0 to index.
Index exported to .by

{0: '/kaggle/working/physics.pdf'}

In [8]:
from pdf2image import convert_from_path

# Rasterize every PDF page into an image; list position i holds page i + 1.
# Uses the same relative path the download cell writes to, instead of a
# Kaggle-specific absolute path.
images = convert_from_path("physics.pdf")

In [9]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-2B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

def can_use_flash_attention(device=None):
    """Report whether Flash Attention 2 can safely be requested for a model.

    Args:
        device: Optional CUDA device the model will be placed on (an index, a
            ``"cuda:N"`` string, or a ``torch.device``). When omitted, every
            visible GPU must qualify, because the caller has not said which
            GPU the model will land on and the current CUDA device is not
            necessarily that one.

    Returns:
        bool: ``True`` only if CUDA is available, the ``flash_attn`` package
        is importable, and the relevant GPU(s) have compute capability
        >= 8.0. ``False`` otherwise, including for non-CUDA devices.
    """
    import importlib.util

    if not torch.cuda.is_available():
        return False

    # Hardware support is not enough: the kernels ship in the separate
    # flash_attn package, and requesting flash_attention_2 without it is
    # expected to fail at model load time.
    if importlib.util.find_spec("flash_attn") is None:
        return False

    if device is None:
        candidates = list(range(torch.cuda.device_count()))
    else:
        if isinstance(device, (str, torch.device)) and torch.device(device).type != "cuda":
            return False
        candidates = [device]

    if not candidates:
        return False

    # Flash Attention requires Ampere (compute capability >= 8.0) or newer
    return all(torch.cuda.get_device_capability(d)[0] >= 8 for d in candidates)

# Load Qwen2-VL-2B-Instruct in float16 on the device reserved for the
# vision-language model. The attention backend is chosen at load time:
# flash_attention_2 when can_use_flash_attention() allows it, otherwise sdpa.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    device_map=device_config["vl_device"],
    torch_dtype=torch.float16,
    attn_implementation=(
        "flash_attention_2" if can_use_flash_attention() else "sdpa"
    ),
)
# Switch to evaluation mode; the notebook only runs inference.
model = model.eval()


# By default the model accepts 4-16384 visual tokens per image. Narrowing the
# range via min_pixels / max_pixels (e.g. 256-1280 tokens) trades image detail
# for speed and memory. Budgets are written as token_count * 28 * 28 pixels.
min_pixels = 28 * 28 * 256
# max_pixels = 1280*28*28
max_pixels = 28 * 28 * 1024  # Play here
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    max_pixels=max_pixels,
    min_pixels=min_pixels,
)


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [14]:
# Question to ask; it is used both for retrieval and as the text prompt later.
query = """tell me more about a boy jumping in the pool.
"""
# Retrieve the two best-matching pages from the index.
results = RAG.search(query, k=2)

# Page numbers of the hits, in the order the retriever returned them.
pages = [hit['page_num'] for hit in results]
    


In [15]:
# Inputs

# Shift each retrieved page number down by one to get its position in `images`.
image_index = [page_num - 1 for page_num in pages]

# One user turn: an image entry per retrieved page, followed by the text query.
messages = [
    dict(
        role="user",
        content=[{"type": "image", "image": images[pos]} for pos in image_index]
        + [{"type": "text", "text": query}],
    )
]

In [16]:
# Preparation for inference
# Render the chat messages through the processor's chat template without
# tokenizing (tokenize=False), with the generation prompt appended.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Pull the image and video inputs out of the same messages structure.
image_inputs, video_inputs = process_vision_info(messages)

# Batch of one prompt; the processor returns PyTorch tensors ("pt").
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Move every tensor to the vision-language model's device. Note that this
# replaces the processor output with a plain dict.
inputs = {k: v.to(device_config["vl_device"]) for k, v in inputs.items()}

# Inference: Generation of the output
# max_new_tokens caps the answer at 128 new tokens; the sample answer recorded
# below this cell stops mid-sentence, so raise it if full answers are needed.
generated_ids = model.generate(**inputs, max_new_tokens=128)         # optimize the no of tokens

# Drop the first len(in_ids) tokens of each generated sequence (presumably the
# echoed prompt - verify against generate()'s output) so only new tokens remain.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
]

# Decode the remaining token ids to text, skipping special tokens.
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# output_text is a list with one string per prompt in the batch.
print(output_text)


["The boy in the image is jumping into a pool. The forces acting on him can be analyzed using the principles of equilibrium. The forces acting on the boy are the weight of the boy, the upward force of the water, and the upward force of the pool's edge. The weight of the boy is 300 N, and the upward force of the water is 200 N. The upward force of the pool's edge is 300 N. The net force acting on the boy is the sum of these three forces, which is 700 N. Since the boy is at rest, the net"]


In [1]:
# from PIL import Image as PILImage
# from IPython.display import display
# # print(image_index)
# for idx in image_index:
#     print(f'displaying image id: {idx}')
#     display(images[idx])