### FLOPs Calculation Using calflops

In [None]:
import os
import gc
import torch
import requests
import warnings
import traceback
from PIL import Image
from calflops import calculate_flops
from transformers import AutoModel, AutoProcessor, AutoModelForCausalLM, AutoTokenizer

In [None]:
import os

# Pin this process to the GPU with index 1. CUDA-aware libraries read this
# variable when they initialise CUDA, so it should be set before the first
# CUDA call is made.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [None]:

# --- Script Configuration ---

# Silence *all* Python warnings for the rest of the session (this is broader
# than just Hugging Face warnings) to keep the report output readable.
warnings.filterwarnings("ignore")

# --- Model & Input Configuration ---

# Models to analyze. Uncomment an entry to include it in the run.
# Every entry ends with a comma so that toggling a neighbouring line can never
# trigger implicit string concatenation of two adjacent model ids.
MODEL_IDS = [
    # "facebook/flava-full",
    # "uclanlp/visualbert-vqa-coco-pre",
    # "dandelin/vilt-b32-finetuned-vqa",
    "microsoft/Florence-2-large",
    # "meta-llama/Meta-Llama-3.1-8B-Instruct",
    # "Qwen/Qwen2-VL-2B-Instruct",
    # "HuggingFaceTB/SmolVLM-Instruct",
    # "microsoft/Phi-3-vision-128k-instruct",
]

# Common settings for dummy inputs
DUMMY_PROMPT = "Describe the contents of the image in detail."
DUMMY_IMAGE = Image.new('RGB', (336, 336), 'blue')  # A common VLM input size

# --- Hardware and Precision Settings ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Default dtype for the branches that honour it (FLAVA, Llama, Florence-2).
# It is float32 on every device; several other branches in the analysis loop
# pick their own dtype (bfloat16 / "auto") regardless of this setting.
TORCH_DTYPE = torch.float32

print(f"Using device: {DEVICE} with dtype: {TORCH_DTYPE}")
print("NOTE: For 'meta-llama/Meta-Llama-3.1-8B-Instruct', you must be logged into Hugging Face")
print("with an account that has been granted access to the model.")

# --- Main Analysis Loop ---

# For every configured model: load it, build one representative dummy input,
# run calflops on a single forward pass, and print params / MACs / FLOPs.
# A failure in one model is reported and does not stop the remaining models.
for model_id in MODEL_IDS:
    print(f"\n{'='*40}\nAnalyzing Model: {model_id}\n{'='*40}")

    # Pre-bind every name released in `finally` so cleanup works even when
    # loading fails part-way through.
    model = None
    processor = None
    tokenizer = None
    inputs = None

    try:
        # Some models require trusting remote code to load custom architectures
        trust_remote_code = any(m in model_id for m in ["Qwen", "Phi-3", "Florence-2", "SmolVLM"])

        # Using "eager" attention for consistent FLOPs calculation. Profilers like calflops
        # may not correctly hook into optimized implementations like Flash Attention, leading
        # to inaccurate (often lower) FLOPs counts.
        attn_implementation = "eager"

        # --- Model-Specific Loading and Input Preparation ---

        if "flava" in model_id:
            model = AutoModel.from_pretrained(
                model_id,
                torch_dtype=TORCH_DTYPE,
            ).to(DEVICE)
            processor = AutoProcessor.from_pretrained(model_id)
            inputs = processor(
                text=[DUMMY_PROMPT],
                images=[DUMMY_IMAGE],
                return_tensors="pt"
            ).to(device=DEVICE)

        elif "Llama-3.1" in model_id:
            # This is a large, gated model. Requires login and GPU with significant VRAM.
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=TORCH_DTYPE,
                device_map="auto"  # Helps distribute the model across available GPUs/CPU RAM
            )
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            messages = [{"role": "user", "content": "What are the main differences between FLOPs and MACs?"}]
            # Apply the model-specific chat template for correct input formatting
            input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = tokenizer(input_text, return_tensors="pt").to(DEVICE)

        elif "Florence-2" in model_id:
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=TORCH_DTYPE,
                trust_remote_code=trust_remote_code,
            ).to(DEVICE)
            processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)
            # Florence-2 uses task-specific prompts in the text input
            prompt_with_task = "<MORE_DETAILED_CAPTION>"
            # Move the *whole* batch to the model's device; previously only
            # decoder_input_ids was moved, leaving input_ids / pixel_values on CPU.
            inputs = processor(
                text=prompt_with_task,
                images=[DUMMY_IMAGE],
                return_tensors="pt"
            ).to(DEVICE)
            # The forward pass also needs decoder inputs; reuse the prompt tokens.
            inputs['decoder_input_ids'] = inputs['input_ids']

        elif "smolvlm" in model_id.lower():
            from transformers import AutoModelForVision2Seq

            processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
            model = AutoModelForVision2Seq.from_pretrained(
                model_id,
                torch_dtype=torch.bfloat16,
            ).to(DEVICE)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": DUMMY_PROMPT},
                    ],
                },
            ]
            prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
            inputs = processor(text=prompt, images=[DUMMY_IMAGE], return_tensors="pt")
            inputs = inputs.to(DEVICE)

        elif "qwen" in model_id.lower():
            from transformers import Qwen2VLForConditionalGeneration

            # Load the checkpoint named by model_id (not a hard-coded one) so the
            # reported numbers belong to the model announced in the header above.
            model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_id, torch_dtype=torch.bfloat16, device_map="auto"
            )
            processor = AutoProcessor.from_pretrained(model_id)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": DUMMY_PROMPT},
                    ],
                }
            ]

            # Preparation for inference
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            inputs = processor(
                text=[text], images=[DUMMY_IMAGE], padding=True, return_tensors="pt"
            )
            inputs = inputs.to(DEVICE)

        elif "vilt" in model_id.lower():
            from transformers import ViltProcessor, ViltForQuestionAnswering

            url = "http://images.cocodataset.org/val2017/000000039769.jpg"
            # Bound the download time and fail loudly on HTTP errors instead of
            # handing an error page to PIL.
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()
            image = Image.open(response.raw)
            text = "How many cats are there?"

            # The model must live on the same device as its inputs.
            model = ViltForQuestionAnswering.from_pretrained(model_id).to(DEVICE)
            processor = ViltProcessor.from_pretrained(model_id)

            inputs = processor(image, text, return_tensors="pt")
            inputs = inputs.to(DEVICE)

        elif "phi-3" in model_id.lower():
            # Use the shared eager-attention setting (see above) and the detected
            # device rather than a hard-coded "cuda" / flash-attention combination.
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map=DEVICE,
                trust_remote_code=True,
                torch_dtype="auto",
                _attn_implementation=attn_implementation,
            )
            processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

            message = [
                {"role": "user", "content": "<|image_1|>\nWhat is shown in these images? Strictly choose answer from options without any extra text: \nBank \nCollege \nSchool \nLibrary"},
            ]
            # Machine-specific test image; fall back to the synthetic image when
            # the file is not present so the script stays runnable elsewhere.
            image_path = "/home/aritrad/test_images/Capture_2.JPG"
            image = Image.open(image_path) if os.path.exists(image_path) else DUMMY_IMAGE

            combined_prompt = processor.tokenizer.apply_chat_template(
                message, tokenize=False, add_generation_prompt=True
            )
            inputs = processor(combined_prompt, [image], return_tensors="pt").to(model.device)

        else:  # Generic handler for other conversational VLMs
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                _attn_implementation=attn_implementation,
                device_map='auto'
            )

            # AutoProcessor bundles the tokenizer and image processor for VLMs
            processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=trust_remote_code)

            # Construct the chat prompt based on each model's required format.
            # `messages` is always assigned here; previously it was only set for
            # Phi-3-vision, which raised NameError (or silently reused a stale
            # value from an earlier iteration) for every other model.
            if "Phi-3-vision" in model_id:
                messages = [{"role": "user", "content": f"<|image_1|>\n{DUMMY_PROMPT}"}]
            else:
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image"},
                            {"type": "text", "text": DUMMY_PROMPT},
                        ],
                    }
                ]

            prompt = processor.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            # The processor handles both text and images; the resulting batch
            # must sit on the same device as the (auto-placed) model.
            inputs = processor(
                text=prompt,
                images=[DUMMY_IMAGE],
                return_tensors="pt"
            ).to(model.device)

        # --- FLOPs Calculation ---
        if inputs is not None:
            # `calflops` can accept the inputs dictionary directly as keyword arguments
            flops, macs, params = calculate_flops(
                model=model,
                kwargs=dict(inputs),
                print_results=False,  # We will print our own formatted output
                print_detailed=False,
            )
            print(f"✅ Successfully analyzed {model_id}")
            print(f"   - Total Parameters: {params}")
            print(f"   - MACs: {macs}")
            print(f"   - FLOPs (Floating Point Operations): {flops}")

    except Exception as e:
        # Top-level boundary for one model: report the failure with its full
        # stack trace and carry on with the next model.
        print(f"❌ Failed to process {model_id}. Error: {e}")
        print(traceback.format_exc())

    finally:
        # --- Cleanup to free CPU/GPU memory for the next model ---
        del model, processor, tokenizer, inputs
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n{'='*40}\nAnalysis Complete.\n{'='*40}")