# Qwen2-VL-7B-Instruct

In [None]:
! python -m pip install --upgrade pip

In [None]:
! pip install git+https://github.com/huggingface/transformers

In [None]:
! pip install -r qwen_requirement.txt

In [None]:
! pip install flash-attn --no-build-isolation

Restart the kernel after installing the libraries above so that the newly installed versions are the ones that get imported.

In [1]:
import os
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm
import json


# Open the prediction log. ``log_path`` is the output directory for this run
# and ``log_file`` is the open, writable handle used for all later logging.
log_path = './results/qwen2-7B-V-Instruct/'
os.makedirs(log_path, exist_ok=True)
# The encoding is set explicitly because raw model responses are written to
# this file and may contain non-ASCII characters; relying on the platform
# default encoding can raise UnicodeEncodeError on non-UTF-8 locales.
log_file = open(
    os.path.join(log_path, 'prediction_log.txt'), 'w', encoding='utf-8'
)


# Input tables and image folder all live under the sibling LMM_sewerML project.
sewerml_root = '../LMM_sewerML'
image_dir = sewerml_root + '/images'
df_table_prd = pd.read_csv(sewerml_root + '/results/df_table_prd.csv')
df_table_dsc = pd.read_csv(sewerml_root + '/results/df_table_dsc.csv')


# Load the Qwen2-VL model and its processor from the same checkpoint.
# ``model_id`` is the single source of truth so the model and the processor
# cannot drift apart when the checkpoint name is changed.
model_id = "Qwen/Qwen2-VL-7B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)


# Prompt sent verbatim alongside every image. It asks the model to describe
# the frame and then answer with a JSON object holding a "DESCRIPTION" string
# and a "CODE" that is one of RB, OB, PF, DE, RO, or NoDefect.
prompt_template = """
You are a virtual sewer technician with the capability to analyze images from CCTV cameras taken inside sewer pipes.
Your task is to examine each image and provide a concise, yet accurate, summary for retrieval. 
After summarizing, you must classify the image as defect types.

While providing the summary, remember the following guidelines:

1) Provide a general overview of the image that you see, describing important elements such image clarity, lighting conditions, type of pipe (concrete, PVC, ...), presence of water.
2) Check for defects in the sewer pipes in the image.
3) Pipes in good condition usually show a smooth, unbroken surface, no visible signs of damage like cracks or collapses, and an absence of blockages such as roots.
4) On the other hand, you can have the following defects:
4a) Cracks, Breaks, and Collapses (RB): Identify visible cracks along the pipe, instances where the pipe has fractured or completely broken apart, and areas where the pipe has collapsed.
This includes longitudinal cracks, circumferential breaks, and complete structural failures that compromise the integrity of the sewer system.
4b) Surface Damage (OB): Detect areas of the pipe's interior that exhibit signs of wear, erosion, or damage on the surface.
This includes minor scratches, pitting, scaling, or any form of deterioration that affects the pipe's surface but does not necessarily penetrate deeply into the structure.
4c) Production Error (PF): Identify defects that originated during the pipe's manufacturing process, such as inconsistent pipe thickness, improper joint alignment, or material imperfections.
These are flaws that were introduced before installation and could potentially affect the pipe's performance or longevity.
4d) Deformations (DE): Recognize any alterations in the shape of the pipe, such as bending, sagging, or bulging, that indicate a deformation.
This includes both minor deformations that may affect flow efficiency and major deformations that threaten the pipe's structural integrity.
4e) Roots (RO): Detect the presence of roots infiltrating the sewer pipe, whether through joints, cracks, or other vulnerabilities.
This involves identifying both the initial stages of root intrusion and the more advanced stages where roots have significantly obstructed the pipe.
5) Additional considerations while analyzing the images: do not consider blurred text or user-defined circled areas in the images.
6) You will always try to describe the image that you see. Provide the output in JSON format as follows:

{ "DESCRIPTION": "<Description of the image that you see>", "CODE": "<Defect Code>"}

Note: The "CODE" can be selected from "RB", "OB", "PF", "DE", "RO". If no defect is detected, set "CODE" to "NoDefect".
"""

def _parse_model_response(response):
    """Extract ``(description, code)`` from the model's raw text reply.

    The prompt asks for a bare JSON object, but the reply may wrap it in a
    Markdown code fence or add surrounding text. The reply is first parsed
    as-is; if that fails, the span from the first ``{`` to the last ``}`` is
    tried instead.

    Args:
        response: Decoded text generated by the model.

    Returns:
        A ``(description, predicted_defect)`` tuple of strings. Missing keys
        fall back to ``"No description provided"`` and ``"NoDefect"``.

    Raises:
        ValueError: If no JSON object can be recovered from ``response``
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    try:
        payload = json.loads(response)
    except json.JSONDecodeError:
        start, end = response.find("{"), response.rfind("}")
        if start == -1 or end <= start:
            raise
        payload = json.loads(response[start:end + 1])
    if not isinstance(payload, dict):
        # e.g. a bare string or list: ``.get`` below would not be available.
        raise ValueError("response is valid JSON but not an object")
    description = payload.get("DESCRIPTION", "No description provided")
    code = payload.get("CODE", "NoDefect")
    # Coerce to str so one odd reply (null, list, number) cannot break the
    # metric computation with mixed label types.
    return str(description), str(code)


def _align_to_rows(values, positions, n_rows):
    """Spread ``values`` over a list of ``n_rows`` items, ``None`` elsewhere.

    ``values[i]`` is placed at index ``positions[i]``; rows that were never
    processed (for example, a missing image file) stay ``None``.
    """
    column = [None] * n_rows
    for position, value in zip(positions, values):
        column[position] = value
    return column


# Results for the images that were actually processed (skipped rows excluded).
predictions = []
actual_labels = []
descriptions = []
# Row position in df_table_prd of every processed image, used to write the
# results back to the right rows even when some images were skipped.
processed_positions = []

# Loop-invariant generation settings.
generation_args = {
    "max_new_tokens": 1024,
}

n_rows = len(df_table_prd)

try:
    # Both tables receive one value per row of df_table_prd, assigned by row
    # position. Fail before the long inference run instead of after it.
    if len(df_table_dsc) != n_rows:
        raise ValueError(
            f"df_table_dsc has {len(df_table_dsc)} rows but df_table_prd has "
            f"{n_rows}; results cannot be aligned row by row."
        )

    progress = tqdm(
        enumerate(df_table_prd.iterrows()),
        total=n_rows,
        desc="Processing images",
    )
    for position, (_, row) in progress:
        img_id = row['img_id']
        ground_truth = row['defect_type']
        img_path = os.path.join(image_dir, img_id)

        # Skip rows whose image file is missing; they get no prediction.
        if not os.path.exists(img_path):
            log_file.write(f"Image {img_id} not found. Skipping.\n")
            continue

        # Single-turn chat message: the image followed by the instructions.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img_path},
                    {"type": "text", "text": prompt_template},
                ],
            }
        ]

        # Render the chat template and build the processor inputs.
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, _ = process_vision_info(messages)
        # Follow the model's own device rather than hard-coding "cuda",
        # since placement is delegated to device_map="auto".
        inputs = processor(
            text=[text],
            images=image_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        outputs = model.generate(**inputs, **generation_args)

        # Drop the prompt tokens so only the newly generated text is decoded.
        generated_ids_trimmed = outputs[:, inputs.input_ids.shape[1]:]
        response = processor.decode(
            generated_ids_trimmed[0], skip_special_tokens=True
        )

        # Extract the description and defect code from the model output.
        try:
            description, predicted_defect = _parse_model_response(response)
        except ValueError:
            log_file.write(
                f"Invalid JSON response for Image ID {img_id}: {response}\n"
            )
            description = "Error in JSON response"
            predicted_defect = "Error"

        predictions.append(predicted_defect)
        actual_labels.append(ground_truth)
        descriptions.append(description)
        processed_positions.append(position)

        # Log intermediate outputs
        log_file.write(
            f"Image ID: {img_id} | Predicted: {predicted_defect} | "
            f"Ground Truth: {ground_truth}\n"
        )

    # Evaluation metrics over the processed images only.
    if predictions:
        accuracy = accuracy_score(actual_labels, predictions)
        f1 = f1_score(actual_labels, predictions, average='weighted')
        report = classification_report(actual_labels, predictions)

        log_file.write(f"\nAccuracy: {accuracy:.4f}\n")
        log_file.write(f"F1 Score: {f1:.4f}\n")
        log_file.write("\nClassification Report:\n")
        log_file.write(report)
    else:
        log_file.write("\nNo images were processed; metrics skipped.\n")
finally:
    # Always release the log handle, even if inference fails midway.
    log_file.close()


# Save results to CSV files. Skipped rows are left empty so that the column
# length always matches the table.
df_table_prd['qwen-vl-7B'] = _align_to_rows(
    predictions, processed_positions, n_rows
)
df_table_prd.to_csv(os.path.join(log_path, 'df_table_prd.csv'), index=False)

df_table_dsc['qwen-vl-7B'] = _align_to_rows(
    descriptions, processed_positions, n_rows
)
df_table_dsc.to_csv(os.path.join(log_path, 'df_table_dsc.csv'), index=False)


RuntimeError: Failed to import transformers.models.qwen2_vl.modeling_qwen2_vl because of the following error (look up to see its traceback):
Detected that PyTorch and torchvision were compiled with different CUDA major versions. PyTorch has CUDA Version=12.1 and torchvision has CUDA Version=11.8. Please reinstall the torchvision that matches your PyTorch install.