In [1]:
# First, make sure you've set your Colab runtime to GPU (Runtime -> Change runtime type -> GPU)

# Install necessary libraries with specific transformers version
!pip install transformers==4.44.2  # <--- Change this line!
!pip install bitsandbytes mmengine flash-attn opencv-python
!pip install xtuner[deepspeed]==0.1.23
!pip install timm==1.0.9
!pip install mmdet==3.3.0
!pip install hydra-core==1.3.2
!pip install ninja==1.11.1
!pip install decord==0.6.0
!pip install peft==0.11.1
!pip install pycocotools

!git clone https://github.com/shuyansy/EarthMind.git

%pwd
%ls
%cd EarthMind
%ls


Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m122.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers


In [2]:
%%writefile demo.py
import argparse
import os

from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

import cv2
try:
    from mmengine.visualization import Visualizer
except ImportError:
    Visualizer = None
    print("Warning: mmengine is not installed, visualization is disabled.")


def parse_args():
    parser = argparse.ArgumentParser(description='Video Reasoning Segmentation')
    parser.add_argument('--image_folder', default="demo_images", help='Path to image file')
    parser.add_argument('--model_path', default="sy1998/EarthMind-4B")
    parser.add_argument('--work-dir', default="result", help='The dir to save results.')
    parser.add_argument('--text', type=str, default="<image>Please segment the left chimney.")
    parser.add_argument('--select', type=int, default=-1)
    args = parser.parse_args()
    return args


def visualize(pred_mask, image_path, work_dir):
    visualizer = Visualizer()
    img = cv2.imread(image_path)
    visualizer.set_image(img)
    visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
    visual_result = visualizer.get_image()

    output_path = os.path.join(work_dir, os.path.basename(image_path))
    cv2.imwrite(output_path, visual_result)

if __name__ == "__main__":
    cfg = parse_args()
    model_path = cfg.model_path

    if torch.cuda.is_available():
        if torch.cuda.get_device_properties(0).major >= 8:
            compute_dtype = torch.bfloat16
            attn_impl = "flash_attention_2" # Can use Flash Attention
            print("Detected Ampere+ GPU, using bfloat16 and Flash Attention 2.")
        else:
            compute_dtype = torch.float16
            attn_impl = "eager" # <--- IMPORTANT: Use eager for older GPUs
            print(f"Detected older GPU (compute capability < 8), using float16 and eager attention.")
    else:
        compute_dtype = torch.float32
        attn_impl = "eager"
        print("CUDA not available. Using CPU for computation and eager attention. This might be very slow.")


    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=compute_dtype,
        device_map="auto",
        trust_remote_code=True,
        offload_buffers=True,
        attn_implementation=attn_impl
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True
    )

    image_files = []
    image_paths = []
    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}
    for filename in sorted(list(os.listdir(cfg.image_folder))):
        if os.path.splitext(filename)[1].lower() in image_extensions:
            image_files.append(filename)
            image_paths.append(os.path.join(cfg.image_folder, filename))

    vid_frames = []
    for img_path in image_paths:
        img = Image.open(img_path).convert('RGB')
        vid_frames.append(img)


    if cfg.select > 0:
        img_frame = vid_frames[cfg.select - 1]

        print(f"Selected frame {cfg.select}")
        print(f"The input is:\n{cfg.text}")
        result = model.predict_forward(
            image=img_frame,
            text=cfg.text,
            tokenizer=tokenizer,
        )
        prediction = result['prediction']
        print(f"The output is:\n{prediction}")

        if '[SEG]' in prediction and Visualizer is not None:
            _seg_idx = 0
            pred_masks = result['prediction_masks'][_seg_idx]
            if pred_masks.ndim == 3: # If multiple masks are returned (e.g., for video)
                 # Assuming the first dimension is the batch/frame dimension
                 for frame_idx in range(len(vid_frames)):
                    if frame_idx < pred_masks.shape[0]: # Ensure we don't go out of bounds
                         pred_mask = pred_masks[frame_idx]
                         if cfg.work_dir:
                            os.makedirs(cfg.work_dir, exist_ok=True)
                            visualize(pred_mask, image_paths[frame_idx], cfg.work_dir)
                         else:
                            os.makedirs('./temp_visualize_results', exist_ok=True)
                            visualize(pred_mask, image_paths[frame_idx], './temp_visualize_results')
            elif pred_masks.ndim == 2: # If a single mask is returned
                 # Use the single mask for the selected frame
                 pred_mask = pred_masks
                 if cfg.work_dir:
                    os.makedirs(cfg.work_dir, exist_ok=True)
                    visualize(pred_mask, image_paths[cfg.select -1], cfg.work_dir)
                 else:
                    os.makedirs('./temp_visualize_results', exist_ok=True)
                    visualize(pred_mask, image_paths[cfg.select - 1], './temp_visualize_results')
            else:
                print("Unexpected prediction_masks dimensions.")


    else:
        # If not selecting a single frame, pass the first frame as a single image
        # as the error suggests predict_forward might not handle a list here.
        img_frame = vid_frames[0]
        print("##########", vid_frames[0])
        print(f"The input is:\n{cfg.text}")
        result = model.predict_forward(
            image=img_frame,
            text=cfg.text,
            tokenizer=tokenizer,
        )
        prediction = result['prediction']
        print(f"The output is:\n{prediction}")

        if '[SEG]' in prediction and Visualizer is not None:
            _seg_idx = 0
            pred_masks = result['prediction_masks'][_seg_idx]
            # If only one image was processed, pred_masks should have shape [1, H, W] or [H, W]
            # We need to iterate over the original number of frames to visualize results for each
            # Assuming the single predicted mask should be applied to all frames in this mode
            if pred_masks.ndim == 3:
                 single_pred_mask = pred_masks[0] # Take the first mask if multiple are returned
            elif pred_masks.ndim == 2:
                 single_pred_mask = pred_masks # Use the mask directly if it's 2D
            else:
                 print("Unexpected prediction_masks dimensions when not selecting a frame.")
                 single_pred_mask = None

            if single_pred_mask is not None:
                 for frame_idx in range(len(vid_frames)):
                     if cfg.work_dir:
                        os.makedirs(cfg.work_dir, exist_ok=True)
                        visualize(single_pred_mask, image_paths[frame_idx], cfg.work_dir)
                     else:
                        os.makedirs('./temp_visualize_results', exist_ok=True)
                        visualize(single_pred_mask, image_paths[frame_idx], './temp_visualize_results')
        else:
            pass

Overwriting demo.py


In [3]:
!pip uninstall flash-attn -y
!pip cache purge # Clear pip's cache to ensure fresh download/build
# Make sure you've confirmed torch is 2.6.0+cu124
# If you get an error that nvcc is not found, you might need to install build essentials,
# but usually Colab has it.
!pip install flash-attn==2.7.4.post1 --no-build-isolation --verbose

Found existing installation: flash_attn 2.8.0.post2
Uninstalling flash_attn-2.8.0.post2:
  Successfully uninstalled flash_attn-2.8.0.post2
Files removed: 149
Using pip 24.1.2 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)
Collecting flash-attn==2.7.4.post1
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command python setup.py egg_info
  /usr/local/lib/python3.11/dist-packages/setuptools/__init__.py:94: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
  !!

          ********************************************************************************
          Requirements should be satisfied by a PEP 517 installer.
          If you are using pip, you can try `pip install --use-pep517`.
          ********************************************************************************

  !!
    dist.fetch_build_egg

In [4]:
!python demo.py --image_folder demo_images --text "<image>Please segment the left chimney."

Detected Ampere+ GPU, using bfloat16 and Flash Attention 2.
config.json: 100% 5.25k/5.25k [00:00<00:00, 28.7MB/s]
configuration_earthmind_chat.py: 100% 4.35k/4.35k [00:00<00:00, 25.6MB/s]
configuration_phi3.py: 100% 10.4k/10.4k [00:00<00:00, 50.7MB/s]
A new version of the following files was downloaded from https://huggingface.co/sy1998/EarthMind-4B:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
configuration_internlm2.py: 100% 7.00k/7.00k [00:00<00:00, 35.1MB/s]
A new version of the following files was downloaded from https://huggingface.co/sy1998/EarthMind-4B:
- configuration_internlm2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
configuration_intern_vit.py: 100% 5.55k/5.55k [00:00<00:00, 37.2MB/s]
A new version of the following files was downloaded fr

In [5]:
!python demo.py --image_folder demo_images --text "<image>Please segment the red house. Tell about risks related with floods and moisture and perform a deep environmental analysis."

Detected Ampere+ GPU, using bfloat16 and Flash Attention 2.
Loading checkpoint shards: 100% 4/4 [00:04<00:00,  1.23s/it]
########## <PIL.Image.Image image mode=RGB size=1024x768 at 0x7A6486290990>
The input is:
<image>Please segment the red house. Tell about risks related with floods and moisture and perform a deep environmental analysis.
propagate in video: 100% 1/1 [00:00<00:00, 5497.12it/s]
The output is:
Sure, [SEG].<|end|>


In [6]:
!python demo.py --image_folder demo_images --text "<image>Tell about risks related with floods and moisture and perform a deep environmental analysis for the red house."

Detected Ampere+ GPU, using bfloat16 and Flash Attention 2.
Loading checkpoint shards: 100% 4/4 [00:04<00:00,  1.10s/it]
########## <PIL.Image.Image image mode=RGB size=1024x768 at 0x78EB2D53A5D0>
The input is:
<image>Tell about risks related with floods and moisture and perform a deep environmental analysis for the red house.
The output is:
Based on the image provided, it can be observed that there are multiple harbors and ships present in the water. This suggests that the area is likely a coastal or waterfront region with a significant amount of maritime activity. 

In terms of risks related to floods and moisture, the presence of multiple harbors and ships in the water indicates that the area may be prone to flooding, especially during heavy rainfall or storm surges. Additionally, the proximity of the harbors and ships to each other suggests that there may be a risk of flooding or water overflow affecting multiple areas simultaneously.

As for the red house, it is located at the top