In [2]:
import os
import argparse
from datetime import datetime

import torch
import numpy as np
import cv2
# Two different objects are both called `Image`. The bare name keeps its
# original binding (the later import, IPython.display.Image, wins) so existing
# cells behave exactly as before; use the explicit aliases in new code.
from PIL import Image
from PIL import Image as PILImage
from IPython.display import Image  # noqa: F811 - intentional rebinding, see note above
from IPython.display import Image as DisplayImage

from seggpt_engine import inference_image, inference_video, inference_video_by_image
import models_seggpt

# Three-element mean / standard-deviation vectors (named after ImageNet).
# They are not referenced by any other cell visible in this notebook.
imagenet_mean = np.asarray((0.485, 0.456, 0.406), dtype=np.float64)
imagenet_std = np.asarray((0.229, 0.224, 0.225), dtype=np.float64)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the model
def prepare_model(chkpt_dir, arch='seggpt_vit_large_patch16_input896x448', seg_type='instance'):
    """Build a model from ``models_seggpt``, load checkpoint weights, return it in eval mode.

    Args:
        chkpt_dir: Path of the checkpoint file passed to ``torch.load``. The
            loaded object must hold the weights under the ``'model'`` key.
        arch: Name of the factory callable looked up on ``models_seggpt``.
        seg_type: Value stored on the model as ``model.seg_type``.

    Returns:
        The constructed model with the checkpoint weights applied and
        ``eval()`` already called. Checkpoint tensors are mapped to CPU while
        loading; move the model with ``.to(device)`` afterwards if needed.

    Warns:
        UserWarning: if the checkpoint and the model do not share exactly the
            same parameter names. Loading is non-strict, so such a mismatch
            would otherwise go unnoticed and leave layers at their initial
            values.
    """
    import warnings

    # build model
    model = getattr(models_seggpt, arch)()
    model.seg_type = seg_type

    # load model
    # NOTE(review): torch.load unpickles the file - only load checkpoints from
    # a trusted source.
    checkpoint = torch.load(chkpt_dir, map_location='cpu')
    load_result = model.load_state_dict(checkpoint['model'], strict=False)

    missing = list(load_result.missing_keys)
    unexpected = list(load_result.unexpected_keys)
    if missing or unexpected:
        # Non-strict loading is kept (no exception), but the mismatch is surfaced.
        warnings.warn(
            f"Checkpoint {chkpt_dir!r} does not fully match arch {arch!r}: "
            f"{len(missing)} missing key(s) {missing[:5]}, "
            f"{len(unexpected)} unexpected key(s) {unexpected[:5]}"
        )

    model.eval()
    return model

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The architecture name gets its own variable so that `model` only ever refers
# to the loaded network (it used to be a string first and the network afterwards).
arch = 'seggpt_vit_large_patch16_input896x448'
checkpoint = 'seggpt_vit_large.pth'

model = prepare_model(checkpoint, arch).to(device)
print('Model loaded.')

Model loaded.


## 一枚の画像をセグメンテーション

**TODO:** ここに処理の流れを示すイメージ画像を貼る。

In [8]:
input_image_path = 'images/inputs/penguin_input.jpeg'
prompt_image_path = 'images/inputs/penguin_target.jpeg'
prompt_target_path = 'images/masks/penguin.png'
out_dir = 'output/'

# Every run writes into its own timestamped sub-directory of out_dir.
now = datetime.now().strftime('%Y%m%d_%H%M%S')
new_out_dir = os.path.join(out_dir, now)
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(new_out_dir, exist_ok=True)

# Result file is "output_<input stem>.png". splitext keeps the stem intact even
# when the file name has no extension (splitting on '.' produced an empty stem).
img_name = os.path.basename(input_image_path)
out_path = os.path.join(new_out_dir, "output_" + os.path.splitext(img_name)[0] + '.png')
print(out_path)

inference_image(model, device, input_image_path, prompt_image_path, prompt_target_path, out_path)
# Last expression of the cell: render the file at out_path inline.
# DisplayImage is IPython.display.Image, independent of the `Image` name clash.
DisplayImage(out_path)

output/20240613_063932/output_penguin_input.png


## 動画をセグメンテーション

プロンプト画像を用いてセグメンテーション

**TODO:** ここに処理の流れを示すイメージ画像を貼る。

In [4]:
input_video_path = 'images/inputs/duck.mp4'
# PILImage is PIL.Image; the bare name `Image` is bound to IPython.display.Image,
# which has neither open() nor fromarray().
prompt_image = PILImage.open('images/inputs/duck.jpeg').convert("RGB")
prompt_target = PILImage.open('images/masks/duck.png').convert("RGB")
out_dir = 'output/'

num_frames = 1 # number of prompt frames in video (not read anywhere in this cell)

# Every run writes into its own timestamped sub-directory of out_dir.
now = datetime.now().strftime('%Y%m%d_%H%M%S')
new_out_dir = os.path.join(out_dir, now)
os.makedirs(new_out_dir, exist_ok=True)

# Result file is "output_<input stem>.mp4".
img_name = os.path.basename(input_video_path)
out_path = os.path.join(new_out_dir, "output_" + os.path.splitext(img_name)[0] + '.mp4')
print(out_path)

cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    # Without this check a missing/unreadable video silently yields an empty output file.
    cap.release()
    raise OSError(f"Could not open input video: {input_video_path}")

# The writer mirrors the frame rate and frame size reported by the input video.
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height), True)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV delivers BGR; reverse the channel axis to get RGB for PIL.
        input_img = PILImage.fromarray(frame[:, :, ::-1]).convert('RGB')
        # Every frame is segmented against the same single prompt image / target pair.
        mask, output = inference_video_by_image(model, device, input_img, [prompt_image], [prompt_target], out_path)
        # Reverse the channel axis again before handing the frame to OpenCV.
        video_writer.write(np.ascontiguousarray(output.astype(np.uint8)[:, :, ::-1]))
finally:
    # Release both handles even when inference fails part-way through.
    cap.release()
    video_writer.release()


output/20240613_084539/output_duck.mp4


動画の初めのフレームをプロンプトとし、2 フレーム目以降は直前フレームとその推定マスクも追加のプロンプトとして与えてセグメンテーション

**TODO:** ここに処理の流れを示すイメージ画像を貼る。

In [3]:
# Video segmentation driven by a prompt image

input_video_path = 'images/inputs/knife.mp4'
# PILImage is PIL.Image; the bare name `Image` is bound to IPython.display.Image,
# which has neither open() nor fromarray().
prompt_image = PILImage.open('images/inputs/knife.png').convert("RGB")
prompt_target = PILImage.open('images/masks/knife.png').convert("RGB")
out_dir = 'output/'

# Every run writes into its own timestamped sub-directory of out_dir.
now = datetime.now().strftime('%Y%m%d_%H%M%S')
new_out_dir = os.path.join(out_dir, now)
os.makedirs(new_out_dir, exist_ok=True)

# Result file is "output_<input stem>.mp4".
img_name = os.path.basename(input_video_path)
out_path = os.path.join(new_out_dir, "output_" + os.path.splitext(img_name)[0] + '.mp4')
print(out_path)

cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    # Without this check a missing/unreadable video silently yields an empty output file.
    cap.release()
    raise OSError(f"Could not open input video: {input_video_path}")

# The writer mirrors the frame rate and frame size reported by the input video.
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height), True)

# Previous frame and the mask predicted for it; both stay None until the first
# frame has been processed.
pre_image = None
mask = None

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV delivers BGR; reverse the channel axis to get RGB for PIL.
        input_img = PILImage.fromarray(frame[:, :, ::-1]).convert('RGB')
        if pre_image is None:
            # First frame: only the user-supplied prompt pair is available.
            prompt_images, prompt_targets = [prompt_image], [prompt_target]
        else:
            # Later frames: add the previous frame and its predicted mask as a second prompt pair.
            prompt_images, prompt_targets = [prompt_image, pre_image], [prompt_target, mask]
        mask, output = inference_video_by_image(model, device, input_img, prompt_images, prompt_targets, out_path)
        pre_image = input_img
        # Reverse the channel axis again before handing the frame to OpenCV.
        video_writer.write(np.ascontiguousarray(output.astype(np.uint8)[:, :, ::-1]))
finally:
    # Release both handles even when inference fails part-way through.
    cap.release()
    video_writer.release()

output/20240613_082440/output_knife.mp4


## カラーマスクの使用

In [6]:
input_video_path = 'images/inputs/dogs.mp4'
# PILImage is PIL.Image; the bare name `Image` is bound to IPython.display.Image,
# which has neither open() nor fromarray().
prompt_image = PILImage.open('images/inputs/dogs.png').convert("RGB")
prompt_target = PILImage.open('images/masks/dogs.png').convert("RGB")
out_dir = 'output/'

# Every run writes into its own timestamped sub-directory of out_dir.
now = datetime.now().strftime('%Y%m%d_%H%M%S')
new_out_dir = os.path.join(out_dir, now)
os.makedirs(new_out_dir, exist_ok=True)

# Result file is "output_<input stem>.mp4".
img_name = os.path.basename(input_video_path)
out_path = os.path.join(new_out_dir, "output_" + os.path.splitext(img_name)[0] + '.mp4')
print(out_path)

cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    # Without this check a missing/unreadable video silently yields an empty output file.
    cap.release()
    raise OSError(f"Could not open input video: {input_video_path}")

# The writer mirrors the frame rate and frame size reported by the input video.
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height), True)

# Previous frame and the mask predicted for it; both stay None until the first
# frame has been processed.
pre_image = None
mask = None

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV delivers BGR; reverse the channel axis to get RGB for PIL.
        input_img = PILImage.fromarray(frame[:, :, ::-1]).convert('RGB')
        if pre_image is None:
            # First frame: only the user-supplied prompt pair is available.
            prompt_images, prompt_targets = [prompt_image], [prompt_target]
        else:
            # Later frames: add the previous frame and its predicted mask as a second prompt pair.
            prompt_images, prompt_targets = [prompt_image, pre_image], [prompt_target, mask]
        mask, output = inference_video_by_image(model, device, input_img, prompt_images, prompt_targets, out_path)
        pre_image = input_img
        # Reverse the channel axis again before handing the frame to OpenCV.
        video_writer.write(np.ascontiguousarray(output.astype(np.uint8)[:, :, ::-1]))
finally:
    # Release both handles even when inference fails part-way through.
    cap.release()
    video_writer.release()


output/20240613_071309/output_dog_min.mp4


In [4]:
from moviepy.editor import ImageSequenceClip

def create_video_from_images(image_folder, output_video_path, fps=24, extensions=(".png", ".jpg")):
    """Encode the image files of a folder, in sorted name order, into one video.

    Args:
        image_folder: Directory whose entries are scanned (not recursively).
        output_video_path: Destination passed to ``clip.write_videofile``.
        fps: Frame rate handed to ``ImageSequenceClip``.
        extensions: File-name suffix, or iterable of suffixes, that selects the
            frames. Matching is case-sensitive. The default keeps the original
            ``.png`` / ``.jpg`` filter.

    Raises:
        ValueError: if no file in ``image_folder`` matches ``extensions``.
    """
    # A lone string would otherwise be split into single characters by tuple().
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(extensions)

    # Collect the matching files and sort them by path.
    # The sort is plain lexicographic, so frame numbers in the file names must
    # be zero-padded for the frames to come out in numeric order.
    images = sorted(
        os.path.join(image_folder, name)
        for name in os.listdir(image_folder)
        if name.endswith(suffixes)
    )
    if not images:
        raise ValueError(
            f"No image files ending in {suffixes} found in {image_folder!r}"
        )

    # Build the clip from the ordered frames.
    clip = ImageSequenceClip(images, fps=fps)

    # Write the video file.
    clip.write_videofile(output_video_path, codec="libx264")

# Turn the frames stored under output/commandline/result into a single MP4.
image_folder = 'output/commandline/result'
output_video_path = 'output/commandline/dog_result.mp4'
create_video_from_images(
    image_folder=image_folder,
    output_video_path=output_video_path,
)


Moviepy - Building video output/commandline/dog_result.mp4.
Moviepy - Writing video output/commandline/dog_result.mp4



                                                              

Moviepy - Done !
Moviepy - video ready output/commandline/dog_result.mp4
