# 多媒体处理脚本 DEMO

## Align reference image

In [None]:
# reference image aligned
import sys
from src.utils.img_utils import pil_to_cv2, cv2_to_pil, center_crop_cv2, pils_from_video, save_videos_from_pils, save_video_from_cv2_list
from PIL import Image
import cv2
from IPython import embed
import numpy as np
import copy
from src.utils.motion_utils import motion_sync
import pathlib
import torch
import pickle
from glob import glob
import os
from src.models.dwpose.dwpose_detector import dwpose_detector as dwprocessor
from src.models.dwpose.util import draw_pose
import decord
from tqdm import tqdm
from moviepy.editor import AudioFileClip, VideoFileClip
from multiprocessing.pool import ThreadPool

##################################
# Batch-window settings: `end` is derived as `start + process_num`.
# In the visible code these values are only echoed in the final status print.
process_num = 100 #1266  (NOTE(review): 1266 is presumably the full item count — confirm)

start = 0
end = process_num + start
#################################
# Size handed to the pose-parameter and image-alignment helpers;
# get_img_pose also upscales images whose short side is below this value.
MAX_SIZE = 768

def convert_fps(src_path, tgt_path, tgt_fps=24, tgt_sr=16000):
    """Re-encode a video at a target frame rate and audio sample rate.

    Args:
        src_path: Path of the source video.
        tgt_path: Destination path. Any ``.mov`` substring is rewritten to
            ``.mp4`` before writing.
        tgt_fps: Frame rate set on the clip before writing.
        tgt_sr: Sample rate set on the audio track, when the clip has one.

    The output is written with the ``libx264`` video codec and ``aac`` audio
    codec. The source clip is closed even if encoding fails.
    """
    clip = VideoFileClip(src_path)
    try:
        new_clip = clip.set_fps(tgt_fps)
        audio = new_clip.audio
        # Silent videos expose ``audio`` as None; resampling it would raise,
        # so only touch the track when there is one and a rate was requested.
        if tgt_fps is not None and tgt_sr is not None and audio is not None:
            new_clip = new_clip.set_audio(audio.set_fps(tgt_sr))
        if '.mov' in tgt_path:
            tgt_path = tgt_path.replace('.mov', '.mp4')
        new_clip.write_videofile(tgt_path, codec='libx264', audio_codec='aac')
    finally:
        # Release the underlying reader handles once writing is done.
        clip.close()
    
def get_video_pose(
        video_path: str,
        sample_stride: int=1,
        max_frame=None):
    """Run the pose detector on sampled frames of a video.

    The stride is multiplied by ``int(avg_fps / 24)`` (at least 1) before
    sampling, so higher-frame-rate inputs are sampled more sparsely.

    Args:
        video_path: Path of the video to read.
        sample_stride: Base step between sampled frame indices.
        max_frame: If not None, keep only the first ``max_frame`` sampled
            frames.

    Returns:
        ``(detected_poses, height, width, frames)``: one detector result per
        sampled frame, the frame height and width, and the sampled frames as
        a numpy array.
    """
    # read input video
    vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
    sample_stride *= max(1, int(vr.get_avg_fps() / 24))

    frame_indices = list(range(0, len(vr), sample_stride))
    if max_frame is not None:
        # Trim the index list first so frames that would be discarded are
        # never decoded or held in memory.
        frame_indices = frame_indices[:max_frame]
    frames = vr.get_batch(frame_indices).asnumpy()
    height, width, _ = frames[0].shape
    try:
        detected_poses = [dwprocessor(frm) for frm in frames]
    finally:
        # Release detector memory even when detection fails part-way.
        dwprocessor.release_memory()

    return detected_poses, height, width, frames

def save_pose_params_item(input_items):
    """Normalize one frame's keypoints and write the pose dict to ``<num>.npy``.

    Args:
        input_items: ``(detected_pose, pose_params, draw_pose_params,
            save_dir)``. ``pose_params`` unpacks as
            ``(w_min, w_max, h_min, h_max)``.

    The x column of the body, first-face and hand keypoints is mapped with
    ``(x - w_min) / (w_max - w_min)`` and the y column with the analogous
    height bounds. The arrays inside ``detected_pose`` are modified in place,
    ``faces`` is replaced by the first face with a leading axis of length 1,
    and ``draw_pose_params`` is stored on the dict before saving.
    """
    detected_pose, pose_params, draw_pose_params, save_dir = input_items
    w_min, w_max, h_min, h_max = pose_params
    frame_id = detected_pose['num']

    body_pts = detected_pose['bodies']['candidate']
    face_pts = detected_pose['faces'][0]
    hand_pts = detected_pose['hands']

    # Each entry pairs an array with the leading slices that precede the
    # coordinate axis: body/face are indexed [:, c], hands [:, :, c].
    every = slice(None)
    keypoint_sets = (
        (body_pts, (every,)),
        (face_pts, (every,)),
        (hand_pts, (every, every)),
    )
    axis_bounds = ((0, w_min, w_max), (1, h_min, h_max))
    for pts, leading in keypoint_sets:
        for channel, lower, upper in axis_bounds:
            selector = leading + (channel,)
            pts[selector] = (pts[selector] - lower) / (upper - lower)

    detected_pose['bodies']['candidate'] = body_pts
    detected_pose['faces'] = face_pts.reshape(1, face_pts.shape[0], face_pts.shape[1])
    detected_pose['hands'] = hand_pts
    detected_pose['draw_pose_params'] = draw_pose_params

    out_file = ''.join((save_dir, '/', str(frame_id), '.npy'))
    np.save(out_file, detected_pose)

def save_pose_params(detected_poses, pose_params, draw_pose_params, ori_video_path):
    """Save every frame's pose to a directory derived from the video path.

    Args:
        detected_poses: Iterable of per-frame pose dicts.
        pose_params: Normalization bounds forwarded to each save task.
        draw_pose_params: Drawing parameters forwarded to each save task.
        ori_video_path: Source video path; the output directory is this path
            with ``'video'`` replaced by ``'pose/'``.

    Returns:
        The output directory path.
    """
    # NOTE: str.replace rewrites every 'video' occurrence in the path.
    save_dir = ori_video_path.replace('video', 'pose/')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    input_list = [
        [detected_pose, pose_params, draw_pose_params, save_dir]
        for detected_pose in detected_poses
    ]

    pool = ThreadPool(8)
    try:
        pool.map(save_pose_params_item, input_list)
    finally:
        # Always shut the workers down; otherwise a failing save would leave
        # the pool's threads alive.
        pool.close()
        pool.join()
    return save_dir
from torchvision.transforms import functional as F
def get_img_pose(
        img_path: str,
        sample_stride: int=1,
        max_frame=None):
    """Load an image, upscale it if small, and run the pose detector on it.

    Images whose short side is below ``MAX_SIZE`` are enlarged so the short
    side reaches ``MAX_SIZE``; larger images keep their size (ratio is
    clamped to at least 1.0).

    Args:
        img_path: Path of the image to read.
        sample_stride: Unused in this function.
        max_frame: Unused in this function.

    Returns:
        ``(detected_poses, height, width, frame)``: a one-element list with
        the detector result, the (possibly resized) frame height and width,
        and the frame itself.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # read input img
    frame = cv2.imread(img_path)
    if frame is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"cv2.imread could not read image: {img_path!r}")
    height, width, _ = frame.shape
    short_size = min(height, width)
    resize_ratio = max(MAX_SIZE / short_size, 1.0)
    frame = cv2.resize(frame, (int(resize_ratio * width), int(resize_ratio * height)))
    height, width, _ = frame.shape
    detected_poses = [dwprocessor(frame)]
    dwprocessor.release_memory()

    return detected_poses, height, width, frame

def save_aligned_img(ori_frame, video_params, max_size):
    """Crop the frame, resize/pad it, and write it as the aligned reference.

    Args:
        ori_frame: Image array indexed as ``[row, column, channel]``.
        video_params: ``(h_min, h_max, w_min, w_max)`` crop bounds.
        max_size: Forwarded to ``resize_and_pad`` as ``max_size``.

    Returns:
        Path of the written file, ``./assets/refimg_aligned/aligned.png``.
    """
    top, bottom, left, right = video_params
    cropped = ori_frame[top:bottom, left:right, :]
    aligned = resize_and_pad(cropped, max_size=max_size)
    print('aligned img shape:', aligned.shape)

    out_dir = './assets/refimg_aligned'
    os.makedirs(out_dir, exist_ok=True)

    out_path = os.path.join(out_dir, 'aligned.png')
    cv2.imwrite(out_path, aligned)
    return out_path

# Detect the pose in the reference image; also yields the (possibly upscaled) frame and its size.
# NOTE(review): `refimg_path` and `get_pose_params` are not defined in the visible part of
# this file — presumably provided by another cell or module; confirm before running.
detected_poses, height, width, ori_frame = get_img_pose(refimg_path, max_frame=None)
# Derive the parameter dict from the detected pose and the target size.
res_params = get_pose_params(detected_poses, MAX_SIZE)
# Crop/resize the frame using the 'video_params' entry and keep the saved file's path.
refimg_aligned_path = save_aligned_img(ori_frame, res_params['video_params'], MAX_SIZE)

## Extract pose from driving video

In [None]:
# For every mp4 under <video_dir><video_name>/: re-encode to 24 fps into a
# sibling "<video_name>_24fps" directory, detect poses on the re-encoded
# video, and save the per-frame pose files. A failure on one video is
# reported and the loop moves on to the next file.
if using_video_driving:
    base_dir = video_dir
    tasks = [video_name]
    visualization = False
    for sub_task in tasks:
        ori_list = os.listdir(base_dir+sub_task)
        new_dir = base_dir + sub_task+'_24fps'
        os.makedirs(new_dir, exist_ok=True)
        index = 1
        for mp4_file in ori_list:
            ori_video_path = base_dir + sub_task+'/'+mp4_file
            if not ori_video_path.endswith('mp4'):
                continue
            try:
                # convert to 24fps
                # NOTE: str.replace rewrites every occurrence of sub_task in the path.
                ori_video_path_new = ori_video_path.replace(sub_task, sub_task+'_24fps')
                # str.replace returns a new string, so the result must be
                # assigned. '.mov' is rewritten here too because convert_fps
                # does the same to its target path; otherwise the file it
                # writes and the file read back below could differ.
                for mov_ext in ('.MOV', '.mov'):
                    ori_video_path_new = ori_video_path_new.replace(mov_ext, '.mp4')
                convert_fps(ori_video_path, ori_video_path_new)
                # extract pose
                detected_poses, height, width, ori_frames = get_video_pose(ori_video_path_new, max_frame=None)
                # parameterize pose
                res_params = get_pose_params(detected_poses, MAX_SIZE)
                # save pose to npy
                pose_path = save_pose_params(detected_poses, res_params['pose_params'], res_params['draw_pose_params'], ori_video_path)

                index += 1
            except Exception as exc:
                # Catch Exception (not a bare except) so Ctrl-C still stops
                # the run, and report which file failed and why.
                print("extract crash!", ori_video_path, repr(exc))

        print(["All Finished", sub_task, start, end])



## Imports

In [None]:
import os
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
from diffusers import AutoencoderKL, DDIMScheduler
from einops import repeat
from omegaconf import OmegaConf
from PIL import Image
import sys

from decord import VideoReader
from moviepy.editor import VideoFileClip, AudioFileClip

# Point FFMPEG_PATH at the bundled static build and make sure that directory
# is one of the PATH entries.
os.environ['FFMPEG_PATH'] = './ffmpeg-4.4-amd64-static'
ffmpeg_path = os.getenv('FFMPEG_PATH')

if ffmpeg_path is None:
    # NOTE(review): unreachable while FFMPEG_PATH is assigned unconditionally above.
    print("please download ffmpeg-static and export to FFMPEG_PATH. \nFor example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static")
elif ffmpeg_path not in os.environ.get('PATH', '').split(os.pathsep):
    # Compare against whole PATH entries (not substrings) and tolerate an
    # unset PATH; os.pathsep keeps the separator correct on every platform.
    print("add ffmpeg to path")
    _current_path = os.environ.get('PATH', '')
    os.environ["PATH"] = (
        f"{ffmpeg_path}{os.pathsep}{_current_path}" if _current_path else ffmpeg_path
    )


## argparse使用

In [2]:
import argparse


# Command-line options as (flag, type, default) rows, registered in order.
_ARG_SPECS = (
    ("--config", str, "./configs/prompts/infer.yaml"),
    ("-W", int, 768),
    ("-H", int, 768),
    ("-L", int, 240),
    ("--seed", int, 3407),
    ("--context_frames", int, 12),
    ("--context_overlap", int, 3),
    ("--cfg", float, 2.5),
    ("--steps", int, 30),
    ("--sample_rate", int, 16000),
    ("--fps", int, 24),
    ("--device", str, "cuda"),
    ("--ref_images_dir", str, './assets/halfbody_demo/refimag'),
    ("--pose_dir", str, None),
    ("--refimg_name", str, 'natural_bk_openhand/0035.png'),
    ("--pose_name", str, "01"),
    ("--video_dir", str, "./assets/halfbody_demo/video"),
)

parser = argparse.ArgumentParser()
for _flag, _type, _default in _ARG_SPECS:
    parser.add_argument(_flag, type=_type, default=_default)

# parse_known_args ignores arguments it does not recognise instead of exiting.
args, _ = parser.parse_known_args()
print(args)

Namespace(config='./configs/prompts/infer.yaml', W=768, H=768, L=240, seed=3407, context_frames=12, context_overlap=3, cfg=2.5, steps=30, sample_rate=16000, fps=24, device='cuda', ref_images_dir='./assets/halfbody_demo/refimag', pose_dir=None, refimg_name='natural_bk_openhand/0035.png', pose_name='01', video_dir='./assets/halfbody_demo/video')


## Animating half-body human video

## 图片、视频、音频展示

In [None]:
from IPython.display import display, Video
# 播放视频：
# display(Video(filename=save_name + "_sig.mp4"))


# # # 展示图片：
from IPython.display import Image as DisImage
# display(DisImage(filename="path/to/image.jpg"))  # use DisImage: `Image` in this file is PIL.Image

# ref_img = './zlm-digital-person.git/data/frame/liveV_frames-0-at-3.920.jpg'
# display(Image(filename=ref_img))
# clip = VideoFileClip('zlm-digital-person.git/data/video/output_video_img_0_text_0.mp4') # v1输出的音频驱动人头视频
# display(Video(filename='zlm-digital-person.git/data/video/output_video_img_0_text_0.mp4'))
# for fi in clip.iter_frames():
#     print('sub frame1: ')
#     display(Image(fi))
#     break # test 

# # 播放 音频文件： 
# from IPython.display import *     
# Audio(audio_path, autoplay=True)

# # 播放视频文件的方式2： 
# from IPython.display import HTML
# def play_video(video_path):
#     # 构建 HTML5 视频标签
#     video_html = f"""
#     <video width="600" controls>
#         <source src="{video_path}" type="video/mp4">
#         您的浏览器不支持视频标签
#     </video>
#     """
#     display(HTML(video_html))

# # 示例：播放当前目录下的 sample.mp4
# play_video("outputs/aligned-a-echomimicv2_woman-i0_sig.mp4")
