In [1]:
import cv2
import torch
import time
import os

from utils.inference.image_processing import crop_face
from utils.inference.video_processing import read_video, get_final_video_frame, add_audio_from_another_video, face_enhancement, get_final_video_multi
from utils.inference.core import model_inference, model_inference_multi

from network.AEI_Net import AEI_Net
from coordinate_reg.image_infer import Handler
from insightface_func.face_detect_crop_multi import Face_detect_crop
from arcface_model.iresnet import iresnet100
from models.pix2pix_model import Pix2PixModel
from models.config_sr import TestOptions



### Load Models

In [2]:
# Face detector / cropper used to locate faces in the source image and video frames.
app = Face_detect_crop(name='antelope', root='./insightface_func/models')
app.prepare(ctx_id=0, det_thresh=0.6, det_size=(640, 640))

# Main generator model. Weights are deserialized onto the CPU first and only
# then moved to the GPU and cast to half precision.
G = AEI_Net(c_id=512)
G.eval()
G.load_state_dict(torch.load('weights/G_0_035000_init_arch_arcface2.pth', map_location=torch.device('cpu')))
G = G.cuda()
G = G.half()

# ArcFace model used to obtain face embeddings.
# The checkpoint is loaded with map_location='cpu' (same as the generator above)
# so that it deserializes regardless of the device it was saved from.
netArc = iresnet100(fp16=False)
netArc.load_state_dict(torch.load('arcface_model/backbone.pth', map_location=torch.device('cpu')))
netArc = netArc.cuda()
netArc.eval()

# Model used to obtain face landmarks.
handler = Handler('./coordinate_reg/model/2d106det', 0, ctx_id=0, det_size=640)

# Optional face super-resolution model.
# Set use_sr=True to enable super resolution, or use_sr=False to skip it.
use_sr = False

if use_sr:
    # NOTE(review): CUDA has already been used by the .cuda() calls above, so this
    # assignment probably has no effect on device visibility in this process -- confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    torch.backends.cudnn.benchmark = True
    opt = TestOptions()
    #opt.which_epoch ='10_7'
    model = Pix2PixModel(opt)
    # NOTE(review): the generator is put in train() mode rather than eval() even
    # though this notebook only runs inference -- confirm this is intended.
    model.netG.train()

  and should_run_async(code)


input mean and std: 127.5 127.5
find model: ./insightface_func/models/antelope/glintr100.onnx recognition
find model: ./insightface_func/models/antelope/scrfd_10g_bnkps.onnx detection
set det-size: (640, 640)
loading ./coordinate_reg/model/2d106det 0
input mean and std: 127.5 127.5
find model: ./insightface_func/models/antelope/glintr100.onnx recognition
find model: ./insightface_func/models/antelope/scrfd_10g_bnkps.onnx detection
set det-size: (640, 640)


### Set the paths to the source image and the target video for the face swap

In [14]:
"""
choose fairly short videos, because long ones can take a lot of time to process
choose the source image as a photo -- preferably a selfie of the person
"""

# Input video, source face image, and output location for the result video.
path_to_video = 'examples/videos/video5_mod.mp4'
path_to_source_image = 'examples/images/elon_musk.jpg'
OUT_VIDEO_NAME = "examples/results/testing_multi.mp4"
crop_size = 224 # don't change this

# cv2.imread does not raise when the file is missing or unreadable; it returns
# None instead. Fail here with a clear message rather than in a later cell.
source_full = cv2.imread(path_to_source_image)
if source_full is None:
    raise FileNotFoundError(f"Could not read source image: {path_to_source_image}")

  and should_run_async(code)


In [15]:
# Verify that a face can be detected in every source image and collect the
# cropped faces in `source`.
source_images = [source_full]
source = []

try:
    for source_image in source_images:
        # Only the first element of crop_face's result is kept.
        # NOTE(review): presumably crop_face returns None when no face is found,
        # which makes the indexing raise TypeError -- confirm against the helper.
        source_crops = crop_face(source_image, app, crop_size)
        source.append(source_crops[0])
except TypeError:
    print("Bad source images")

  and should_run_async(code)


In [16]:
# Load all frames of the input video together with its frame rate.
full_frames, fps = read_video(path_to_video)

# The first frame of the video is used as the only target reference image.
target1 = full_frames[0]
target_images = [target1]
target = []

try:
    for target_image in target_images:
        # Only the first element of crop_face's result is kept.
        # NOTE(review): presumably crop_face returns None when no face is found,
        # which makes the indexing raise TypeError -- confirm against the helper.
        target_crops = crop_face(target_image, app, crop_size)
        target.append(target_crops[0])
except TypeError:
    print("Bad target images")

  and should_run_async(code)


### Model Inference

In [17]:
# Start the wall-clock timer that the final cell reads to report the elapsed time.
# The cells above (model loading, reading the video, cropping the reference faces)
# have already run at this point, so their duration is not included.
START_TIME = time.time()

  and should_run_async(code)


In [18]:
# Run the multi-target inference helper over the video frames, using the cropped
# source and target faces prepared above. Note that `full_frames` is rebound to
# the value returned by the helper.
(
    final_frames_list,
    crop_frames_list,
    full_frames,
    tfm_array_list,
) = model_inference_multi(
    full_frames,
    source,
    target,
    netArc,
    G,
    app,
    crop_size=crop_size,
)

  and should_run_async(code)
100%|██████████| 379/379 [00:12<00:00, 31.45it/s]
379it [00:00, 4462.55it/s]
100%|██████████| 7/7 [00:01<00:00,  3.84it/s]
100%|██████████| 379/379 [00:00<00:00, 906294.88it/s]


In [19]:
# final_frames, crop_frames, full_frames, tfm_array = model_inference(full_frames,
#                                                                     source,
#                                                                     [netArc],
#                                                                     G,
#                                                                     app, 
#                                                                     crop_size=crop_size)

  and should_run_async(code)


In [20]:
# if use_sr:
#     final_frames = face_enhancement(final_frames, model)

  and should_run_async(code)


In [21]:
# Build the result video from the inference outputs; OUT_VIDEO_NAME is passed as
# the output path and `handler` is the landmark model loaded earlier.
get_final_video_multi(
    final_frames_list,
    crop_frames_list,
    full_frames,
    tfm_array_list,
    OUT_VIDEO_NAME,
    fps,
    handler,
)

  and should_run_async(code)
100%|██████████| 379/379 [00:06<00:00, 62.64it/s]


In [22]:
# get_final_video_frame(final_frames,
#                       crop_frames,
#                       full_frames,
#                       tfm_array,
#                       OUT_VIDEO_NAME,
#                       fps, 
#                       handler)

  and should_run_async(code)


In [23]:
# Going by the helper's name, this adds the audio of the input video (path_to_video)
# to the generated video (OUT_VIDEO_NAME).
# NOTE(review): the meaning of the third argument ("audio") is not visible here --
# confirm against the helper's definition.
add_audio_from_another_video(path_to_video, OUT_VIDEO_NAME, "audio")

  and should_run_async(code)


In [24]:
# Report the time elapsed since START_TIME was set and where the video was written.
elapsed_seconds = time.time() - START_TIME
print('Full pipeline took', elapsed_seconds)
print("Video saved with path", OUT_VIDEO_NAME)

Full pipeline took 23.902369499206543
Video saved with path examples/results/testing_multi.mp4


  and should_run_async(code)
