Skip to content

Commit

Permalink
Merge pull request #8 from yzhou359/main
Browse files Browse the repository at this point in the history
Fix bugs for cartoon animation + wine solution for linux
  • Loading branch information
yzhou359 committed Feb 23, 2021
2 parents 0a8d780 + 3600eb3 commit 5d367e9
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 66 deletions.
21 changes: 18 additions & 3 deletions README.md
Expand Up @@ -50,6 +50,15 @@ sudo apt-get install ffmpeg
```
pip install -r requirements.txt
```
- `winehq-stable` for cartoon face warping in Ubuntu (https://wiki.winehq.org/Ubuntu). Tested on Ubuntu16.04, wine==5.0.3.
```
sudo dpkg --add-architecture i386
wget -nc https://dl.winehq.org/wine-builds/winehq.key
sudo apt-key add winehq.key
sudo apt-add-repository 'deb https://dl.winehq.org/wine-builds/ubuntu/ xenial main'
sudo apt update
sudo apt install --install-recommends winehq-stable
```

## Pre-trained Models

Expand Down Expand Up @@ -96,9 +105,14 @@ to amply lip motion (in x/y-axis direction) and head motion displacements, defau
| Image | ![img](examples_cartoon/wilk_fullbody.jpg) | ![img](examples_cartoon/roy_full.png) | ![img](examples_cartoon/sketch.png) | ![img](examples_cartoon/color.jpg) | ![img](examples_cartoon/cartoonM.png) | ![img](examples_cartoon/danbooru1.jpg) |

```
python main_end2end_cartoon.py --jpg <cartoon_puppet_name>
python main_end2end_cartoon.py --jpg <cartoon_puppet_name_with_extension> --jpg_bg <puppet_background_with_extension>
```

- `--jpg_bg` takes an image of the same size to use as the fixed background of the animation, such as the puppet's body. If you want to use a background, make sure the puppet face image (i.e. the `--jpg` image) is in `png` format and is transparent in the non-face area. If you don't need any background, pass a placeholder image of the same size (e.g. a pure white image) to fill the argument.

- use additional args `--amp_lip_x <x> --amp_lip_y <y> --amp_pos <pos>`
to amplify lip motion (in the x/y-axis directions) and head motion displacements; default values are `<x>=2., <y>=2., <pos>=.5`

- create your own puppets (ToDo...)

## Train
Expand Down Expand Up @@ -130,8 +144,9 @@ Todo...

We would like to thank Timothy Langlois for the narration, and
[Kaizhi Qian](https://scholar.google.com/citations?user=uEpr4C4AAAAJ&hl=en)
for the help with the [voice conversion module](https://auspicious3000.github.io/icassp-2020-demo/). We
thank Daichi Ito for sharing the caricature image and Dave Werner
for the help with the [voice conversion module](https://auspicious3000.github.io/icassp-2020-demo/).
We thank [Jakub Fiser](https://research.adobe.com/person/jakub-fiser/) for implementing the real-time GPU version of the triangle morphing algorithm.
We thank Daichi Ito for sharing the caricature image and Dave Werner
for Wilk, the gruff but ultimately lovable puppet.

This research is partially funded by NSF (EAGER-1942069)
Expand Down
Binary file added examples_cartoon/wilk_bg.jpg
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
88 changes: 57 additions & 31 deletions main_end2end_cartoon.py
Expand Up @@ -21,10 +21,12 @@
GEN_AUDIO = True
GEN_FLS = True

DEMO_CH = 'danbooru1'
DEMO_CH = 'wilk.png'

parser = argparse.ArgumentParser()
parser.add_argument('--jpg', type=str, default=DEMO_CH)
parser.add_argument('--jpg', type=str, required=True, help='Puppet image name to animate (with filename extension), e.g. wilk.png')
parser.add_argument('--jpg_bg', type=str, required=True, help='Puppet image background (with filename extension), e.g. wilk_bg.jpg')
parser.add_argument('--out', type=str, default='out.mp4')

parser.add_argument('--load_AUTOVC_name', type=str, default='examples/ckpt/ckpt_autovc.pth')
parser.add_argument('--load_a2l_G_name', type=str, default='examples/ckpt/ckpt_speaker_branch.pth') #ckpt_audio2landmark_g.pth') #
Expand All @@ -33,8 +35,8 @@

parser.add_argument('--amp_lip_x', type=float, default=2.0)
parser.add_argument('--amp_lip_y', type=float, default=2.0)
parser.add_argument('--amp_pos', type=float, default=0.8)
parser.add_argument('--reuse_train_emb_list', default=['45hn7-LXDX8']) # ['E_kmpT-EfOg']) # ['E_kmpT-EfOg']) # ['45hn7-LXDX8'])
parser.add_argument('--amp_pos', type=float, default=0.5)
parser.add_argument('--reuse_train_emb_list', type=str, nargs='+', default=[]) # ['E_kmpT-EfOg']) # ['E_kmpT-EfOg']) # ['45hn7-LXDX8'])


parser.add_argument('--add_audio_in', default=False, action='store_true')
Expand All @@ -61,17 +63,25 @@

opt_parser = parser.parse_args()

DEMO_CH = opt_parser.jpg
DEMO_CH = opt_parser.jpg.split('.')[0]

shape_3d = np.loadtxt('examples_cartoon/{}_face_close_mouth.txt'.format(opt_parser.jpg))
shape_3d = np.loadtxt('examples_cartoon/{}_face_close_mouth.txt'.format(DEMO_CH))

''' STEP 3: Generate audio data as input to audio branch '''
# Accumulators filled while iterating over the input wav files below.
au_data = []   # per-file AutoVC input features
au_emb = []    # per-file speaker embeddings

# Collect every wav under examples/, excluding the temporary resample target
# written by the ffmpeg step ('tmp.wav').
# BUG FIX: the original filter used `item is not 'tmp.wav'` — an identity
# comparison against a string literal, which is implementation-dependent and
# does not reliably exclude the file; equality (`!=`) is the correct test.
ains = glob.glob1('examples', '*.wav')
ains = [item for item in ains if item != 'tmp.wav']
ains.sort()
for ain in ains:
os.system('ffmpeg -y -loglevel error -i examples/{} -ar 16000 examples/tmp.wav'.format(ain))
shutil.copyfile('examples/tmp.wav', 'examples/{}'.format(ain))

# au embedding
from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
me, ae = get_spk_emb('examples/{}'.format(ain))
au_emb.append(me.reshape(-1))

print('Processing audio file', ain)
c = AutoVC_mel_Convertor('examples')
au_data_i = c.convert_single_wav_to_autovc_input(audio_filename=os.path.join('examples', ain),
Expand Down Expand Up @@ -112,7 +122,10 @@
''' STEP 4: RUN audio->landmark network '''
from src.approaches.train_audio2landmark import Audio2landmark_model

# Build the audio-to-landmark model seeded with the puppet's closed-mouth
# face shape loaded above.
model = Audio2landmark_model(opt_parser, jpg_shape=shape_3d)

# BUG FIX: a stale unconditional `model.test()` call (left over from the
# pre-conditional version of this script) ran the network a second time
# without speaker embeddings; only the conditional call below is intended.
# When --reuse_train_emb_list is empty, feed the embeddings extracted from
# the input wavs; otherwise the model falls back to its stored embeddings.
if len(opt_parser.reuse_train_emb_list) == 0:
    model.test(au_emb=au_emb)
else:
    model.test(au_emb=None)
print('finish gen fls')

''' STEP 5: de-normalize the output to the original image scale '''
Expand Down Expand Up @@ -176,27 +189,40 @@

os.remove(os.path.join('examples_cartoon', fls_names[i]))

# # ==============================================
# # Step 4 : Vector art morphing (only work in WINDOWS)
# # ==============================================
# warp_exe = os.path.join(os.getcwd(), 'facewarp', 'facewarp.exe')
# import os
#
# if (os.path.exists(os.path.join(output_dir, 'output'))):
# shutil.rmtree(os.path.join(output_dir, 'output'))
# os.mkdir(os.path.join(output_dir, 'output'))
# os.chdir('{}'.format(os.path.join(output_dir, 'output')))
# print(os.getcwd())
#
# os.system('{} {} {} {} {} {}'.format(
# warp_exe,
# os.path.join('examples_cartoon', DEMO_CH+'.png'),
# os.path.join(output_dir, 'triangulation.txt'),
# os.path.join(output_dir, 'reference_points.txt'),
# os.path.join(output_dir, 'warped_points.txt'),
# # os.path.join(ROOT_DIR, 'puppets', sys.argv[6]),
# '-novsync -dump'))
# os.system('ffmpeg -y -r 62.5 -f image2 -i "%06d.tga" -i {} -shortest {}'.format(
# ain,
# os.path.join(output_dir, sys.argv[8])
# ))
# ==============================================
# Step 4 : Vector art morphing
# ==============================================
# Warp the puppet face image frame-by-frame with the facewarp binary,
# then mux the dumped frames with the audio into the final video.
# NOTE: `os` is already imported at the top of the file; the redundant
# mid-file `import os` has been removed.
warp_exe = os.path.join(os.getcwd(), 'facewarp', 'facewarp.exe')

# Recreate a clean output/ directory for the dumped frames and cd into it
# (facewarp writes its .tga frames into the current working directory).
if os.path.exists(os.path.join(output_dir, 'output')):
    shutil.rmtree(os.path.join(output_dir, 'output'))
os.mkdir(os.path.join(output_dir, 'output'))
os.chdir(os.path.join(output_dir, 'output'))
cur_dir = os.getcwd()
print(cur_dir)

# The Windows and Linux invocations are identical except that on Linux the
# .exe is run through wine (see README for the winehq setup).
facewarp_cmd = '{} {} {} {} {} {}'.format(
    warp_exe,
    os.path.join(cur_dir, '..', '..', opt_parser.jpg),
    os.path.join(cur_dir, '..', 'triangulation.txt'),
    os.path.join(cur_dir, '..', 'reference_points.txt'),
    os.path.join(cur_dir, '..', 'warped_points.txt'),
    os.path.join(cur_dir, '..', '..', opt_parser.jpg_bg),
    '-novsync -dump')
if os.name != 'nt':
    ''' linux '''
    facewarp_cmd = 'wine ' + facewarp_cmd
os.system(facewarp_cmd)

# Assemble the dumped frames (62.5 fps) with the audio track.
# NOTE(review): `ain` here is whatever wav the earlier loop processed last —
# presumably there is exactly one input wav per run; confirm with callers.
os.system('ffmpeg -y -r 62.5 -f image2 -i "%06d.tga" -i {} -pix_fmt yuv420p -vf "pad=ceil(iw/2)*2:ceil(ih/2)*2" -shortest -strict -2 {}'.format(
    os.path.join(cur_dir, '..', '..', '..', 'examples', ain),
    os.path.join(cur_dir, '..', 'out.mp4')
))
43 changes: 22 additions & 21 deletions quick_demo.ipynb
Expand Up @@ -6,7 +6,8 @@
"name": "quick_demo.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyPYmqKJqHGxAbsAVY62zkIy",
"toc_visible": true,
"authorship_tag": "ABX9TyOYW4P15IPg+x69aFu7awQb",
"include_colab_link": true
},
"kernelspec": {
Expand Down Expand Up @@ -53,7 +54,7 @@
"cell_type": "code",
"metadata": {
"id": "yB-ixde4R3nO",
"outputId": "ec2f71e3-66c3-4af9-cd6c-6dce38958f3c",
"outputId": "3014143b-2a49-439a-ce4a-54e9aa9589e7",
"colab": {
"base_uri": "https://localhost:8080/"
}
Expand All @@ -68,15 +69,15 @@
{
"output_type": "stream",
"text": [
"Sat Nov 7 02:37:34 2020 \n",
"Tue Nov 10 19:18:06 2020 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 58C P8 11W / 70W | 0MiB / 15079MiB | 0% Default |\n",
"| 0 Tesla P4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 40C P8 7W / 75W | 0MiB / 7611MiB | 0% Default |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
Expand Down Expand Up @@ -111,7 +112,7 @@
"source": [
"print(subprocess.getoutput('ffmpeg'))"
],
"execution_count": 2,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -158,7 +159,7 @@
"source": [
"!git clone https://github.com/yzhou359/MakeItTalk"
],
"execution_count": 3,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -193,7 +194,7 @@
"!pip install -r requirements.txt\n",
"!pip install tensorboardX"
],
"execution_count": 4,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -279,7 +280,7 @@
"!gdown -O examples/ckpt/ckpt_116_i2i_comb.pth https://drive.google.com/uc?id=1i2LJXKp-yWKIEEgJ7C6cE3_2NirfY_0a\n",
"!gdown -O examples/dump/emb.pickle https://drive.google.com/uc?id=18-0CYl5E6ungS3H4rRSHjfYvvm-WwjTI"
],
"execution_count": 5,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -348,14 +349,14 @@
"import torch\n",
"import pickle\n",
"import face_alignment\n",
"from thirdparty.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor\n",
"from src.autovc.AutoVC_mel_Convertor_retrain_version import AutoVC_mel_Convertor\n",
"import shutil\n",
"import time\n",
"import util.utils as util\n",
"from scipy.signal import savgol_filter\n",
"from src.approaches.train_audio2landmark import Audio2landmark_model"
],
"execution_count": 6,
"execution_count": null,
"outputs": []
},
{
Expand All @@ -380,7 +381,7 @@
"AMP_LIP_SHAPE_Y = 2. # amplify the lip motion in vertical direction\n",
"AMP_HEAD_POSE_MOTION = 0.7 # amplify the head pose motion (usually smaller than 1.0, put it to 0. for a static head pose)"
],
"execution_count": 63,
"execution_count": null,
"outputs": []
},
{
Expand Down Expand Up @@ -435,7 +436,7 @@
"\n",
"opt_parser = parser.parse_args()"
],
"execution_count": 64,
"execution_count": null,
"outputs": []
},
{
Expand Down Expand Up @@ -464,7 +465,7 @@
"if(opt_parser.close_input_face_mouth):\n",
" util.close_input_face_mouth(shape_3d)"
],
"execution_count": 65,
"execution_count": null,
"outputs": []
},
{
Expand All @@ -491,7 +492,7 @@
"shape_3d[[37,38,43,44], 1] -=2. # larger eyes\n",
"shape_3d[[40,41,46,47], 1] +=2. # larger eyes"
],
"execution_count": 66,
"execution_count": null,
"outputs": []
},
{
Expand All @@ -511,7 +512,7 @@
"source": [
"shape_3d, scale, shift = util.norm_input_face(shape_3d)"
],
"execution_count": 67,
"execution_count": null,
"outputs": []
},
{
Expand Down Expand Up @@ -584,7 +585,7 @@
" gaze = {'rot_trans':rot_tran, 'rot_quat':rot_quat, 'anchor_t_shape':anchor_t_shape}\n",
" pickle.dump(gaze, fp)"
],
"execution_count": 68,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -626,7 +627,7 @@
"else:\n",
" model.test(au_emb=None)"
],
"execution_count": 69,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -770,7 +771,7 @@
" print('finish image2image gen')\n",
" os.remove(os.path.join('examples', fls[i]))"
],
"execution_count": 70,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -822,7 +823,7 @@
" </video>\n",
" \"\"\" % data_url))"
],
"execution_count": 71,
"execution_count": null,
"outputs": [
{
"output_type": "stream",
Expand Down Expand Up @@ -859,7 +860,7 @@
"source": [
""
],
"execution_count": 71,
"execution_count": null,
"outputs": []
}
]
Expand Down

0 comments on commit 5d367e9

Please sign in to comment.