<a href="https://colab.research.google.com/github/antoniomuso/speech2face/blob/master/Speech2Face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount the user's Google Drive under /content/drive; the dataset archives and the
# metadata CSV read by the following cells live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# path = "/content/drive/My Drive/Speech2Face/vox"
# !curl --user voxceleb1912:0s42xuw6 -o "/content/drive/My Drive/Speech2Face/ff/vox.zip" http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip



! cp "/content/drive/My Drive/Speech2Face/vox1_dataset/vox_audios/vox.zip" /content
! cp "/content/drive/My Drive/Speech2Face/zippedFaces.tar.gz" /content
! cp "/content/drive/My Drive/Speech2Face/vox1_dataset/vox1_meta.csv" /content

! tar zxvf zippedFaces.tar.gz
! unzip vox.zip



In [None]:
# !unzip "/content/drive/My Drive/Speech2Face/ff/vox.zip" -d "/content/drive/My Drive/Speech2Face/ff/ext/"

In [None]:
import glob
import itertools
from os import listdir
from os.path import join

import librosa
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

# Use the GPU when the runtime has one and fall back to the CPU otherwise, so the
# notebook still runs on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load one sample utterance: at most the first 6 seconds, resampled to 16 kHz and
# down-mixed to a single channel.
wav, sr = librosa.load(
    '/content/drive/My Drive/Speech2Face/vox1_dataset/vox_audios/ext/wav/id10270/OhfKF8FSq3Y/00003.wav',
    sr=16000,
    mono=True,
    duration=6.0,
)

In [None]:
! ls -l "wav" | wc -l 
#spectro = librosa.core.stft(wav, n_fft = 512, hop_length = int(np.ceil(0.01 * sr)),win_length = int(np.ceil(0.025 * sr)) , window='hann', center=True,pad_mode='reflect')

In [None]:


# The VoxCeleb1 metadata file is tab-separated.
meta = pd.read_csv('/content/drive/My Drive/Speech2Face/vox1_dataset/vox1_meta.csv',sep='\t')

# Drop the three unused columns in a single call instead of three separate
# single-column drops.
meta = meta.drop(columns=['Gender', 'Nationality', 'Set'])
meta

In [None]:
def get_map_person2paths(path, format='wav'):
  """Map every entry of `path` to the files with the given extension found below it.

  Args:
    path: directory whose direct children are the per-person folders.
    format: file extension to look for, without the leading dot.

  Returns:
    dict mapping each child name of `path` to a sorted list of matching file
    paths, searched recursively. Children without any match map to an empty list.
  """
  actor2data = dict()

  for person in listdir(path):
    # Escape the directory part so characters such as '[' in a folder name are
    # matched literally rather than interpreted as glob syntax.
    pattern = join(glob.escape(join(path, person)), '**', '*.' + format)
    # glob returns matches in arbitrary order; sorting keeps the mapping (and any
    # index-wise pairing built from it) reproducible between runs.
    actor2data[person] = sorted(glob.glob(pattern, recursive=True))

  return actor2data

def load_metadata(path):
  """Read the tab-separated metadata table and strip the columns that are not needed.

  Args:
    path: location of the metadata file.

  Returns:
    DataFrame with the 'Gender', 'Nationality' and 'Set' columns removed.

  Raises:
    KeyError: if any of those three columns is missing from the file.
  """
  meta = pd.read_csv(path, sep='\t')
  # One drop call instead of three chained single-column drops.
  return meta.drop(columns=['Gender', 'Nationality', 'Set'])

def couple_data(voice_map, face_map, meta):
  """Pair voice files with face files of the same identity.

  For every row of `meta`, the files listed under its 'VoxCeleb1 ID' in
  `voice_map` are zipped with the files listed under its 'VGGFace1 ID' in
  `face_map`, so the longer of the two lists is truncated. Rows whose voice or
  face identifier is absent from the corresponding map are skipped, and their
  number is printed.

  Returns:
    list of (voice_path, face_path) tuples in metadata row order.
  """
  pairs = []
  missing = 0
  for _, record in meta.iterrows():
    voice_id = record['VoxCeleb1 ID']
    if voice_id in voice_map and record['VGGFace1 ID'] in face_map:
      # zip stops at the shorter list: surplus voices or faces are left unpaired.
      pairs.extend(zip(voice_map[voice_id], face_map[record['VGGFace1 ID']]))
    else:
      missing += 1

  print("elements not found:", missing)
  return pairs

def create_coupled_list(path_voices, path_faces, metaP):
  """Build the list of (voice_path, face_path) pairs for a dataset on disk.

  Voice files are searched with the default 'wav' extension, face files with
  'jpg'; identities are matched through the metadata table stored at `metaP`.
  """
  return couple_data(
      get_map_person2paths(path_voices),
      get_map_person2paths(path_faces, 'jpg'),
      load_metadata(metaP),
  )

class Dataloader(Dataset):
  """Dataset of coupled (voice file, face file) paths.

  Args:
    path_voices: root directory holding one folder of audio files per person.
    path_faces: root directory holding one folder of face images per person.
    metaP: path of the tab-separated metadata table linking both identifiers.
    device: device identifier, stored on the instance for later use.
    size: stored as `self.size`; not used by the code in this class.
  """

  def __init__(self, path_voices, path_faces, metaP, device, size=64):
    super().__init__()
    self.path_voices = path_voices
    self.path_faces = path_faces
    # Previously accepted but discarded; keep it so later code can read it.
    self.device = device
    self.size = size
    self.coupled_list = create_coupled_list(path_voices, path_faces, metaP)
    self.len = len(self.coupled_list)

  def __len__(self):
    """Number of (voice, face) pairs found at construction time."""
    return self.len

  def __getitem__(self, idx):
    """Return the (voice_path, face_path) pair stored at position `idx`.

    Only the file paths are returned; decoding them into tensors is not
    implemented yet (TODO).
    """
    return self.coupled_list[idx]

# Build the coupled dataset from the extracted audio folder, the extracted face folder
# and the local copy of the metadata table.
data = Dataloader(
    path_voices='wav',
    path_faces='unzippedFaces',
    metaP='vox1_meta.csv',
    device=device,
)


In [None]:





def adjust(stft, n_frames=601):
  """Bring a spectrogram to exactly `n_frames` columns along axis 1.

  Shorter inputs are extended by repeating their frames from the start as often
  as needed; longer inputs are cropped to the first `n_frames` columns.

  Args:
    stft: array whose axis 1 indexes frames.
    n_frames: target number of frames (601 by default).

  Returns:
    `stft` itself when it already has `n_frames` columns, otherwise an array
    with the same leading dimension and `n_frames` columns.

  Raises:
    ValueError: if `stft` has no frames, since there is nothing to repeat.
  """
  current = stft.shape[1]
  if current == n_frames:
    return stft
  if current == 0:
    raise ValueError("cannot extend a spectrogram with zero frames")
  if current > n_frames:
    return stft[:, :n_frames]
  # Ceiling division: enough whole copies to reach the target even when the input
  # is shorter than half of it; the surplus is cropped afterwards.
  repeats = -(-n_frames // current)
  return np.concatenate([stft] * repeats, axis=1)[:, :n_frames]

# Bring the STFT to the fixed number of frames handled by `adjust`.
spectroComplex = adjust(spectro)

# Split the complex spectrogram into two real-valued planes on a trailing axis:
# converted[:, :, 0] holds the real part and converted[:, :, 1] the imaginary part.
# Plain slice assignment replaces the former fancy-index scatter and yields the same
# float64 array.
converted = np.zeros((spectroComplex.shape[0], spectroComplex.shape[1], 2))
converted[:, :, 0] = spectroComplex.real
converted[:, :, 1] = spectroComplex.imag

In [None]:
import torch
import torch.nn as nn

class SpeechEncoder(nn.Module):
    """Convolutional encoder turning a 2-channel spectrogram into a 4096-d vector.

    Each convolution is followed by ReLU and then batch normalisation. Four
    (2, 1) max-pools halve the first spatial axis only. The flattening step is
    hard-wired to 512 * 1 * 144 features per sample, which is what the stack
    yields for the (2, 257, 601) input this notebook feeds it.
    """

    def __init__(self):
        super().__init__()

        # Convolutional trunk. Layers are registered in the same order as before so
        # that parameter ordering and initialisation stay unchanged.
        self.conv1 = nn.Conv2d(2, 64, kernel_size=4, stride=1)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=4, stride=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=4, stride=1)
        self.pooling1 = nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        self.conv4 = nn.Conv2d(128, 128, kernel_size=4, stride=1)
        self.pooling2 = nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        self.conv5 = nn.Conv2d(128, 128, kernel_size=4, stride=1)
        self.pooling3 = nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        self.conv6 = nn.Conv2d(128, 256, kernel_size=4, stride=1)
        self.pooling4 = nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))
        self.conv7 = nn.Conv2d(256, 512, kernel_size=4, stride=1)
        self.conv8 = nn.Conv2d(512, 512, kernel_size=4, stride=2)

        # Original author's note, translated: "these two layers are different"
        # (presumably from the reference architecture; TODO confirm).
        # A 1x1 average pool with stride 1 leaves its input unchanged.
        self.conv9 = nn.Conv2d(512, 512, kernel_size=3, stride=2)
        self.pooling5 = nn.AvgPool2d(kernel_size=(1, 1), stride=1)

        # Fully connected head.
        self.fc1 = nn.Linear(512 * 1 * 144, 4096)
        self.fc2 = nn.Linear(4096, 4096)

        # One batch-norm per convolution, sized to that convolution's output channels.
        self.batch_norm1 = nn.BatchNorm2d(64)
        self.batch_norm2 = nn.BatchNorm2d(64)
        self.batch_norm3 = nn.BatchNorm2d(128)
        self.batch_norm4 = nn.BatchNorm2d(128)
        self.batch_norm5 = nn.BatchNorm2d(128)
        self.batch_norm6 = nn.BatchNorm2d(256)
        self.batch_norm7 = nn.BatchNorm2d(512)
        self.batch_norm8 = nn.BatchNorm2d(512)
        self.batch_norm9 = nn.BatchNorm2d(512)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Encode a batch `x` of 2-channel spectrograms into 4096-d vectors."""
        # (convolution, batch-norm, optional pooling applied afterwards)
        stages = (
            (self.conv1, self.batch_norm1, None),
            (self.conv2, self.batch_norm2, None),
            (self.conv3, self.batch_norm3, self.pooling1),
            (self.conv4, self.batch_norm4, self.pooling2),
            (self.conv5, self.batch_norm5, self.pooling3),
            (self.conv6, self.batch_norm6, self.pooling4),
            (self.conv7, self.batch_norm7, None),
            (self.conv8, self.batch_norm8, None),
        )

        features = x
        for conv, norm, pool in stages:
            features = norm(self.relu(conv(features)))
            if pool is not None:
                features = pool(features)

        # The last stage pools before the non-linearity, unlike the ones above.
        features = self.conv9(features)
        features = self.pooling5(features)
        features = self.batch_norm9(self.relu(features))

        flat = features.view((features.shape[0], 512 * 1 * 144))
        hidden = self.relu(self.fc1(flat))
        return self.fc2(hidden)



In [None]:
from torchsummary import summary


model = SpeechEncoder()
# `converted` is laid out as (frequency, frame, real/imag). The network wants the
# real/imag axis first, so it is moved with permute; a reshape to (2, 257, 601) would
# keep the memory order and mix real and imaginary values within each channel.
input = torch.unsqueeze(torch.tensor(converted).permute(2, 0, 1), 0)

# Show the output shape explicitly: only the last expression of a cell is displayed.
print(model(input.type(torch.float32)).shape)
# The model has not been moved off the CPU, while torchsummary defaults to CUDA.
summary(model, (2,257,601), device='cpu')