In [15]:
import os
import shutil
import numpy as np
import pandas as pd 
from tqdm import tqdm

In [2]:
def get_language(char: str) -> str:
    assert len(char) == 1
    language_dict = {'E': "ENG", "F": "FIN", "G": "GER", "M": "MAN"}
    return language_dict[char]


In [3]:
root_name = 'T16'
for dir_name in os.listdir(root_name):
    if os.path.isdir(os.path.join(root_name, dir_name)):
        for file_name in os.listdir(os.path.join(root_name, dir_name)):
            if os.path.isfile(os.path.join(root_name, dir_name, file_name)):
                shutil.copyfile(os.path.join(root_name, dir_name, file_name), os.path.join(root_name, f'{dir_name[:4]}_{dir_name[-4:]}_{file_name}'))

In [4]:
root_name = 'spoofed_voice'
method_name = 'T22'
for file_name in os.listdir(method_name):
    if os.path.isfile(os.path.join(method_name, file_name)) and file_name.endswith('.wav'):
        shutil.move(os.path.join(method_name, file_name), os.path.join(root_name, f'{method_name}_{file_name}'))


In [5]:
columns = ['-', 'utt', 'language', 'source', 'method', 'target', 'label', 'phase']
df = pd.DataFrame(columns=columns)

In [6]:
series_list = []
for file_name in os.listdir('spoofed_voice'):
    data = []
    data.append('-')
    data.append(file_name[:-4])
    data.append(get_language(file_name[10]))
    data.append(file_name[4:8]) # source
    data.append(file_name[:3]) # method
    data.append(file_name[9:13]) # target
    data.append('spoof')
    data.append('test')
    series = pd.Series(data=data, index=columns)
    series_list.append(series)

In [7]:
df = pd.concat(series_list, axis=1).transpose()
df

Unnamed: 0,-,utt,language,source,method,target,label,phase
0,-,T11_SEF1_TEF1_E20001,ENG,SEF1,T11,TEF1,spoof,test
1,-,T11_SEF1_TEF1_E20002,ENG,SEF1,T11,TEF1,spoof,test
2,-,T11_SEF1_TEF1_E20003,ENG,SEF1,T11,TEF1,spoof,test
3,-,T11_SEF1_TEF1_E20004,ENG,SEF1,T11,TEF1,spoof,test
4,-,T11_SEF1_TEF1_E20005,ENG,SEF1,T11,TEF1,spoof,test
...,...,...,...,...,...,...,...,...
2056,-,T22_SEF2_TMF1_E20012,MAN,SEF2,T22,TMF1,spoof,test
2057,-,T22_SEM2_TGM1_E20008,GER,SEM2,T22,TGM1,spoof,test
2058,-,T22_SEM2_TGF1_E20008,GER,SEM2,T22,TGF1,spoof,test
2059,-,T22_SEM1_TGF1_E20016,GER,SEM1,T22,TGF1,spoof,test


In [8]:
root_name = 'bonafide_voice'
folder_name = 'vcc2020_database_groundtruth'
for speaker in os.listdir(folder_name):
    if os.path.isdir(os.path.join(folder_name, speaker)):
        for file_name in os.listdir(os.path.join(folder_name, speaker)):
            if os.path.isfile(os.path.join(folder_name, speaker, file_name)) and file_name.endswith('.wav'):
                shutil.copy(os.path.join(folder_name, speaker, file_name), os.path.join(root_name, f'GT_{speaker}_{speaker}_{file_name}'))

In [9]:
GT_df = pd.DataFrame(columns=columns)
series_list = []
for file_name in os.listdir('bonafide_voice'):
    data = []
    data.append('-')
    data.append(file_name[:-4]) # utt
    data.append(get_language(file_name[9]))
    data.append(file_name[3:7]) # source
    data.append(file_name[:2]) # method
    data.append(file_name[8:12]) # target
    data.append('bonafide')
    data.append('test')
    series = pd.Series(data=data, index=columns)
    series_list.append(series)

In [10]:
GT_df = pd.concat(series_list, axis=1).transpose()
GT_df

Unnamed: 0,-,utt,language,source,method,target,label,phase
0,-,GT_TGM1_TGM1_E30019,GER,TGM1,GT,TGM1,bonafide,test
1,-,GT_TGM1_TGM1_E30025,GER,TGM1,GT,TGM1,bonafide,test
2,-,GT_TGM1_TGM1_E30024,GER,TGM1,GT,TGM1,bonafide,test
3,-,GT_TGM1_TGM1_E30018,GER,TGM1,GT,TGM1,bonafide,test
4,-,GT_TGM1_TGM1_E30020,GER,TGM1,GT,TGM1,bonafide,test
...,...,...,...,...,...,...,...,...
245,-,GT_TGF1_TGF1_E30014,GER,TGF1,GT,TGF1,bonafide,test
246,-,GT_TGF1_TGF1_E30002,GER,TGF1,GT,TGF1,bonafide,test
247,-,GT_TGF1_TGF1_E30016,GER,TGF1,GT,TGF1,bonafide,test
248,-,GT_TGF1_TGF1_E30017,GER,TGF1,GT,TGF1,bonafide,test


In [11]:
df['language'].value_counts()

ENG    626
FIN    480
MAN    480
GER    475
Name: language, dtype: int64

In [12]:
GT_df['language'].value_counts()

ENG    100
GER     50
FIN     50
MAN     50
Name: language, dtype: int64

In [13]:
all_df = pd.concat([df, GT_df])
all_df.reset_index(drop=True, inplace=True)
all_df

Unnamed: 0,-,utt,language,source,method,target,label,phase
0,-,T11_SEF1_TEF1_E20001,ENG,SEF1,T11,TEF1,spoof,test
1,-,T11_SEF1_TEF1_E20002,ENG,SEF1,T11,TEF1,spoof,test
2,-,T11_SEF1_TEF1_E20003,ENG,SEF1,T11,TEF1,spoof,test
3,-,T11_SEF1_TEF1_E20004,ENG,SEF1,T11,TEF1,spoof,test
4,-,T11_SEF1_TEF1_E20005,ENG,SEF1,T11,TEF1,spoof,test
...,...,...,...,...,...,...,...,...
2306,-,GT_TGF1_TGF1_E30014,GER,TGF1,GT,TGF1,bonafide,test
2307,-,GT_TGF1_TGF1_E30002,GER,TGF1,GT,TGF1,bonafide,test
2308,-,GT_TGF1_TGF1_E30016,GER,TGF1,GT,TGF1,bonafide,test
2309,-,GT_TGF1_TGF1_E30017,GER,TGF1,GT,TGF1,bonafide,test


In [14]:
all_df['language'].value_counts()

ENG    726
FIN    530
MAN    530
GER    525
Name: language, dtype: int64

In [70]:
lang = 'GER'
eng_df = all_df[all_df['language'] == lang]

In [71]:
eng_train, eng_dev, eng_test = np.split(eng_df.sample(frac=1), [int(.4*len(eng_df)), int(.6*len(eng_df))])

In [72]:
eng_train['label'].value_counts()

spoof       189
bonafide     21
Name: label, dtype: int64

In [73]:
eng_dev['label'].value_counts()

spoof       96
bonafide     9
Name: label, dtype: int64

In [74]:
eng_test['label'].value_counts()

spoof       190
bonafide     20
Name: label, dtype: int64

In [75]:
eng_train.reset_index(drop=True, inplace=True)
eng_dev.reset_index(drop=True, inplace=True)
eng_test.reset_index(drop=True, inplace=True)
eng_train['phase'] = ['train']*len(eng_train)
eng_dev['phase'] = ['dev']*len(eng_dev)
eng_test['phase'] = ['test']*len(eng_test)

In [76]:
cm_path = '/home/h-oiso/AudioFake/SSL_Anti-spoofing/LA-keys-stage-1/keys/CM'
eng_train.to_csv(os.path.join(cm_path, f'train_metadata_VCC_{lang}.txt'), sep=' ', header=None, index=False)
eng_dev.to_csv(os.path.join(cm_path, f'dev_metadata_VCC_{lang}.txt'), sep=' ', header=None, index=False)
eng_test.to_csv(os.path.join(cm_path, f'test_metadata_VCC_{lang}.txt'), sep=' ', header=None, index=False)

In [66]:
# all_df.to_csv('test_metadata_VCC.txt', sep=' ', header=None, index=False)