# This notebook will create the df_timit and save it as a csv for later use. 
    - We need to step through all the directories in the timit data store and create necessary rows
    - Also need to augment the data with some additional info
    
## Dialect information
     dr1:  New England
     dr2:  Northern
     dr3:  North Midland
     dr4:  South Midland
     dr5:  Southern
     dr6:  New York City
     dr7:  Western
     dr8:  Army Brat (moved around)


In [1]:
import pandas as pd
import numpy as np
import os
import csv
import pdb
from tqdm import tqdm
import random

#increase some pandas vars
pd.set_option('max_colwidth', 100)
sys_name = os.uname()[1]

def data_path_train():    
    if sys_name == 'Hive':
        return "/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/guesswho_true"
        #return "/home/logan/drive/Research/Data_Stores/Timit/Data/timit/TIMIT/TRAIN"
    else:
        return "/home/logan/SynologyDrive/Research/guesswho_new/guesswho18/data/deepfake_data/guesswho_true"
        #return "/Users/logan/Google_Drive/Research/Data_Stores/Timit/Data/timit/TIMIT/TRAIN"
        
def data_path_lyre_true_extend(): 
    return "/home/logan/SynologyDrive/Research/guesswho_new/guesswho18/data/deepfake_data/lyre_bird_comp/real/"

def data_path_lyre_fake_extend(): 
    return "/home/logan/SynologyDrive/Research/guesswho_new/guesswho18/data/deepfake_data/lyre_bird_comp/fake/"
    
def data_path_fakes():
    if sys_name == 'Hive':
        return "/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/guesswho_fakes/"
    else:
        return "//home/logan/SynologyDrive/Research/guesswho_new/guesswho18/data/deepfake_data/guesswho_fakes/"

def data_path_test():
    sys_name = os.uname()[1]
    if sys_name == 'Hive':
        return "/home/logan/drive/Research/Data_Stores/Timit/Data/timit/TIMIT/TEST"
    else:
        return "/home/logan/SynologyDrive/Research/Data_Stores/Timit/Data/timit/TIMIT/TEST"

def append_file(root, file, hive=False, add_slash=False):
    if hive:
        mod_root = "E:\\SynologyDrive\\" + root[root.find('Research'):]
        if add_slash:
            output = mod_root  + '\\' + file
        else:
            output = mod_root  + file
        output =  output.replace('/', '\\')
    else:
        if add_slash:
            output = root + '/' + file
        else:
            output = root + file
    return output

def cut_speaker_id(path):
    return path[path.rfind('/') + 1:]

In [2]:
# arpabet to ipa conversion
arpa_key = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'axr', 'ay', 'eh', 'er', 'ey', 'ih', 'ix', 'iy', \
                'ow', 'oy', 'uh', 'uw', 'ux' , 'b', 'ch', 'd', 'dh', 'dx', 'el', 'em', 'en', 'f', \
                'g', 'h', 'hh', 'jh', 'k', 'l', 'm', 'n', 'ng', 'nx', 'p', 'q', 'r', 's', 'sh', \
                't', 'th', 'v', 'w', 'wh', 'y', 'z', 'zh', 'ax-h', 'bcl', 'dcl', 'eng', 'gcl', 'hv', \
                'kcl', 'pcl', 'tcl', 'pau', 'epi', 'h#']
ipa_key = ['ɑ', 'æ', 'ʌ', 'ɔ', 'aʊ', 'ə', 'ɚ', 'aɪ', 'ɛ', 'ɝ', 'eɪ', 'ɪ', 'ɨ', 'i', 'oʊ', 'ɔɪ', \
                'ʊ', 'u', 'ʉ', 'b', 'tʃ', 'd', 'ð', 'ɾ', 'l̩', 'm̩', 'n̩', 'f', 'ɡ', 'h', 'h', 'dʒ', 'k', \
                'l', 'm', 'n', 'ŋ', 'ɾ̃', 'p', 'ʔ', 'ɹ', 's', 'ʃ', 't', 'θ', 'v', 'w', 'ʍ', 'j', 'z', \
                'ʒ', 'ə̥', 'b̚', 'd̚', 'ŋ̍', 'ɡ̚', 'ɦ', 'k̚', 'p̚', 't̚', 'N/A', 'N/A', 'N/A']

ipa_conversion = dict(zip(arpa_key, ipa_key))
def convert_to_ipa(arpa_key):
    output = []
    for key in arpa_key:
        output.append(ipa_conversion[key])
    return output

arpa_conversion = dict(zip(ipa_key, arpa_key))
def convert_from_ipa(ipa_key):
    output = []
    for key in ipa_key:
        output.append(arpa_conversion.get(key, 'N/A'))
    return output

def join_word_phoneme(df_wrd, df_phn, audio_file):
    new_df = df_wrd.merge(df_phn, 'outer', on=('sample_id', 'speaker_id'), \
                          suffixes=('_word', '_phoneme'))
    new_df = new_df[(new_df.start_phoneme >= new_df.start_word) &
                    (new_df.end_phoneme <= new_df.end_word)]
    try:
        new_df['ipa'] = convert_to_ipa(new_df['arpabet'])
        new_df['filepath'] = audio_file
    except:
        pdb.set_trace()
    if audio_file[audio_file.rfind('/') - 5] == 'M':
        new_df['sex'] = 'm'
    else:
        new_df['sex'] = 'f'
    return new_df

def ipa_join_word_phoneme(df_wrd, df_phn, audio_file):
    new_df = df_wrd.merge(df_phn, 'outer', on=('sample_id', 'speaker_id'), \
                          suffixes=('_word', '_phoneme'))
    new_df = new_df[(new_df.start_phoneme >= new_df.start_word) &
                    (new_df.end_phoneme <= new_df.end_word)]
    try:
        new_df['arpabet'] = convert_from_ipa(new_df['ipa'])
        new_df['filepath'] = audio_file
    except:
        pdb.set_trace()
    if audio_file[audio_file.rfind('/') - 5] == 'M':
        new_df['sex'] = 'm'
    else:
        new_df['sex'] = 'f'
    return new_df

## Lyrebird Master Creation
* Feb 2 2021 (For Usenix submission)

In [3]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_train = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])
for root, dirs, files in list(os.walk(data_path_lyre_true_extend())):     #training data
    for file in tqdm(files, position=0, leave=True):
        if 'wav' in file.lower():
            last_under = file.rfind('_')
            sample_id = file[last_under + 1:-4]
            speaker_id = file[last_under - 5:last_under]
            #get phoneme infor
            phn_file = append_file(root, file[:-4]+'.PHN', add_slash=True)
            df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'ipa'])
            df_phn['sample_id'] = sample_id
            df_phn['speaker_id'] = speaker_id
            
            #get word info
            wrd_file = append_file(root, file[:-4]+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            df_train = df_train.append(ipa_join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)

#add phoneme indices for bigram analysis
df_train['index_phoneme'] = -1
grouped_df = df_train.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_train.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_train = df_train[df_train.ipa.isin(all_ph)]
df_train.count()[0]

#save dataframe
df_train.to_csv('../../data/lyrebird_true_usenix_master.csv', index=False)
print('Done')

100%|██████████████████████████████████████████████████████| 2/2 [00:00<00:00, 18893.26it/s]
100%|██████████████████████████████████████████████████████| 28/28 [00:00<00:00, 304.65it/s]
100%|██████████████████████████████████████████████████████| 41/41 [00:00<00:00, 269.09it/s]


Done


In [4]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_train = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])
for root, dirs, files in list(os.walk(data_path_lyre_fake_extend())):     #training data
    for file in tqdm(files, position=0, leave=True):
        if 'wav' in file.lower():
            last_under = file.rfind('_')
            sample_id = file[last_under + 1:-4]
            speaker_id = file[last_under - 5:last_under]
            #get phoneme infor
            phn_file = append_file(root, file[:-4]+'.PHN', add_slash=True)
            df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'ipa'])
            df_phn['sample_id'] = sample_id
            df_phn['speaker_id'] = speaker_id
            
            #get word info
            wrd_file = append_file(root, file[:-4]+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            df_train = df_train.append(ipa_join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)

#add phoneme indices for bigram analysis
df_train['index_phoneme'] = -1
grouped_df = df_train.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_train.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_train = df_train[df_train.ipa.isin(all_ph)]
df_train.count()[0]

#save dataframe
df_train.to_csv('../../data/lyrebird_fake_usenix_master.csv', index=False)
print('Done')

0it [00:00, ?it/s]
100%|██████████████████████████████████████████████████████| 16/16 [00:00<00:00, 269.59it/s]
100%|██████████████████████████████████████████████████████| 20/20 [00:00<00:00, 282.04it/s]


Done


In [5]:
data_path_lyre_true_extend()

'/home/logan/SynologyDrive/Research/guesswho_new/guesswho18/data/deepfake_data/lyre_bird_comp/real/'

## Timit master creation

### Training

In [6]:
#step through all directories in data path, consume all files with s^Y vimame name (different extensions)
#to populate a single row
tmp = 0
df_train = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])
for root, dirs, files in list(os.walk(data_path_train())):     #training data
    for file in tqdm(files, position=0, leave=True):
        if 'wav' in file.lower():
            last_under = file.rfind('_')
            sample_id = file[last_under + 1:-4]
            speaker_id = file[last_under - 5:last_under]
            #get phoneme infor
            phn_file = append_file(root, file[:-4]+'.PHN', add_slash=True)
            df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'ipa'])
            df_phn['sample_id'] = sample_id
            df_phn['speaker_id'] = speaker_id
            
            #get word info
            wrd_file = append_file(root, file[:-4]+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            df_train = df_train.append(ipa_join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)

#add phoneme indices for bigram analysis
df_train['index_phoneme'] = -1
grouped_df = df_train.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_train.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_train = df_train[df_train.ipa.isin(all_ph)]
df_train.count()[0]

#save dataframe
df_train.to_csv('../../data/timit_master.csv', index=False)
print('Done')

Done


### Testing set

In [7]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_test = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])
for root, dirs, files in tqdm(list(os.walk(data_path_test()))):     #training data
    for file in files:
        if 'wav' in file.lower():
            sample_id = file[:-4]
            speaker_id = cut_speaker_id(root)
            #get phoneme infor
            phn_file = append_file(root, sample_id+'.PHN', add_slash=True)
            df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'arpabet'])
            df_phn['sample_id'] = sample_id
            df_phn['speaker_id'] = speaker_id
            
            #get word info
            wrd_file = append_file(root, sample_id+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            df_test = df_test.append(join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)

#add phoneme indices for bigram analysis
df_test['index_phoneme'] = -1
grouped_df = df_test.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_test.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_test = df_test[df_test.ipa.isin(all_ph)]
df_test.count()[0]

#save dataframe
#df_test.to_csv('../../data/timit_test_master.csv', index=False)
print('Done')

0it [00:00, ?it/s]

Done





In [8]:
df_test.merge(df_train, left_on='speaker_id', right_on='speaker_id')

Unnamed: 0,start_word_x,end_word_x,word_x,sample_id_x,start_phoneme_x,end_phoneme_x,sex_x,arpabet_x,ipa_x,filepath_x,...,word_y,sample_id_y,speaker_id,start_phoneme_y,end_phoneme_y,sex_y,arpabet_y,ipa_y,filepath_y,index_phoneme_y


## Fake master creation
* First cell is for fully labeled data

* This second cell is for word labels only

In [9]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_timit = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', \
             'sex', 'filepath'])

for root, _, files in os.walk(data_path_fakes()):     #deep fakes data
    for file in files:        
        if 'wav' in file.lower():
            sample_id = file[:-4]
            speaker_id = file[16:21]
            
            #get word info
            wrd_file = append_file(root, sample_id+'.WRD')
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start_word', 'end_word', 'word'])            
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            fpath = append_file(root, file, hive=True)
            df_wrd['filepath'] = fpath
            if fpath[fpath[:fpath.rfind('_')].rfind('_') - 5] == 'M':
                df_wrd['sex'] = 'm'
            else:
                df_wrd['sex'] = 'f'
            df_timit = df_timit.append(df_wrd, ignore_index=True)
print("Len: ", df_timit.count()[0])

df_timit.count()[0]

#save dataframe
df_timit.to_csv('../../data/real_time_master.csv', index=False)

Len:  0


### Gentle

In [10]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_test = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])

data_path = '/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/guesswho_fakes'
for root, dirs, files in tqdm(list(os.walk(data_path)), position=0, leave=True):     #training data
    for file in files:
        if 'wav' in file.lower():
            sample_id = file[:-4]
            speaker_id = file[16:21]            
                
            #get phoneme infor
            phn_file = append_file(root, sample_id+'.PHN', add_slash=True)
            df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'ipa'])
            df_phn['sample_id'] = sample_id
            df_phn['speaker_id'] = speaker_id
            
            #get word info
            wrd_file = append_file(root, sample_id+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            df_test = df_test.append(ipa_join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)


df_raw = df_test
#add phoneme indices for bigram analysis
df_test['index_phoneme'] = -1
grouped_df = df_test.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_test.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_test = df_test[df_test.ipa.isin(all_ph)]
df_test.count()[0]

#save dataframe
df_test.to_csv('../../data/real_time_gentle_master.csv', index=False)
print('Done')

0it [00:00, ?it/s]

Done





In [11]:
df_test.to_csv('../../data/real_time_gentle_master.csv', index=False)

In [12]:
df_test

Unnamed: 0,start_word,end_word,word,sample_id,speaker_id,start_phoneme,end_phoneme,sex,arpabet,ipa,filepath,index_phoneme


# Lyrebird Data

## Real speech

In [13]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_test = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])

data_path = '/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/lyre_bird/true/'
for root, dirs, files in tqdm(list(os.walk(data_path_test)):     #training data
    for file in files:
        if 'wav' in file.lower():
            sample_id = file[:-4]
            if 'trump' in file:
                speaker_id = 'trump'
            else:
                speaker_id = 'obama'
                
            #get phoneme infor
            phn_file = append_file(root, sample_id+'.PHN', add_slash=True)
            df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'arpabet'])
            df_phn['sample_id'] = sample_id
            df_phn['speaker_id'] = speaker_id
            
            #get word info
            wrd_file = append_file(root, sample_id+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            df_test = df_test.append(join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)

#add phoneme indices for bigram analysis
df_test['index_phoneme'] = -1
grouped_df = df_test.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_test.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_test = df_test[df_test.ipa.isin(all_ph)]
df_test.count()[0]

#save dataframe
df_timit.to_csv('../../data/lyre_bird_true_master.csv', index=False)
print('Done')

SyntaxError: invalid syntax (748468952.py, line 8)

## Fake Audio

In [None]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_timit = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', \
             'sex', 'filepath'])
data_path = '/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/lyre_bird/fake/'
for root, _, files in tqdm(list(os.walk(data_path))):     #deep fakes data
    for file in files:        
        if 'wav' in file.lower():
            sample_id = file[:-4]
            if 'trump' in file:
                speaker_id = 'trump'
            else:
                speaker_id = 'obama'
            
            #get word info
            wrd_file = append_file(root, sample_id+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start_word', 'end_word', 'word'])            
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            fpath = append_file(root, file, hive=True, add_slash=True)
            df_wrd['filepath'] = fpath
            if fpath[fpath[:fpath.rfind('_')].rfind('_') - 5] == 'M':
                df_wrd['sex'] = 'm'
            else:
                df_wrd['sex'] = 'f'
            df_timit = df_timit.append(df_wrd, ignore_index=True)
print("Len: ", df_timit.count()[0])

df_timit.count()[0]

#save dataframe
df_timit.to_csv('../../data/lyre_bird_fake_master.csv', index=False)

# ASV Spoof

## True set

In [None]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_test = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])

data_path = '/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/ASV_spoof/real/'
for root, dirs, files in tqdm(list(os.walk(data_path)), position=0, leave=True):     #training data
    for file in files:
        if 'wav' in file.lower():
            sample_id = file[:-4]
            speaker_id = root[-16:-9]            
                
            #get phoneme infor
            phn_file = append_file(root, sample_id+'.PHN', add_slash=True)
            df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'ipa'])
            df_phn['sample_id'] = sample_id
            df_phn['speaker_id'] = speaker_id
            
            #get word info
            wrd_file = append_file(root, sample_id+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            df_test = df_test.append(ipa_join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)

#add phoneme indices for bigram analysis
df_test['index_phoneme'] = -1
grouped_df = df_test.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_test.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_test = df_test[df_test.ipa.isin(all_ph)]
df_test.count()[0]

#save dataframe
df_test.to_csv('../../data/asv_spoof_true_master.csv', index=False)
print('Done')

## Fake set

In [None]:
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_test = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', 'start_phoneme', 
        'end_phoneme', 'sex', 'arpabet', 'ipa', 'filepath'])

data_path = '/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/ASV_spoof/fake/'
for root, dirs, files in tqdm(list(os.walk(data_path)), position=0, leave=True):     #training data
    for file in files:
        if 'wav' in file.lower():
            try:
                sample_id = file[:-4]
                speaker_id = root[-16:-9]            

                #get phoneme infor
                phn_file = append_file(root, sample_id+'.PHN', add_slash=True)
                df_phn = pd.read_csv(phn_file, delimiter=' ', names=['start', 'end', 'ipa'])
                df_phn['sample_id'] = sample_id
                df_phn['speaker_id'] = speaker_id

                #get word info
                wrd_file = append_file(root, sample_id+'.WRD', add_slash=True)
                df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start', 'end', 'word'])
                df_wrd['sample_id'] = sample_id
                df_wrd['speaker_id'] = speaker_id
                df_test = df_test.append(ipa_join_word_phoneme(df_wrd, df_phn, 
                                    append_file(root, file, hive=True, add_slash=True)), ignore_index=True)
            except:
                #likely that wrd or phn file not created, just skip
                pass

#add phoneme indices for bigram analysis
df_test['index_phoneme'] = -1
grouped_df = df_test.groupby(['speaker_id', 'sample_id', 'start_word'])
for key, item in grouped_df:
    new_indices = list(range(len(item.index_phoneme)))
    df_test.loc[item.index, 'index_phoneme'] = new_indices
    
#filter out unknown ipa values (epi, silence)
all_ph = ['ʃ', 'ɨ', 'ɦ', 'ɛ', 'd̚', 'dʒ', 'ɪ', 'd', 'ʌ', 'k̚', 'k', 's', 'ʉ',
       'ʔ', 'n̩', 'ɡ̚', 'ɡ', 'ɹ', 'w', 'ɔ', 'ɾ', 'ɚ', 'l', 'j', 'ʊ', 'n',
       'æ', 'm', 'ɔɪ', 'ə', 'ð', 't̚', 'i', 'v', 'f', 't', 'p̚', 'oʊ',
       'h', 'tʃ', 'b̚', 'b', 'ɑ', 'm̩', 'ŋ', 'aɪ', 'θ', 'ə̥', 'eɪ',
       'p', 'aʊ', 'ɝ', 'ɾ̃', 'z', 'l̩', 'u', 'ʒ', 'ŋ̍']

df_test = df_test[df_test.ipa.isin(all_ph)]
df_test.count()[0]

#save dataframe
df_test.to_csv('../../data/asv_spoof_gentle_master.csv', index=False)
print('Done')

In [None]:
"""============Old=============================="""
#step through all directories in data path, consume all files with same name (different extensions)
#to populate a single row
tmp = 0
df_timit = pd.DataFrame(columns=['start_word', 'end_word', 'word', 'sample_id', 'speaker_id', \
             'sex', 'filepath'])
data_path = '/home/logan/drive/Research/guesswho_new/guesswho18/data/deepfake_data/lyre_bird/fake/'
for root, _, files in tqdm(list(os.walk(data_path))):     #deep fakes data
    for file in files:        
        if 'wav' in file.lower():
            sample_id = file[:-4]
            speaker_id = root[-16:-9] 
            
            #get word info
            wrd_file = append_file(root, sample_id+'.WRD', add_slash=True)
            df_wrd = pd.read_csv(wrd_file, delimiter=' ', names=['start_word', 'end_word', 'word'])            
            df_wrd['sample_id'] = sample_id
            df_wrd['speaker_id'] = speaker_id
            fpath = append_file(root, file, hive=True, add_slash=True)
            df_wrd['filepath'] = fpath
            df_wrd['sex'] = 'f'

            df_timit = df_timit.append(df_wrd, ignore_index=True)
print("Len: ", df_timit.count()[0])

df_timit.count()[0]

#save dataframe
df_timit.to_csv('../../data/asv_spoof_fake_master.csv', index=False)

## Create consistency validation set

In [None]:
gw_dir = '/home/logan/drive/Research/guesswho18/'     #hive
#gw_dir = '/User/logan/Gogle_Drive/Research/guesswho18/'     #iMac

df_timit_read = pd.read_csv(gw_dir + 'data/timit_master.csv', sep=',',
       dtype = {
           'start_word' : np.int,
           'end_word': np.int,
           'word': np.str,
           'sample_id': np.str,
           'speaker_id': np.str,
           'start_phoneme': np.int,
           'end_phoneme': np.int,
           'arpabet': np.str,
           'ipa': str,
           'filename': np.str,
           'index_phoneme': np.int
       })
print("Timit done...")

In [None]:
paths = list(df_timit_read['filepath'].unique())
import random
random.seed(13)
sampled_paths = random.sample(paths, 490)

In [None]:
df_sample = df_timit_read[df_timit_read.filepath.isin(sampled_paths)]
df_sample = df_sample.reset_index()
df_sample.drop(columns=['index'])
df_sample.to_csv('../../data/consistent_master.csv', index=False)