In [1]:
import os, glob
import pandas as pd
import numpy as np

In [2]:
corpus_name = 'atc0'
root_path = '/project/graziul/ra/shiyanglai/corpus/'
corpus_prefix = '/project/graziul/data/corpora/atc0_comp/'

def make_corpus_structure(root_path, corpus_name):
    if not os.path.exists(root_path + corpus_name):
        os.makedirs(root_path + corpus_name)
        # create three subfolders: lists, rttms, uems
        if not os.path.exists(root_path + corpus_name + '/lists'):
            os.makedirs(root_path + corpus_name + '/lists')
        else:
            print(f"Directory '{root_path + corpus_name + '/lists'}' already exist. Please delete it and try again.")

        if not os.path.exists(root_path + corpus_name + '/rttms'):
            os.makedirs(root_path + corpus_name + '/rttms')
            os.makedirs(root_path + corpus_name + '/rttms/train')
            os.makedirs(root_path + corpus_name + '/rttms/dev')
            os.makedirs(root_path + corpus_name + '/rttms/test')
        else:
            print(f"Directory '{root_path + corpus_name + '/rttms'}' already exist. Please delete it and try again.")

        if not os.path.exists(root_path + corpus_name + '/uems'):
            os.makedirs(root_path + corpus_name + '/uems')
            os.makedirs(root_path + corpus_name + '/uems/train')
            os.makedirs(root_path + corpus_name + '/uems/dev')
            os.makedirs(root_path + corpus_name + '/uems/test')
        else:
            print(f"Directory '{root_path + corpus_name + '/uems'}' already exist. Please delete it and try again.")

        print('The structure of corpus has been set successfully.')
    else:
        print(f"Directory '{root_path + corpus_name}' already exist. Please delete it and try again.")

In [3]:
# split train, development, test files
def split_atc0(train_rate=0.6, dev_rate=0.2):
    all_files = glob.glob('/project/graziul/data/corpora/atc0_comp/*/data/audio/*')
    all_files = [f for f in all_files if f.endswith('.sph')]
    # random sampling
    train = all_files[:int(len(all_files)*train_rate)]
    devlopment = all_files[len(train):len(train)+int(len(all_files)*0.2)]
    test = all_files[len(train)+len(devlopment):]
    # inform the spliting results
    print('Corpus splitting successfully.')
    print(f'Training sample: {len(train)}.\nDevelopment sample: {len(devlopment)}.\nTesting sample: {len(test)}.')
    return train, devlopment, test

In [4]:
# generate lists
def generate_list(root_path, train, development, test):
    for name, data in zip(['train.txt', 'dev.txt', 'test.txt'], [train, development, test]):
        with open(root_path + '/lists/' + name, 'w') as f:
            uris = [audio.split('/')[-1].split('.')[0] + '\n' for audio in data]
            f.writelines(uris)
            f.close()
    print('.lst files are created successfully.')

In [24]:
# generate rttms
def generate_rttms(root_path, train, development, test, reference, prefix_ignore=''):
    utt = pd.read_csv(reference)
    # train files
    for path in train:
        uri = path.split('/')[-1].split('.')[0]
        sub_df = utt[utt.filePath == path.replace(prefix_ignore, '')]
        file = open(root_path+'/rttms/train/'+uri+'.rttm', 'w')
        for index, row in sub_df.iterrows():
            try:
                file.write(f"SPEAKER {uri} 1 {row['start']} {float(row['end'])-float(row['start'])} <NA> <NA> {row['speaker'].replace(' ', '')} <NA> <NA>" + "\n")
            except BaseException:
                print(f"Incorrect training instance in .csv file row {index}.")
        file.close()
    for path in development:
        uri = path.split('/')[-1].split('.')[0]
        sub_df = utt[utt.filePath == path.replace(prefix_ignore, '')]
        file = open(root_path+'/rttms/dev/'+uri+'.rttm', 'w')
        for index, row in sub_df.iterrows():
            try:
                file.write(f"SPEAKER {uri} 1 {row['start']} {float(row['end'])-float(row['start'])} <NA> <NA> {row['speaker'].replace(' ', '')} <NA> <NA>" + "\n")
            except BaseException:
                print(f"Incorrect development instance in .csv file row {index}.")
        file.close()
    for path in test:
        uri = path.split('/')[-1].split('.')[0]
        sub_df = utt[utt.filePath == path.replace(prefix_ignore, '')]
        file = open(root_path+'/rttms/test/'+uri+'.rttm', 'w')
        for index, row in sub_df.iterrows():
            try:
                file.write(f"SPEAKER {uri} 1 {row['start']} {float(row['end'])-float(row['start'])} <NA> <NA> {row['speaker'].replace(' ', '')} <NA> <NA>" + "\n")
            except BaseException:
                print(f"Incorrect testing instance in .csv file row {index}.")
        file.close()
    print(".rttm files created successfully!")

In [7]:
# generate uems
def generate_uems(root_path, train, development, test, reference, prefix_ignore=''):
    utt = pd.read_csv(reference)
    # train files
    for path in train:
        uri = path.split('/')[-1].split('.')[0]
        sub_df = utt[utt.filePath == path.replace(prefix_ignore, '')]
        file = open(root_path+'/uems/train/'+uri+'.uem', 'w')
        file.write(f"{uri} 1 0.000 {sub_df.tail(1)['end'].values[0]}")
        file.close()
    for path in development:
        uri = path.split('/')[-1].split('.')[0]
        sub_df = utt[utt.filePath == path.replace(prefix_ignore, '')]
        file = open(root_path+'/uems/dev/'+uri+'.uem', 'w')
        file.write(f"{uri} 1 0.000 {sub_df.tail(1)['end'].values[0]}")
        file.close()
    for path in test:
        uri = path.split('/')[-1].split('.')[0]
        sub_df = utt[utt.filePath == path.replace(prefix_ignore, '')]
        file = open(root_path+'/uems/test/'+uri+'.uem', 'w')
        file.write(f"{uri} 1 0.000 {sub_df.tail(1)['end'].values[0]}")
        file.close()
    print(".uem files created successfully!")

In [25]:
# make_corpus_structure(root_path, corpus_name)
# train, dev, test = split_atc0()
generate_list(root_path=root_path+corpus_name, train=train, development=dev, test=test)
generate_rttms(root_path='/project/graziul/ra/shiyanglai/corpus/atc0', train=train, development=dev, test=test,
               reference='/project/graziul/data/corpora/atc0_comp/atc0.csv', prefix_ignore=corpus_prefix)
generate_uems(root_path='/project/graziul/ra/shiyanglai/corpus/atc0', train=train, development=dev, test=test,
             reference='/project/graziul/data/corpora/atc0_comp/atc0.csv', prefix_ignore=corpus_prefix)

.lst files are created successfully.
Incorrect testing instance in .csv file row 9009.
.rttm files created successfully!
.uem files created successfully!


In [12]:
data

NameError: name 'data' is not defined

In [34]:
corpus_name = 'cpd'
root_path = '/project/graziul/ra/shiyanglai/corpus/'
make_corpus_structure(root_path, corpus_name)

In [23]:
data = pd.read_csv('/project/graziul/data/corpora/atc0_comp/atc0.csv', index_col=0)
' AA'.replace(' ', '')

'AA'

In [51]:
data = data.rename(columns={'path': 'filePath', 'start_time': 'start', 'end_time': 'end'})
data['speaker'] = '00001'
data.to_csv('dataset_cpd.csv')

In [49]:
def split_cpd(reference, train_rate=0.6, dev_rate=0.2):
    utt = pd.read_csv(reference)
    all_files = list(set(utt.filePath))
    train = all_files[:int(len(all_files)*train_rate)]
    devlopment = all_files[len(train):len(train)+int(len(all_files)*0.2)]
    test = all_files[len(train)+len(devlopment):]
    # inform the spliting results
    print('Corpus splitting successfully.')
    print(f'Training sample: {len(train)}.\nDevelopment sample: {len(devlopment)}.\nTesting sample: {len(test)}.')
    return train, devlopment, test

In [52]:
train, dev, test = split_cpd('dataset_cpd.csv') 
generate_list(root_path=root_path+corpus_name, train=train, development=dev, test=test)
generate_rttms(root_path='/project/graziul/ra/shiyanglai/corpus/cpd', train=train, development=dev, test=test,
               reference='dataset_cpd.csv', prefix_ignore='')
generate_uems(root_path='/project/graziul/ra/shiyanglai/corpus/atc0', train=train, development=dev, test=test,
             reference='dataset_cpd.csv', prefix_ignore='')

Corpus splitting successfully.
Training sample: 97.
Development sample: 32.
Testing sample: 34.
.lst files are created successfully.
.rttm files created successfully!
.uem files created successfully!
