In [32]:
import os
import pandas as pd

In [7]:
release_data=''

In [11]:
def get_utts(data_folder):
    utts = {}
    files = {}
    with open(os.path.join(data_folder, 'text')) as f:
        for line in f.readlines():
            utt = {}
            parts = line.strip().split()
            utt_id = parts[0]
            text = ' '.join(parts[1:])
            utts[utt_id] = {
                'text': text
            }
    with open(os.path.join(data_folder, 'utt2spk')) as f:
        for line in f.readlines():
            parts = line.strip().split()
            utt_id = parts[0]
            spk_id = parts[1]
            utts[utt_id]['spk_id'] = spk_id
    with open(os.path.join(data_folder, 'wav.scp')) as f:
        for line in f.readlines():
            parts = line.strip().split()
            file_id = parts[0]
            path = parts[1]
            files[file_id] = path
    with open(os.path.join(data_folder, 'segments')) as f:
        for line in f.readlines():
            parts = line.strip().split()
            utt_id = parts[0]
            file_id = parts[1]
            start = parts[2]
            end = parts[3]
            utts[utt_id]['file_id'] = file_id
            utts[utt_id]['start'] = start
            utts[utt_id]['end'] = end
    return utts, files

In [12]:
release_utts, release_files = get_utts(release_data)

In [56]:
# create folder structure
release_folder = ''

def create_sox_script(utt, files, output):
    duration = str(float(utt['end']) - float(utt['start']))
    return 'sox {inp} {out} trim {start} {duration}\n'.format(inp=files[utt['file_id']], out=output, start=utt['start'], duration=duration)

def filter_unk_tokens(text):
    return ' '.join([word for word in text.split() if not word == '<UNK>'])

def write_release_files(release_utts, release_files):
    sox_script_lines = []
    speakers = set()
    
    if not os.path.exists(release_folder):
        os.mkdir(release_folder)
    
    word_tokens = 0
    
    for utt_id in release_utts.keys():
        utt = release_utts[utt_id]
        output_folder = os.path.join(release_folder, utt['file_id'])
        transcription_file = os.path.join(output_folder, '{utt_id}.trn'.format(utt_id=utt_id))
        audio_file = os.path.join(output_folder, '{utt_id}.wav'.format(utt_id=utt_id))
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        with open(transcription_file, 'w') as f:
            f.write(filter_unk_tokens(utt['text']))
            word_tokens += len(filter_unk_tokens(utt['text']))
        sox_script_lines.append(create_sox_script(utt, release_files, audio_file))
        speakers.add(utt['spk_id'])
    with open('fin_parliament_sox.sh', 'w') as f:
        for line in sox_script_lines:
            f.write(line)
    with open('fin_parliament_speakers', 'w') as f:
        for spk in sorted(list(speakers)):
            f.write(spk+'\n')
    print(word_tokens)
    
    
        

In [57]:
write_release_files(release_utts, release_files)

333483


In [53]:
def create_speaker_id_mapping():
    spk_file = open('fin_parliament_speakers')
    spkrs = [int(line.strip()) for line in spk_file.readlines()]
    spk_file.close()
    df = pd.read_csv('output/01/parliament_sv-fi_approximated.csv')
    print(df.mp_id.astype(int))
    
    mp_id_to_name = {}
    for i, row in df.iterrows():
        if not int(row.mp_id) in mp_id_to_name:
            mp_id_to_name[int(row.mp_id)] = ' '.join([row.firstname, row.lastname])
    
    with open('speaker_id_mapping.csv', 'w') as f:
        f.write('Speaker ID, Name\n')
        for spk_id in spkrs:
            f.write('{spk_id}, {name}\n'.format(spk_id=spk_id, name=mp_id_to_name[spk_id]))
        
    
    

In [54]:
create_speaker_id_mapping()

0       351
1       941
2      1345
3       769
4       923
       ... 
655    1345
656     943
657    1116
658     910
659     910
Name: mp_id, Length: 660, dtype: int64


In [55]:
len(release_utts)

3889