In [39]:
'''
We ingest the data.ods and reformat to
client_id \t path \t sentence_id \t sentence \t locale

Here
client_id is a unique identifier for the song
path is a path to the song
sentence_id is a unique identifier for the sentence
sentence is the Gaelic text
sentence_domain=up_votes=down_votes=age=gender=accents=variant
locale is set to ga-IE

The client_id can be extracted from the filename "song_x,..wav"
The sentence_id can be extracted from the "..., phrase_xx..wav"

The filenames can be coupled to the filenames via the phrase_number, which is in on the data.ods
'''


'\nWe ingest the data.ods and reformat to\nclient_id \t path \t sentence_id \t sentence \t locale\n\nHere\nclient_id is a unique identifier for the song\npath is a path to the song\nsentence_id is a unique identifier for the sentence\nsentence is the Gaelic text\nsentence_domain=up_votes=down_votes=age=gender=accents=variant\nlocale is set to ga-IE\n\nThe client_id can be extracted from the filename "song_x,..wav"\nThe sentence_id can be extracted from the "..., phrase_xx..wav"\n\nThe filenames can be coupled to the filenames via the phrase_number, which is in on the data.ods\n'

In [40]:
%load_ext autoreload
%autoreload 2
import os
import re
import hashlib
import pandas as pd
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
os.chdir('/media/bramiozo/DATA-FAST/TTS/tts_models/gle/seannos_datasource')

In [42]:
lyrics_original =  pd.read_excel('dataset_lyrics/lyrics.ods')

In [43]:
lyrics_original.dtypes

phrase_number     int64
sentence         object
Title            object
English          object
dtype: object

In [44]:
hashlib.md5(b"bladiebla").hexdigest()

'29940b146d722a311446ca2d68f9739d'

In [45]:
def get_song_str(fn):
    part_1 = (fn.split(',')[0]).strip()
    part_2 = part_1.split("_")[-2:]
    return "_".join(part_2)

In [46]:
filenames = os.listdir('clips_processed')

file_df = pd.DataFrame([{'client_id': hashlib.md5(b""+(f"{get_song_str(s)}").encode("latin1")).hexdigest(),
  'path': s,
  'song': s.split(",")[0].strip().split("_")[-1],
  'phrase_version': s.split("_")[0],
  'phrase_number': int(s.split(',')[1].strip().split("_")[1])
  } for s in filenames])

In [47]:
file_df

Unnamed: 0,client_id,path,song,phrase_version,phrase_number
0,e687cfb58aa0a2abf7dc596503cfcf68,"prep3_song_34,phrase_722,time_113-125.wav",34,prep3,722
1,5641ceb7957ec589f136638189c5fc5d,"prep5_song_7, phrase_134,time_311-321.wav",7,prep5,134
2,a168b2952fd26fad6352725ad5d32c89,"prep1_song_1, phrase_21,time_119-122.wav",1,prep1,21
3,5200d298dcdf65f11f9b738a8b0b68dd,"prep5_song_19,phrase_457,time_00-06.wav",19,prep5,457
4,a05faf2da14346a9eebcb1cef52cdf70,"prep6_song_16,phrase_395,time_22-30.wav",16,prep6,395
...,...,...,...,...,...
6008,717a92c1cd9b05391b5e20b6ddb6e98e,"prep4_song_37,phrase_773,time_203-208.wav",37,prep4,773
6009,5641ceb7957ec589f136638189c5fc5d,"prep2_song_7, phrase_120,time_41-52.wav",7,prep2,120
6010,2bfd305a6c139c602fcf11065c368d67,"prep4_song_9, phrase_184,time_113-119.wav",9,prep4,184
6011,5641ceb7957ec589f136638189c5fc5d,"prep6_song_7, phrase_143,time_435-442.wav",7,prep6,143


In [48]:
final_df = lyrics_original[['phrase_number', 'sentence']].merge(file_df, how='inner', on='phrase_number')

In [49]:
final_df = final_df.assign(locale='ga-IE')
final_df = final_df.assign(sentence_domain='')
final_df = final_df.assign(up_votes='')
final_df = final_df.assign(down_votes='')
final_df = final_df.assign(age='')
final_df = final_df.assign(gender='')
final_df = final_df.assign(accents='')
final_df = final_df.assign(variant='')

In [50]:
gender_map = {
    '1': 'male',
    '2': 'female',
    '3': 'female',
    '4': 'male',
    '5': 'female',
    '6': 'female', 
    '7': 'male',
    '8': 'male',
    '9': 'male',
    '11': 'female',
    '12': 'female',
    '13': 'female',
    '14': 'female',
    '15': 'male',
    '16': 'male',
    '17': 'female',
    '18': 'male',
    '19': 'male',
    '22': 'female',
    '23': 'female',
    '24': 'female',
    '25': 'female',
    '26': 'female',
    '27': 'female',
    '28': 'female',
    '29': 'female',
    '30': 'male',
    '31': 'female',
    '32': 'female', 
    '33': 'female',
    '34': 'female',
    '35': 'female',
    '36': 'female',
    '37': 'female',
    '38': 'male',
    '39': 'female',   
}

client_map = {
    'fb10b1b655a2b34af288a4ff51460080': 'female',
    'eb6e87c69c6bb1914b33b66e22c43b22': 'female',
    'ce29c518eaf3469a00c1b49df4a6fa58': 'female',
    'b9ce569cc8664a79633c2e49b6951772': 'female',
    'b501dd4c046f528a66c085a29a77fa96': 'female',
    'ad7c36b78f0904b97958275bc316fc4a': 'female',
    'a168b2952fd26fad6352725ad5d32c89': 'male',
    'a05faf2da14346a9eebcb1cef52cdf70': 'male',
    '9d6bf453b18c2401cf42e86fd7084048': 'male',
    '9be4875162b0b52ac5853090d9816d9e': 'female',
    '8a3d80b09d305f7e61710569dacd7b8a': 'male',
    '82072466fe7e953258121bfed776c4c8': 'male',
    '8059835b0296b8f2d8f88ee9d5ff2bc1': 'female',
    '7cc5fa83c13a9bba0058047168213fd3': 'female',
    '7396e94dda5897f01e213ea8895f5f34': 'female',
    '6685ae01c52fccb1b73d803bc0ac65f2': 'female',
    '5d179b5ee524d198cb483c6b6ff22e56': 'female',
    '5641ceb7957ec589f136638189c5fc5d': 'male',
    '53deaae6c3c39e7db142a88fb1859ba8': 'female',
    '5200d298dcdf65f11f9b738a8b0b68dd': 'male',
    '42a114cfae45f0e7562ecee67c7c4b9b': 'female',
    '40b30d9f3e6d06f3d9cc6d118b04fda7': 'male',
    '3f4d025bf63ba0bd11873d448217c394': 'female',
    '3c514de0167d3e12cda5f1221269ccfa': 'male',
    '3b1b99b2d042b9228b9b6ed7ba6ef0cf': 'female',
    '2d6587eaf934b300a40091b3df85ce66': 'female',
    '2bfd305a6c139c602fcf11065c368d67': 'male',
    '23f2d50b628d98ecd1458b53ebd1e0c4': 'male',
    '24b00678a1add71b1a69837a0e2478c1': 'female',
    '5599a98ac42fb2d3b2618ca52d4ee48c': 'female',
    'e687cfb58aa0a2abf7dc596503cfcf68': 'female',
    '666ac977145186527ee2ee11e0b8335e': 'female',
    '40291794d294568b8e881d84b63b4d4a': 'female',
    '717a92c1cd9b05391b5e20b6ddb6e98e': 'female',
    '03c0170448334d0b0cfab68ee3690b7c': 'male',
    'e30aaf33d6b93dc7b766641a2a03efc1': 'female'
}


In [51]:
final_df = final_df.assign(gender=final_df.song.map(gender_map))
#final_df = final_df.assign(client_id=final_df.client_id.map(client_map))

In [52]:
final_df = final_df.dropna(subset=['sentence'], axis=0)

In [53]:
final_df = final_df.assign(sentence_id = final_df.sentence.apply(lambda s: hashlib.md5(s.encode("utf-8")).hexdigest() ))

In [63]:
final_df.sentence = final_df.sentence.str.replace('*','', regex=False)
final_df.sentence = final_df.sentence.str.replace('-',', ', regex=False)
final_df.sentence = final_df.sentence.str.replace('`',"'", regex=False)
final_df.sentence = final_df.sentence.str.replace('’',"'", regex=False)
final_df.sentence = final_df.sentence.str.replace('‘',"'", regex=False)

In [64]:
columns = ['client_id', 'path', 'sentence', 'sentence_id', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale']

# from client_id select 2 sentences for validation and remove from final_df
test_df = final_df.groupby('client_id').apply(lambda x: x.sample(n=2, replace=False)).drop('client_id', axis=1).reset_index()
rm_indcs = test_df.level_1
test_df = test_df.drop('level_1', axis=1)

In [65]:
mask = np.ones(len(final_df), dtype=bool)
mask[rm_indcs] = False
train_df = final_df.iloc[mask]

In [66]:
test_df[columns].to_csv('test.tsv', sep='\t', index=False)
train_df[columns].to_csv('train.tsv', sep='\t', index=False)

In [67]:
train_df

Unnamed: 0,phrase_number,sentence,client_id,path,song,phrase_version,locale,sentence_domain,up_votes,down_votes,age,gender,accents,variant,sentence_id
0,1,"'Sé do bheatha, a bhean ba léanmhar,",a168b2952fd26fad6352725ad5d32c89,"prep1_song_1,phrase_1, ime_range0-5.wav",1,prep1,ga-IE,,,,,male,,,6970c4d40220f150755cc0a753f3859d
1,1,"'Sé do bheatha, a bhean ba léanmhar,",a168b2952fd26fad6352725ad5d32c89,"prep5_song_1,phrase_1, ime_range0-5.wav",1,prep5,ga-IE,,,,,male,,,6970c4d40220f150755cc0a753f3859d
2,1,"'Sé do bheatha, a bhean ba léanmhar,",a168b2952fd26fad6352725ad5d32c89,"prep3_song_1,phrase_1, ime_range0-5.wav",1,prep3,ga-IE,,,,,male,,,6970c4d40220f150755cc0a753f3859d
3,1,"'Sé do bheatha, a bhean ba léanmhar,",a168b2952fd26fad6352725ad5d32c89,"prep6_song_1,phrase_1, ime_range0-5.wav",1,prep6,ga-IE,,,,,male,,,6970c4d40220f150755cc0a753f3859d
4,1,"'Sé do bheatha, a bhean ba léanmhar,",a168b2952fd26fad6352725ad5d32c89,"prep7_song_1,phrase_1, ime_range0-5.wav",1,prep7,ga-IE,,,,,male,,,6970c4d40220f150755cc0a753f3859d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5924,859,Agus bheadh sí i gcónaí agam féin.,e30aaf33d6b93dc7b766641a2a03efc1,"prep2_song_39,phrase_859,time_139-140.wav",39,prep2,ga-IE,,,,,female,,,fc4386880d31bbcbc874880bae34dc83
5925,859,Agus bheadh sí i gcónaí agam féin.,e30aaf33d6b93dc7b766641a2a03efc1,"prep4_song_39,phrase_859,time_139-140.wav",39,prep4,ga-IE,,,,,female,,,fc4386880d31bbcbc874880bae34dc83
5926,859,Agus bheadh sí i gcónaí agam féin.,e30aaf33d6b93dc7b766641a2a03efc1,"prep3_song_39,phrase_859,time_139-140.wav",39,prep3,ga-IE,,,,,female,,,fc4386880d31bbcbc874880bae34dc83
5927,859,Agus bheadh sí i gcónaí agam féin.,e30aaf33d6b93dc7b766641a2a03efc1,"prep6_song_39,phrase_859,time_139-140.wav",39,prep6,ga-IE,,,,,female,,,fc4386880d31bbcbc874880bae34dc83


In [68]:
import wave
def get_wav_duration(filepath):
    with wave.open(filepath, 'rb') as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
        duration = frames / float(rate)
        return int(duration * 1000)  

In [69]:
duration_df = []
for f in filenames:
    duration = get_wav_duration(os.path.join('clips_processed', f))
    duration_df.append({'clip': f,  'duration[ms': duration})
    
pd.DataFrame(duration_df).to_csv('clip_durations.tsv', sep='\t', index=False)

