In [1]:
# Connect to the PostgreSQL db using environment variables read from vars.env

import os
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# read environment variables from vars.env
from dotenv import load_dotenv
load_dotenv("../vars.env")

# connect to postgresql db on localhost, port 5432, using user and password from vars.env

import psycopg2
import os

# Database credentials, read from the process environment (populated from
# vars.env by the load_dotenv call above). os.getenv returns None for any
# variable that is not set.
db_host = os.getenv("POSTGRES_HOST")
db_name = os.getenv("POSTGRES_DB")
db_user = os.getenv("POSTGRES_USER")
db_password = os.getenv("POSTGRES_PWD")

# Imported once here instead of on every loop iteration.
import shutil
from tqdm import tqdm

# Amount of audio to export per dataset: 10 hours, expressed in seconds.
target_seconds = 10 * 60 * 60

# Root of the export tree; each dataset gets <share_folder>/<dataset>/wavs.
share_folder = "/data/tts-qa/share_10h"

# Selects the samples of every dataset whose name contains the given substring.
# The dataset name is bound as a query parameter instead of being formatted
# into the SQL text, so a name containing a quote cannot break the statement.
# Once parameters are passed, psycopg2 treats % as the placeholder marker,
# which is why the literal LIKE wildcards are written as %%.
sql_script = """
    SELECT sample.id, sample.filename, sample.local_trimmed_path, sample.original_text as text, sample.wer, sample.trimmed_audio_duration as duration
    FROM sample
    JOIN dataset ON sample.dataset_id = dataset.id
    WHERE dataset.name LIKE '%%' || %(dataset)s || '%%';
    """

for dataset in ["English"]:
    print(f"Processing {dataset} dataset")

    # Open a connection for this dataset and always close it again, even when
    # the query raises (the connection used to be left open on every pass).
    conn = psycopg2.connect(
        host=db_host,
        database=db_name,
        user=db_user,
        password=db_password
    )
    try:
        # Load the query result into a dataframe with the selected column names
        df = pd.read_sql_query(sql_script, conn, params={"dataset": dataset})
    finally:
        conn.close()

    # Best transcriptions (lowest WER) first
    df = df.sort_values(by=['wer'], ascending=True)

    # Keep the leading rows whose running total of duration stays below the
    # target: searchsorted returns the first position at which the cumulative
    # sum reaches target_seconds, and that row is excluded by the slice.
    # NOTE(review): assumes durations are non-negative and not NaN, so that
    # the cumulative sum is sorted -- confirm against the data.
    inx_10h = df['duration'].cumsum().searchsorted(target_seconds)
    df_10h = df.iloc[:inx_10h]

    # <share_folder>/<dataset>/wavs; makedirs also creates the two parent
    # folders and is a no-op when everything already exists.
    lang_folder = os.path.join(share_folder, dataset)
    wav_folder = os.path.join(lang_folder, "wavs")
    os.makedirs(wav_folder, exist_ok=True)

    # Copy the selected audio files into the wavs folder
    for src_path in tqdm(df_10h['local_trimmed_path'], total=df_10h.shape[0]):
        shutil.copy(src_path, wav_folder)

    # Metadata csv, named after the dataset: drop the columns that are only
    # meaningful locally (path, wer, id) and order the rows by filename.
    df_10h_clean = (
        df_10h
        .drop(['local_trimmed_path', 'wer', 'id'], axis=1)
        .sort_values(by=['filename'], ascending=True)
    )
    df_10h_clean.to_csv(os.path.join(lang_folder, dataset + ".csv"), index=False)

Processing English dataset


  df = pd.read_sql_query(sql_script, conn)
100%|██████████| 13829/13829 [01:08<00:00, 200.68it/s]


In [None]:
# Write a bash script to /data/tts-qa/zip.sh that zips every folder under
# /data/tts-qa/share_10h (one archive per folder) and then uploads the
# archives to s3://user-ahmet/translated-Spanish-Italian-French-10h/

zip_script_lines = [
    "#!/bin/bash\n",
    # Stop right away if the share folder is missing; otherwise the loop and
    # the upload below would run in whatever directory the script started in.
    "cd /data/tts-qa/share_10h || exit 1\n",
    "for d in */ ; do\n",
    # log which folder is being processed
    "    echo \"zipping $d\"\n",
    # folder name without the trailing slash produced by the */ glob
    "    folder=${d%/}\n",
    # expansions are quoted so folder names containing spaces or glob
    # characters are passed to zip as a single argument
    "    zip -r \"$folder.zip\" \"$d\"\n",
    "done\n",
    # upload only the zip files found in the share folder
    "aws s3 cp --recursive  --exclude \"*\" --include \"*.zip\" ./ s3://user-ahmet/translated-Spanish-Italian-French-10h/\n",
]

with open(os.path.join('/data/tts-qa', "zip.sh"), "w") as f:
    f.writelines(zip_script_lines)


In [2]:
import pandas as pd

# Read back the metadata csv written for the English export and preview
# its first rows.
english_csv_path = "/data/tts-qa/share_10h/English/English.csv"
df_en = pd.read_csv(english_csv_path)
df_en.head()

Unnamed: 0,filename,text,duration
0,EN00000212.wav,I wish her to be brought to me at once.,2.24
1,EN00000521.wav,"Well, I can go back to work now.",2.2
2,EN00000660.wav,"You look as if you've just seen a ghost, old man.",2.84
3,EN00000732.wav,Not too soon for me to see the stars of home.,2.44
4,EN00002023.wav,"We party, but the world will continue without us.",3.12


In [3]:
# Total duration of the exported English audio, converted to hours by
# dividing the summed `duration` column by 60 twice.
total_duration = df_en["duration"].sum()
total_duration / 60 / 60

9.999777777777776