In [None]:
# connect to the postgresql db using environment variables read from vars.env

import warnings
warnings.filterwarnings("ignore")

import os
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# read environment variables from vars.env
from dotenv import load_dotenv
load_dotenv("../vars.env")

# connect to the postgresql db; host, database name, user and password are read from vars.env

import psycopg2
import os

# Database credentials, looked up in the process environment
# (populated from vars.env by load_dotenv). A missing variable yields None.
env = os.environ
db_host = env.get("POSTGRES_HOST")
db_name = env.get("POSTGRES_DB")
db_user = env.get("POSTGRES_USER")
db_password = env.get("POSTGRES_PWD")



In [None]:
# remove the folders under /data/tts-qa/ whose names start with "share_"

# !rm -rf /data/tts-qa/share_* # be careful with this command, it will remove all folders under /data/tts-qa/share_* 

In [None]:
# Amount of audio to share, in hours (multiplied by 60*60 when the cutoff is computed).
total_hours = 20
# Substring matched against dataset.name in the SQL query; also used to pick the language folder.
dataset_str = 'German(Dorothee)'

In [None]:

# Export the `total_hours` of audio with the lowest WER for the selected dataset:
# query the samples, copy their audio files into a share folder and write a CSV index.
import shutil
from tqdm import tqdm

# SQL query selecting the samples of the requested dataset. `text` is the reviewed
# annotation text when one exists, otherwise the original text; `wer` is NULL for
# samples that have a reviewed text. The dataset name pattern is passed as a bound
# parameter rather than formatted into the SQL, so quotes in dataset_str cannot
# break (or alter) the query.
sql_script = """
SELECT
    sample.id,
    sample.filename,
    sample.local_trimmed_path,
    sample.local_path,
    COALESCE(annotation.final_text, sample.original_text) AS text,
    CASE WHEN annotation.final_text IS NULL THEN sample.wer END AS wer,
    sample.duration AS duration
FROM sample
LEFT JOIN annotation ON sample.id = annotation.sample_id AND annotation.status = 'Reviewed'
JOIN dataset ON sample.dataset_id = dataset.id
WHERE dataset.name LIKE %(dataset_pattern)s and sample.asr_text is not null and sample.trimmed_audio_duration > 0;
"""

# Connect to the database and make sure the connection is released even if the query fails
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password
)
try:
    # Execute the SQL script into a pandas dataframe with column names
    df = pd.read_sql_query(
        sql_script,
        conn,
        params={"dataset_pattern": f"%{dataset_str}%"},
    )
finally:
    conn.close()

# Reviewed samples have no WER (NULL in the query): treat them as WER 0 so they
# are among the first rows after the ascending sort.
df['wer'] = df['wer'].fillna(0)
# Lowest WER first, then discard rows that still have a missing value in any column
df = df.sort_values(by=['wer'], ascending=True).dropna()

# Keep the lowest-WER samples whose cumulative duration stays below `total_hours`:
# find the first position where the running duration total reaches the target.
cutoff_idx = df['duration'].cumsum().searchsorted(total_hours*60*60)
# Explicit copy so that later cells can modify df_share without touching df
df_share = df.iloc[:cutoff_idx].copy()

# Resolve the language folder name from the dataset string (first match wins,
# same precedence as before). Fail early with a clear message instead of a
# NameError further down when no known language is present.
supported_languages = ("English", "Spanish", "German", "French", "Italian")
dataset = next((lang for lang in supported_languages if lang in dataset_str), None)
if dataset is None:
    raise ValueError(
        f"Cannot infer a language from dataset_str={dataset_str!r}; "
        f"expected it to contain one of {supported_languages}"
    )

# Folder layout: <share_folder>/<language>/wavs
share_folder = f"/data/tts-qa/share_{total_hours}h"
lang_folder = os.path.join(share_folder, dataset)
wav_folder = os.path.join(lang_folder, "wavs")
# Creates the share and language folders as well; no error if they already exist
os.makedirs(wav_folder, exist_ok=True)

# Copy the audio files (the ones at local_path) into the "wavs" folder
for audio_path in tqdm(df_share['local_path'], total=df_share.shape[0]):
    shutil.copy(audio_path, wav_folder)

# Drop the local_path, wer and id columns and sort by filename for the CSV index.
# NOTE(review): an earlier comment said local_trimmed_path was dropped, but the code
# has always dropped local_path (local_trimmed_path stays in the CSV); behaviour is
# kept as-is -- confirm which column is meant to be removed.
df_share_clean = (
    df_share
    .drop(['local_path', 'wer', 'id'], axis=1)
    .sort_values(by=['filename'], ascending=True)
)
# The CSV file is named after the language folder
df_share_clean.to_csv(os.path.join(lang_folder, dataset + ".csv"), index=False)

In [None]:
import pandas as pd

# Read the exported CSV back from disk and preview its first rows.
exported_csv_path = f"/data/tts-qa/share_{total_hours}h/{dataset}/{dataset}.csv"
df = pd.read_csv(exported_csv_path)
df.head()

In [None]:
# total duration of the exported subset, converted from seconds to hours
df_share_clean["duration"].sum() / 60 / 60

In [None]:
# Sort the shared subset by WER, lowest first. The result is reassigned instead of
# sorting in place: df_share originates from a slice of df, and in-place mutation
# of such a frame raises SettingWithCopyWarning (hidden here by the warnings filter).
df_share = df_share.sort_values(by=['wer'], ascending=True)

In [None]:
# display the subset of samples selected for sharing
df_share

In [None]:
# Histogram of WER over the shared subset (samples with a reviewed text count as WER 0).
# Title and axis labels are set so the figure can be read on its own.
ax = df_share["wer"].plot.hist(bins=100)
ax.set(
    title=f"WER distribution of shared samples ({dataset_str})",
    xlabel="WER",
    ylabel="Number of samples",
);

## Create Bash Script for Sharing Data

In [None]:
# Write a bash script (/data/tts-qa/zip.sh) that goes to the share folder, zips every
# sub-folder in a for loop and uploads the resulting zip files to
# s3://user-ahmet/<dataset>-<total_hours>h/
zip_script_lines = [
    "#!/bin/bash",
    # stop at the first failing command (e.g. a failed cd or zip) instead of uploading anyway
    "set -euo pipefail",
    # an empty share folder yields zero loop iterations rather than a literal "*/"
    "shopt -s nullglob",
    f"cd /data/tts-qa/share_{total_hours}h",
    "for d in */ ; do",
    # logging
    "    echo \"zipping $d\"",
    # folder name without the trailing slash
    "    folder=${d%/}",
    # quoted so that folder names containing spaces are handled correctly
    "    zip -r \"$folder.zip\" \"$d\"",
    "done",
    # upload only the zip files
    f"aws s3 cp --recursive  --exclude \"*\" --include \"*.zip\" ./ s3://user-ahmet/{dataset}-{total_hours}h/",
    "echo \"done\"",
]

with open(os.path.join('/data/tts-qa', "zip.sh"), "w") as f:
    f.write("\n".join(zip_script_lines) + "\n")


In [None]:
# listen to the audio of one shared sample (file at local_trimmed_path)
import IPython.display as ipd


index = -3  # position within df_share; negative values count from the end
sample_row = df_share.iloc[index]
print(sample_row["filename"])
print(sample_row["text"])
ipd.Audio(sample_row["local_trimmed_path"])


In [None]:
# play the file at local_path of the same sample, to compare with the local_trimmed_path playback
ipd.Audio(df_share["local_path"].iloc[index])

In [None]:
# last 10 rows of the shared subset
df_share.tail(n=10)