In [None]:
from glob import glob
import os
import pandas as pd
import numpy as np
# connect to the PostgreSQL db using environment variables read from vars.env

import os
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# read environment variables from vars.env
from dotenv import load_dotenv
load_dotenv("../vars.env")

# connect to the PostgreSQL db using the host, database, user and password from vars.env
# (no port is passed to the driver, so its default port is used)

import psycopg2
import os
import random 

# Seed both the numpy and the stdlib RNG so random selections are repeatable
seed = 42
np.random.seed(seed)
random.seed(seed)

# Database credentials are read from the process environment
db_host, db_name, db_user, db_password = (
    os.getenv(env_key)
    for env_key in ("POSTGRES_HOST", "POSTGRES_DB", "POSTGRES_USER", "POSTGRES_PWD")
)

# Open the database connection used by the cells below
conn = psycopg2.connect(host=db_host, database=db_name, user=db_user, password=db_password)



data_dir = "/data/tts-qa/tts-data"

# get all folders in data_dir; sorted because glob returns entries in
# filesystem-dependent order, which would make the seeded sampling below
# differ from machine to machine
folders = sorted(glob(os.path.join(data_dir, "*")))

languages = ["English", "French", "Italian", "Spanish"]

# for each folder whose name includes one of the languages, collect the files
# of its "raw" subfolder into that language's list
language_samples_dict = {language: [] for language in languages}

for folder in folders:
    # match against the folder name only, so a language name appearing in a
    # parent directory of data_dir can never cause a false match
    folder_name = os.path.basename(folder)
    for language in languages:
        if language in folder_name:
            language_samples_dict[language].extend(sorted(glob(os.path.join(folder, "raw", "*"))))


In [None]:
# randomly select up to 25 samples from each language (without replacement)
n_samples_per_language = 25
for language in languages:
    candidates = language_samples_dict[language]
    # np.random.choice with replace=False raises ValueError when asked for more
    # items than are available, so cap the request at the number of candidates
    n_selected = min(n_samples_per_language, len(candidates))
    if n_selected < n_samples_per_language:
        print(f"{language}: only {len(candidates)} samples available, selecting all of them")
    language_samples_dict[language] = np.random.choice(candidates, n_selected, replace=False)

In [None]:
# metadata columns kept for every exported sample
cols = [
    'filename',
    'original_text',
    'asr_text',
    'duration',
    'trimmed_audio_duration',
    'sentence_type',
    'sentence_length',
    'sampling_rate',
    'sample_format',
    'isPCM',
    'n_channel',
    'format',
    'peak_volume_db',
    'size',
    'isValid',
    'trim_start',
    'trim_end',
    'longest_pause',
    'wer',
]

# output root for the subset; created if it does not exist yet
new_dir = os.path.join("data/tts-qa/", "subset")
os.makedirs(new_dir, exist_ok=True)

import shutil

# for each language, create a new folder, save the metadata of the selected
# samples that pass the quality filter, and copy those samples into it
for language in languages:
    new_language_dir = os.path.join(new_dir, language)
    new_raw_dir = os.path.join(new_language_dir, "raw")
    new_trimmed_dir = os.path.join(new_language_dir, "trimmed")
    # create raw and trimmed dirs (this also creates new_language_dir itself)
    os.makedirs(new_raw_dir, exist_ok=True)
    os.makedirs(new_trimmed_dir, exist_ok=True)

    # get the information of the selected samples from the db;
    # str() hands the driver plain Python strings rather than numpy.str_ values
    sample_info = [str(os.path.basename(sample)) for sample in language_samples_dict[language]]
    # the filenames are passed as a bound parameter instead of being formatted
    # into the SQL text: a tuple repr breaks for zero or one filename and for
    # filenames that contain quotes
    sql_script = """
    SELECT * FROM sample WHERE filename = ANY(%(filenames)s)
    """

    df = pd.read_sql(sql_script, conn, params={"filenames": sample_info})
    df = df[cols]
    # keep only valid samples with wer <= 0.2
    # (i.e. drop a sample if wer > 0.2 or isValid is not True)
    df = df[(df["wer"] <= 0.2) & (df["isValid"] == True)]

    # save the metadata of the kept samples in a csv file
    df.to_csv(os.path.join(new_language_dir, "metadata.csv"), index=False)

    kept_filenames = set(df["filename"])
    for sample in language_samples_dict[language]:
        sample = str(sample)
        sample_name = os.path.basename(sample)
        # skip samples that were filtered out or are not in the db
        if sample_name not in kept_filenames:
            continue
        shutil.copy(sample, new_raw_dir)
        # the trimmed file is looked up in the "trimmed" folder next to the
        # sample's "raw" folder; built from path components so a "raw"
        # substring elsewhere in the path or filename is left untouched
        trimmed_sample = os.path.join(os.path.dirname(os.path.dirname(sample)), "trimmed", sample_name)
        if os.path.exists(trimmed_sample):
            shutil.copy(trimmed_sample, new_trimmed_dir)
        else:
            print(f"trimmed file not found, skipping: {trimmed_sample}")
    
    
    



In [None]:
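# zip each language folder and upload it to AWS S3
# (NOTE: the upload command below targets s3://user-ahmet/translated-subset-share/ with `aws s3 cp`, not arn:aws:s3:::ahmet-tts-qa)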
import zipfile
from zipfile import ZipFile
import os


def zipdir(path, ziph):
    """Add every file found under ``path`` to the open zip handle ``ziph``.

    Archive names are computed relative to the parent of ``path``, so the
    name of the zipped folder itself is kept as the top-level entry.
    Directories that contain no files are not written to the archive.
    """
    parent_dir = os.path.join(path, '..')
    for current_dir, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(current_dir, filename)
            ziph.write(file_path, os.path.relpath(file_path, parent_dir))
        
    

import subprocess

# zip each language folder, upload the archive to S3, then delete the local archive
for language in languages:
    archive_name = f'{language}.zip'
    # the context manager closes the archive even if zipdir raises
    with ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipdir(f'{new_dir}/{language}', zipf)
    try:
        # argument list instead of a shell string, so the archive name is
        # passed to the aws CLI verbatim
        upload = subprocess.run(
            ["aws", "s3", "cp", archive_name, f"s3://user-ahmet/translated-subset-share/{language}.zip"]
        )
        if upload.returncode != 0:
            print(f"upload of {archive_name} failed with exit code {upload.returncode}")
    except OSError as error:
        # raised when the aws executable cannot be started (e.g. not installed)
        print(f"could not run the aws CLI for {archive_name}: {error}")
    finally:
        # the local archive is removed whether or not the upload succeeded;
        # the subset folder under new_dir is kept
        os.remove(archive_name)
