In [None]:
import os
import sys
sys.path.append('../')

import psycopg2
import wave
from pydub import AudioSegment
from tqdm import tqdm
from dotenv import load_dotenv

# CustomVAD combines the pyannote and silero VAD models; its process_file() result carries the 'custom_segment' (start, end) trim times used below
from src.utils.audio import CustomVAD, trim_audio
# Instantiate the VAD wrapper with the pyannote and silero model identifiers.
my_custom_vad = CustomVAD(
    pyannote_model_path="pyannote/segmentation",
    silero_model_path="snakers4/silero-vad",
)

# Read the Postgres settings from the project's env file.
load_dotenv("../vars.env")
db_host, db_name, db_user, db_password = (
    os.getenv(env_key)
    for env_key in ("POSTGRES_HOST", "POSTGRES_DB", "POSTGRES_USER", "POSTGRES_PWD")
)

# Open the connection and cursor that the following cells share.
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password,
)
cur = conn.cursor()

# Fetch every (id, name) row from the dataset table.
cur.execute("SELECT id, name FROM dataset;")
datasets = cur.fetchall()



In [None]:
# Inspect the (id, name) rows fetched from the dataset table
datasets

In [None]:
# For each dataset folder directly under the TTS data root, make sure a
# `trimmed2` sub-folder exists to receive the custom-VAD-trimmed audio.
TTS_DATA_ROOT = '/data/tts-qa/tts-data/'
folders = os.listdir(TTS_DATA_ROOT)

for folder in folders:
    dataset_dir = os.path.join(TTS_DATA_ROOT, folder)
    # Plain files sitting next to the dataset folders are ignored.
    if os.path.isdir(dataset_dir):
        # exist_ok=True replaces the separate exists() check: no
        # check-then-create race, and re-running the cell is a no-op.
        os.makedirs(os.path.join(dataset_dir, 'trimmed2'), exist_ok=True)


In [None]:
from src.utils.db_utils import evaluate_audio, convert_to_88k, convert_to_mono, normalize_audio, convert_to_s16le

# Only this dataset is processed by this cell; every other dataset is skipped.
TARGET_DATASET_NAME = 'German(Dorothee)'


def _to_trimmed2_path(trimmed_path):
    """Map '<dataset>/trimmed/<file>' to '<dataset>/trimmed2/<file>'.

    Only the directory component is swapped, so a dataset or file name that
    happens to contain the word 'trimmed' is left untouched. Paths that do not
    follow the '<...>/trimmed/<file>' layout fall back to the plain substring
    replacement this notebook used previously.
    """
    parent_dir, file_name = os.path.split(trimmed_path)
    dataset_dir, leaf_dir = os.path.split(parent_dir)
    if leaf_dir == 'trimmed':
        return os.path.join(dataset_dir, 'trimmed2', file_name)
    return trimmed_path.replace('trimmed', 'trimmed2')


# Process each dataset
for dataset_id, dataset_name in datasets:
    if dataset_name != TARGET_DATASET_NAME:
        continue
    print(f"Processing dataset: {dataset_name}")

    # Retrieve the samples selected for delivery that have not been
    # custom-trimmed yet (local_custom_trimmed_path IS NULL), which makes the
    # cell resumable after an interruption.
    cur.execute("""
        SELECT id, filename, local_path, local_trimmed_path
        FROM sample
        WHERE is_selected_for_delivery = TRUE AND
                dataset_id = %s AND  
                local_custom_trimmed_path IS NULL;
    """, (dataset_id,))
    samples = cur.fetchall()

    # NOTE: the loop variable `id` shadows the built-in, but the name is kept
    # because a later notebook cell inspects it after this loop.
    for id, filename, local_path, local_trimmed_path in tqdm(samples):
        if not local_trimmed_path:
            # Without a trimmed path there is nothing to derive the output from.
            print(f"Skipping sample {id}: local_trimmed_path is empty")
            continue

        # e.g. '/data/tts-qa/tts-data/French(Dorsaf) Deliverable 7/trimmed/FR00054280.wav'
        #   -> '/data/tts-qa/tts-data/French(Dorsaf) Deliverable 7/trimmed2/FR00054280.wav'
        trimmed2_path = _to_trimmed2_path(local_trimmed_path)

        # Run custom VAD on the untrimmed recording to get the new trim times
        response = my_custom_vad.process_file(local_path)
        trim_start, trim_end = response['custom_segment']

        # Keep two decimals, matching what is stored in the database
        trim_start = round(trim_start, 2)
        trim_end = round(trim_end, 2)

        # Write the trimmed copy, then inspect that copy's format
        trim_audio(local_path, trim_start, trim_end, trimmed2_path)
        meta = evaluate_audio(trimmed2_path)

        # Fix format problems on the trimmed output itself. The checks above
        # describe trimmed2_path, so the conversions target that file and the
        # source recording at local_path is left untouched.
        # NOTE(review): assumes the converters take (input_path, output_path)
        # and support in-place use, as the previous calls implied -- confirm.
        if not meta["is_88khz"]:
            convert_to_88k(trimmed2_path, trimmed2_path)

        if not meta["is_mono"]:
            convert_to_mono(trimmed2_path, trimmed2_path)

        # Peak level is expected to sit within [-6, -3] dB; otherwise normalize
        if meta["peak_volume_db"] < -6 or meta["peak_volume_db"] > -3:
            normalize_audio(trimmed2_path, trimmed2_path)

        if not meta["isPCM"]:
            convert_to_s16le(trimmed2_path, trimmed2_path)

        try:
            # Record the new trim times and output path; commit per sample so
            # finished work survives a later failure.
            cur.execute("""
                UPDATE sample
                SET trim_custom_start = %s, trim_custom_end = %s, local_custom_trimmed_path = %s
                WHERE id = %s;
            """, (trim_start, trim_end, trimmed2_path, id))
            conn.commit()
        except Exception as e:
            print(f"Error updating database: {e}")
            # Roll back first so the connection is usable for the next sample
            # even if the file clean-up below fails.
            conn.rollback()
            if os.path.exists(trimmed2_path):
                os.remove(trimmed2_path)



In [None]:
# Debug check: shows the `id` left over from the last iteration of the sample loop
# (this is the built-in `id` function if that loop never bound the name)
id

In [None]:
# Release the cursor first, then the connection it was created from
for db_handle in (cur, conn):
    db_handle.close()


In [None]:
import random
import zipfile
import pandas as pd


# Define the path to the zip file
zip_file_path = "random_samples.zip"

# Number of clips drawn per dataset; datasets with fewer usable clips are skipped.
SAMPLES_PER_DATASET = 100

# Use a dedicated connection for the export: the shared `conn` is closed by an
# earlier cell, so it cannot be used here when the notebook is run top to bottom.
export_conn = psycopg2.connect(host=db_host, database=db_name, user=db_user, password=db_password)
try:
    # Open the zip file in write mode
    with zipfile.ZipFile(zip_file_path, mode="w") as zip_file:
        # Loop over each dataset
        for dataset_id, dataset_name in datasets:
            print(f"Processing dataset: {dataset_name}")

            # Retrieve all custom-trimmed samples of the current dataset.
            # dataset_id is passed as a bound parameter, not formatted into the SQL.
            samples = pd.read_sql(
                """
                SELECT id, filename, local_custom_trimmed_path
                FROM sample
                WHERE dataset_id = %s AND local_custom_trimmed_path IS NOT NULL;
                """,
                export_conn,
                params=(dataset_id,),
            )

            # Cheap early exit before touching the filesystem
            if len(samples) < SAMPLES_PER_DATASET:
                continue

            # Drop rows whose trimmed file is missing on disk
            on_disk = samples['local_custom_trimmed_path'].map(os.path.exists).astype(bool)
            samples = samples[on_disk]

            # Re-check after filtering: sample(n=...) raises ValueError when
            # fewer than n rows remain.
            if len(samples) < SAMPLES_PER_DATASET:
                continue

            # Fixed seed keeps the selection reproducible between runs
            random_samples = samples.sample(n=SAMPLES_PER_DATASET, random_state=42)

            # Add each selected file to the archive under '<dataset name>/<filename>'
            for row in tqdm(random_samples.itertuples(index=False), total=len(random_samples)):
                zip_file.write(row.local_custom_trimmed_path, arcname=f"{dataset_name}/{row.filename}")
finally:
    export_conn.close()
