In [None]:
import json
import os
import pickle
import re
from glob import glob

import editdistance
import numpy as np
import pandas as pd
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from pydub import AudioSegment
from tqdm import tqdm
from whisper_model import WhisperASR

import os
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
# read environment variables from vars.env
from dotenv import load_dotenv
load_dotenv("../vars.env")

# connect to the postgresql db (default port 5432) using the host, database, user and password from vars.env

import psycopg2
import os

# Database credentials, read from the process environment (vars.env is loaded above).
# Any variable that is not set comes back as None.
db_host, db_name, db_user, db_password = (
    os.getenv(env_key)
    for env_key in ("POSTGRES_HOST", "POSTGRES_DB", "POSTGRES_USER", "POSTGRES_PWD")
)



def edit_distance(s1, s2):
    """Return the distance between ``s1`` and ``s2`` as computed by ``editdistance.eval``.

    Thin wrapper kept so the rest of the notebook does not call the
    third-party module directly.
    """
    distance = editdistance.eval(s1, s2)
    return distance


# Open the PostgreSQL connection that every later cell reuses.
connection_settings = {
    "host": db_host,
    "database": db_name,
    "user": db_user,
    "password": db_password,
}
conn = psycopg2.connect(**connection_settings)

In [None]:
dataset = "Spanish"

# The dataset name is sent as a bound parameter (psycopg2 "%s" placeholder) rather
# than being spliced into the SQL text, so a quote character in the name can no
# longer break or alter the statement.
sql_script = """
SELECT dataset.name, sample.id, sample.filename, sample.local_trimmed_path, sample.original_text, sample.asr_text, sample.wer, sample.trimmed_audio_duration as duration
FROM sample
JOIN dataset ON sample.dataset_id = dataset.id
WHERE dataset.name LIKE %s;
"""

# Substring match on the dataset name; the result columns keep the names used in
# the SELECT list (trimmed_audio_duration is exposed as "duration").
df = pd.read_sql_query(sql_script, conn, params=(f"%{dataset}%",))

In [None]:
# Split the samples by dataset name: one dataframe per name, keyed by that name.
df_dict = dict(list(df.groupby("name")))

In [None]:
# A transcript is considered to belong to a sentence only when both the normalised
# edit distance and the relative length difference fall below these limits.
MAX_NORMALISED_EDIT_DISTANCE = 0.25
MAX_RELATIVE_LENGTH_DIFFERENCE = 0.15

# For every dataset, compare each sample's ASR transcript with every reference
# sentence of that dataset and collect the samples whose transcript is closest to
# the sentence of a *different* sample.
df_matched_list = []
for df_name, df_sentences in df_dict.items():
    print(f"Processing {df_name}")
    df_sentences = df_sentences.reset_index(drop=True)

    sentences = {}         # row number -> reference sentence
    inverseSentences = {}  # reference sentence -> row number
    segments = {}          # row number -> full sample row

    print(f"There are {len(df_sentences)} sentences in this range")
    for index, row in df_sentences.iterrows():
        sentenceNum = int(index)
        sentence = row["original_text"]
        sentences[sentenceNum] = sentence

        segments[sentenceNum] = row
        if sentence not in inverseSentences:
            inverseSentences[sentence] = sentenceNum
        else:
            # Repeated sentence text: register it under a padded key so the first
            # occurrence keeps the plain text as its key.
            tmp = sentence
            while tmp in inverseSentences:
                tmp += " _"
            inverseSentences[tmp] = sentenceNum

    segments_list = list(segments.values())
    sentences_list = list(sentences.values())
    distances_matrix = np.ones((len(segments_list), len(sentences))) * 1000

    # Edit distance of every transcript against every sentence, normalised by the
    # shorter of the two strings.
    for ik in tqdm(range(len(segments_list))):
        asr = segments_list[ik]["asr_text"]
        for jk in range(len(sentences_list)):
            try:
                distances_matrix[ik, jk] = edit_distance(asr, sentences_list[jk]) / min(len(asr), len(sentences_list[jk]))
            except Exception:
                # Missing or empty text cannot be compared; mark the pair as unmatched.
                # `Exception` (not a bare except) keeps Ctrl-C able to stop this long loop.
                distances_matrix[ik, jk] = np.inf

    # get the best match for each segment
    best_matches = np.argmin(distances_matrix, axis=1)
    best_matched_sentences = [sentences_list[k] for k in best_matches]

    # One result record per sample describing its closest sentence.
    rows = []
    for ik in tqdm(range(len(segments_list))):
        asr = segments_list[ik]["asr_text"]
        sentence = best_matched_sentences[ik]
        ed_dist = distances_matrix[ik, best_matches[ik]]
        try:
            len_dif = abs(len(asr) - len(sentence)) / min(len(asr), len(sentence))
        except Exception:
            len_dif = np.inf
        sentenceNumber = inverseSentences[sentence]
        if ed_dist < MAX_NORMALISED_EDIT_DISTANCE and len_dif < MAX_RELATIVE_LENGTH_DIFFERENCE:
            status = "assigned"
        else:
            status = "not_assigned"

        row = {
            "status": status,
            "originalNumber": ik,
            "original_id": segments_list[ik]["id"],
            "assigned_id": segments[sentenceNumber]["id"],
            "original_sentence": sentences_list[ik],
            "assigned_sentence": sentence,
            "ed_dist": ed_dist,
            "len_dif": len_dif,
        }

        # Carry all original sample columns along with the match information.
        row.update(segments_list[ik])
        rows.append(row)

    # Drop samples whose best distance is still infinite (nothing comparable).
    df_matched_ = pd.DataFrame(rows)
    df_matched_ = df_matched_[df_matched_["ed_dist"] != np.inf]

    # Keep confident matches that point at a different sample than the one they came from.
    diff = df_matched_[df_matched_.original_id != df_matched_.assigned_id]
    diff = diff[diff.status == "assigned"]
    # When several samples claim the same target, keep the one with the lowest edit distance.
    diff = diff.sort_values("ed_dist").drop_duplicates("assigned_id", keep="first")
    df_matched_list.append(diff)


In [None]:
    
# Stack the per-dataset candidate frames into a single table.
all_candidates = pd.concat(df_matched_list)

# Keep only assigned_ids that occur more than once in the stacked table, then
# retain the lowest-ed_dist row for each of them.
# NOTE(review): every frame was already de-duplicated on assigned_id before it was
# appended, so this filter only lets through ids that repeat across datasets;
# confirm that this is the intended selection.
repeated_target_mask = all_candidates.duplicated("assigned_id", keep=False)
df_matched = (
    all_candidates[repeated_target_mask]
    .sort_values(["assigned_id", "ed_dist"])
    .drop_duplicates("assigned_id", keep="first")
)

df_matched.to_csv("matched.csv", index=False)


In [None]:
# `diff` is the variable left behind by the last pass of the matching loop, so this
# file holds only the final dataset's candidates, not the combined table.
diff.to_csv("diff.csv", index=False)

## Assign the closest match to each row in the dataframe

In [None]:
# create a temporary folder to store the audio files
import sys
sys.path.append("../")
import tempfile
import shutil 
from src.utils.utils import calculate_wer
import traceback

from src.paths import paths
from src.logger import root_logger
import boto3

app_logger = root_logger.getChild("rematch")

BASE_DIR = paths.PROJECT_ROOT_DIR

# Load vars.env from the project root; the notebook exits when that fails.
env_file = os.path.join(BASE_DIR, "vars.env")
if not load_dotenv(env_file):
    app_logger.error("Failed to load env vars from vars.env")
    exit(1)
else:
    app_logger.info("Loaded env vars from vars.env")



# Scratch directory for the re-labelled audio files. It is deleted when `temp_dir`
# is cleaned up, so this reference has to stay alive while the files are needed.
temp_dir = tempfile.TemporaryDirectory()

# Raw and trimmed audio go into separate sub-folders. exist_ok=True replaces the
# separate "does it exist?" check, which was redundant and open to a race.
for subfolder in ("raw", "trimmed"):
    os.makedirs(os.path.join(temp_dir.name, subfolder), exist_ok=True)



In [None]:
# Replaces the in-memory df_matched with the contents of a CSV on disk.
# NOTE(review): the file read here is "matched-Italian.csv", while the cells above
# query the "Spanish" datasets and write "matched.csv" -- confirm the file name
# before running the destructive cells further down.
df_matched = pd.read_csv("matched-Italian.csv")

In [None]:
# Keep only the first row for each assigned_id.
df_matched = df_matched.drop_duplicates("assigned_id")

In [None]:
# Display the match table for a visual check before any stored data is modified.
df_matched

In [None]:

def update_sample(assigned_sample, original_sample):
    """Build the record that replaces a mismatched pair of samples.

    The audio files referenced by ``original_sample`` are copied into the
    ``raw`` and ``trimmed`` folders of the notebook's temporary directory,
    stored under the file name of ``assigned_sample``. The returned dict starts
    as a shallow copy of ``original_sample`` and then takes its file name,
    text, sentence metadata and ``duration`` from ``assigned_sample``; ``wer``
    is recomputed from the new text against the copied ``asr_text``.

    Args:
        assigned_sample: dict of the sample whose text and file name are adopted.
        original_sample: dict of the sample whose audio and ASR text are kept.

    Returns:
        A new dict; neither argument is modified.
    """
    relabelled = original_sample.copy()

    # The audio itself comes from the original sample ...
    source_raw = original_sample["local_path"]
    source_trimmed = original_sample["local_trimmed_path"]

    # ... but is stored under the assigned sample's file name.
    filename = os.path.basename(assigned_sample["local_path"])
    raw_copy = os.path.join(temp_dir.name, "raw", filename)
    trimmed_copy = os.path.join(temp_dir.name, "trimmed", filename)

    shutil.copy(source_raw, raw_copy)
    shutil.copy(source_trimmed, trimmed_copy)

    relabelled["filename"] = filename
    relabelled["local_path"] = raw_copy
    relabelled["local_trimmed_path"] = trimmed_copy

    # Text-related fields follow the assigned sample.
    for field in ("original_text", "sentence_type", "sentence_length"):
        relabelled[field] = assigned_sample[field]

    # NOTE(review): duration is taken from the assigned sample although the copied
    # audio belongs to the original sample -- confirm this is intended.
    relabelled["duration"] = assigned_sample["duration"]

    new_wer = calculate_wer(relabelled["original_text"], relabelled["asr_text"])
    relabelled["wer"] = round(float(new_wer), 2)
    return relabelled

In [None]:
# For each matched pair, load both database rows and build the re-labelled sample.
# `updated_samples` collects the new records; `all_samples` collects the ids of the
# rows that the later clean-up cell deletes.
updated_samples = []
all_samples = []
for _, row in df_matched.iterrows():

    sql_script = f"""
    select * from sample where id = {row.original_id} or id = {row.assigned_id};
    """
    # Execute the SQL script into pandas dataframe with column names
    df_tmp = pd.read_sql_query(sql_script, conn)

    if len(df_tmp) != 2:
        print(f"error in {row.original_id} or {row.assigned_id}")
        continue

    assigned_sample = df_tmp[df_tmp.id == row.assigned_id].iloc[0].to_dict()
    original_sample = df_tmp[df_tmp.id == row.original_id].iloc[0].to_dict()

    try:
        temp_sample = update_sample(assigned_sample, original_sample)
    except Exception:
        # `Exception` instead of a bare except so Ctrl-C still interrupts the loop;
        # the traceback is logged so the cause is not lost.
        app_logger.error(f"error in {row.original_id} or {row.assigned_id}")
        app_logger.error(traceback.format_exc())
        continue

    updated_samples.append(temp_sample)
    # Queue the two old rows for deletion only once their replacement exists, so a
    # pair that could not be rebuilt is not deleted without being re-inserted.
    all_samples.append(row.original_id)
    all_samples.append(row.assigned_id)

In [None]:
# Collapse repeated ids so each sample is handled once (the resulting order is arbitrary).
all_samples = list(set(all_samples))

In [None]:
# S3 client and target location, all configured through environment variables.
aws_credentials = {
    "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"),
    "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"),
}
s3 = boto3.client("s3", **aws_credentials)
bucket_name = os.environ.get("S3_BUCKET_NAME")
dataset_dir = os.environ.get("S3_DATASET_DIR")

In [None]:

# Delete every sample listed in `all_samples` from the database, the local disk
# and S3. This cell is destructive: nothing it removes can be restored from here.

if all_samples:
    sql_script = f"""   
    select * from sample where id in ({",".join([str(k) for k in all_samples])});
    """
    df_tmp = pd.read_sql_query(sql_script, conn)
else:
    # An empty id list would render as "in ()", which is not valid SQL.
    df_tmp = pd.DataFrame()

# An "s3://<bucket>/<key>" URI turns into its object key by dropping the bucket
# prefix; the upload code in this notebook derives its keys the same way.
s3_uri_prefix = f"s3://{bucket_name}/"

with conn.cursor() as cursor:
    for _, row in df_tmp.iterrows():
        try:
            # delete from sql
            sql_script = f"""
            delete from sample where id = {row.id};
            """
            cursor.execute(sql_script)
            conn.commit()
            app_logger.info(f"Deleted sample {row.id} from sql")
        except Exception:
            # A failed statement leaves the transaction aborted; without a rollback
            # every following delete on this connection would fail as well.
            conn.rollback()
            app_logger.error(f"Failed to delete sample {row.id} from sql")
            app_logger.error(traceback.format_exc())
            continue
        try:
            # delete the local files
            os.remove(row.local_path)
            os.remove(row.local_trimmed_path)
            # Delete the S3 objects using the stored S3 URIs. The local paths used
            # before never contain the "s3://" prefix, so the keys built from them
            # did not address the uploaded objects.
            s3.delete_object(Bucket=bucket_name, Key=row["s3RawPath"].replace(s3_uri_prefix, ""))
            s3.delete_object(Bucket=bucket_name, Key=row["s3TrimmedPath"].replace(s3_uri_prefix, ""))
            app_logger.info(f"Deleted sample {row.id} from sql, local and s3")
        except Exception:
            app_logger.error(f"Failed to delete sample {row.id} from sql, local and s3")
            app_logger.error(traceback.format_exc())
            continue




In [None]:
# Number of re-labelled samples waiting to be inserted.
len(updated_samples)

In [None]:

# Re-insert the re-labelled samples: copy the audio to its final local path,
# upload it to S3, then write the database row.

# Columns written for every new sample, in insert order; the sample dicts use the
# same names as keys.
INSERT_COLUMNS = (
    "dataset_id", "filename", "local_path", "local_trimmed_path", "s3RawPath", "s3TrimmedPath",
    "original_text", "asr_text", "duration", "trimmed_audio_duration", "sentence_type",
    "sentence_length", "sampling_rate", "sample_format", "isPCM", "n_channel", "format",
    "peak_volume_db", "size", "isValid", "trim_start", "trim_end", "longest_pause", "wer",
)

# Values are bound as parameters instead of being formatted into the SQL text, so
# quotes in any field cannot break the statement and None is stored as NULL.
insert_sql = "insert into sample ({}) values ({});".format(
    ", ".join(f'"{column}"' for column in INSERT_COLUMNS),
    ", ".join(["%s"] * len(INSERT_COLUMNS)),
)


def _to_native(value):
    """Convert a sample value into something psycopg2 can bind.

    numpy scalars are unwrapped to plain Python values and float NaN (how pandas
    represents a missing number) is mapped to None; anything else is returned as is.
    """
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and value != value:
        return None
    return value


with conn.cursor() as cursor:
    for sample in updated_samples:
        # sample looks like this {'dataset_id': 35, ... }

        old_filename = sample['s3RawPath'].split("/")[-1]

        # Point both S3 URIs at the new file name.
        sample['s3RawPath'] = sample['s3RawPath'].replace(old_filename, sample['filename'])
        sample['s3TrimmedPath'] = sample['s3TrimmedPath'].replace(old_filename, sample['filename'])

        # Final local file paths mirror the S3 layout under /data/tts-qa/.
        final_raw_path = sample['s3RawPath'].replace(f"s3://{bucket_name}/", "/data/tts-qa/")
        final_trimmed_path = sample['s3TrimmedPath'].replace(f"s3://{bucket_name}/", "/data/tts-qa/")

        # Skip samples whose current (source) folders are gone.
        if not os.path.exists(os.path.dirname(sample['local_path'])):
            app_logger.info(f"Folder does not exists {os.path.dirname(sample['local_path'])}")
            continue
        if not os.path.exists(os.path.dirname(sample['local_trimmed_path'])):
            app_logger.info(f"Folder does not exists {os.path.dirname(sample['local_trimmed_path'])}")
            continue

        try:
            # Copy the audio to its final local location.
            shutil.copyfile(sample['local_path'], final_raw_path)
            shutil.copyfile(sample['local_trimmed_path'], final_trimmed_path)

            # update the local paths
            sample['local_path'] = final_raw_path
            sample['local_trimmed_path'] = final_trimmed_path

            # upload the files to the s3 path
            s3.upload_file(sample['local_path'], bucket_name, sample['s3RawPath'].replace(f"s3://{bucket_name}/", ""))
            s3.upload_file(sample['local_trimmed_path'], bucket_name, sample['s3TrimmedPath'].replace(f"s3://{bucket_name}/", ""))
        except Exception:
            # Previously swallowed silently and then reported as uploaded.
            # NOTE(review): the row is still inserted below, as before -- decide
            # whether a failed copy/upload should skip the insert instead.
            app_logger.error(f"Failed to copy or upload sample {sample['id']}")
            app_logger.error(traceback.format_exc())
        else:
            app_logger.info(f"Uploaded sample {sample['id']} to s3")

        try:
            cursor.execute(insert_sql, [_to_native(sample[column]) for column in INSERT_COLUMNS])
            conn.commit()

            app_logger.info(f"Inserted sample with informations {sample} into sql")
            app_logger.info(f"Inserted sample {sample['id']} into sql")
        except Exception:
            # Clear the aborted transaction so the remaining inserts can proceed.
            conn.rollback()
            app_logger.error(f"Failed to insert sample {sample['id']} into sql")
            app_logger.error(traceback.format_exc())
            continue

In [None]:
# Discard any uncommitted work on the connection.
conn.rollback()

In [None]:
# Close the database connection; cells that query the database cannot run after this.

conn.close()

In [None]:
# Save updated_samples to updated_samples.json in the working directory
# (overwrites an existing file).
import json
with open("updated_samples.json", "w") as f:
    json.dump(updated_samples, f, indent=4)

In [None]:
# Load updated_samples back from updated_samples.json, replacing the in-memory list.
import json
with open("updated_samples.json", "r") as f:
    updated_samples = json.load(f)
    