In [None]:
# connect to postgresql db using environment variables read from vars.env

import os
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# read environment variables from vars.env
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory — confirm the notebook is started from its own folder.
from dotenv import load_dotenv
load_dotenv("../vars.env")

# connect to postgresql db on the host named in vars.env (no port is passed, so the default, normally 5432, applies), using user and password from vars.env

import psycopg2
import os

# Read the four connection settings from the environment in one pass.
# os.getenv yields None for any variable that is not set.
db_host, db_name, db_user, db_password = (
    os.getenv(var_name)
    for var_name in ("POSTGRES_HOST", "POSTGRES_DB", "POSTGRES_USER", "POSTGRES_PWD")
)

# Open the PostgreSQL connection that the query cells below reuse.
connection_settings = {
    "host": db_host,
    "database": db_name,
    "user": db_user,
    "password": db_password,
}
conn = psycopg2.connect(**connection_settings)

In [None]:
# Load the CSV manifest for the Italian 30h share into a DataFrame.
italian_manifest_path = "/data/tts-qa/share_30h/Italian/Italian.csv"
df = pd.read_csv(italian_manifest_path)

In [None]:
# File names listed in the CSV manifest; they select the rows to fetch from
# the `sample` table.
filenames = df['filename'].tolist()

# get all samples from the database that are in the list of filenames
#
# `= ANY(%s)` with a Python list is used instead of `IN %s` with a tuple:
# psycopg2 adapts the list to a PostgreSQL array, and the query stays valid
# when the list is empty (an empty tuple would render as `IN ()`, which is a
# SQL syntax error).
sql = "SELECT * FROM sample WHERE filename = ANY(%s)"

# The list is wrapped in a 1-tuple because it is the query's single parameter.
df_samples = pd.read_sql(sql, conn, params=(filenames,))


In [None]:
# Save the rows fetched from the database to a CSV file, leaving out the index column.
export_path = "Italian30h-share.csv"
df_samples.to_csv(export_path, index=False)

# Insights

In [None]:
# Language share to inspect (the author's note lists Spanish and French as options).
language = "French"

# Load that share's CSV and order the rows by the `wer` column, ascending.
share_csv = f"{language}-share.csv"
df = pd.read_csv(share_csv).sort_values(by="wer")
df.head()

In [None]:
# Show the last five rows of the frame.
df.tail(n=5)

In [None]:
# Report the number of rows and the summed trimmed audio duration
# (the sum is divided by 3600 and labelled as hours).
print(
    f"There are {len(df)} samples in the {language} dataset.",
    f"Total duration is {df['trimmed_audio_duration'].sum()/3600:.2f} hours.",
    sep="\n",
)

In [None]:
# Summary statistics for the `wer` column.
df["wer"].describe()

In [None]:
# plot the distribution of WERs
#
# `sns.distplot` has been deprecated since seaborn 0.11 and is scheduled for
# removal; `sns.histplot` is its replacement. With a KDE overlay, `distplot`
# drew a density-normalised histogram, whereas `histplot` draws raw counts by
# default, so the "Count" y-axis label now matches what is plotted.
fig, ax = plt.subplots(figsize=(8, 6))

sns.histplot(data=df, x="wer", kde=True, bins=30, color="b", ax=ax)
ax.set_title(f"WER distribution for {language} share")
ax.set_xlabel("WER")
ax.set_ylabel("Count")


In [None]:
# Pie chart of how often each sentence_type value occurs, with every slice
# labelled by its percentage (one decimal place).
sentence_type_counts = df["sentence_type"].value_counts()
sentence_type_counts.plot.pie(autopct="%1.1f%%", figsize=(8, 8))
plt.title(f"Sentence type distribution for {language}")


In [None]:
# Histogram of the sentence_length column, using 100 bins.
sentence_lengths = df["sentence_length"]
sentence_lengths.plot.hist(bins=100, figsize=(8, 6))
plt.title(f"Sentence length distribution for {language}")

In [None]:
# get average duration for one word
# (assumes sentence_length holds a per-sample word count — TODO confirm)
#
# A sentence_length of 0 would divide to inf (or NaN for 0/0), and a single
# non-finite value makes the histogram below fail because its bin range is no
# longer finite. Mapping 0 to NaN first keeps those rows out of the plot.
word_counts = df['sentence_length'].replace(0, np.nan)
df['duration_per_word'] = df['trimmed_audio_duration'] / word_counts

# plot dist of duration per word
df['duration_per_word'].plot(kind='hist', bins=100, figsize=(8, 6))
plt.title(f"Duration per word distribution for {language}")
plt.xlabel("Duration per word (s)")
