In [148]:
import pandas as pd
import numpy as np
from pathlib import Path
from utils import parse_filelist
from text import text_to_arpabet, check_arpabet, cmudict

# Notebook-wide locations, all resolved against the current working directory.
# data_dir holds the dataset's metadata.csv; splits_dir holds the split filelists
# (train.txt / valid.txt / test.txt) and receives the filtered *_v0.txt files.
data_dir = Path.cwd() / "../LJ_samples"
splits_dir = Path.cwd() / "resources/filelists/ljspeech"
# Location handed to cmudict.CMUDict below (relative string, not a Path).
cmudict_path = 'resources/cmu_dictionary'

# Pronunciation dictionary object passed to text_to_arpabet for every transcript.
dictionary = cmudict.CMUDict(cmudict_path)

In [149]:
# metadata.csv: one row per utterance with id, raw transcript and normalized transcript.
filepaths_and_text = parse_filelist(data_dir / "metadata.csv", split_char='|')
df = pd.DataFrame(np.array(filepaths_and_text), columns=["id", "transcript", "norm_transcript"])

# Show, among the first ten rows, those whose normalized transcript differs from
# the raw one. head() simply yields fewer rows when the frame is shorter, and the
# row is read through a single accessor instead of mixing iloc/loc. The sample id
# is read from the row so the builtin `id` is not shadowed in the kernel.
for idx, row in df.head(10).iterrows():
    if row["transcript"] != row["norm_transcript"]:
        print(f"ID: {row['id']}, idx: {idx}")
        print(f"Original: {row['transcript']}")
        print(f"Normalized: {row['norm_transcript']}")
df.head()

ID: LJ001-0007, idx: 6
Original: the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,
Normalized: the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,


Unnamed: 0,id,transcript,norm_transcript
0,LJ001-0001,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ..."
1,LJ001-0002,in being comparatively modern.,in being comparatively modern.
2,LJ001-0003,For although the Chinese took impressions from...,For although the Chinese took impressions from...
3,LJ001-0004,"produced the block books, which were the immed...","produced the block books, which were the immed..."
4,LJ001-0005,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...


In [161]:
def get_split_df(filename:str="train.txt"):
    """
    Read one split filelist from ``splits_dir`` into a DataFrame.

    The file is parsed with ``parse_filelist`` using ``|`` as separator and is
    expected to yield two fields per row, stored as columns "id" and "text".
    The "id" field is then reduced to a bare identifier: everything up to the
    last "/" is dropped, and so is everything from the first "." onwards.
    """
    def _bare_id(entry):
        # Keep the last path component, then cut at the first dot.
        basename = entry.split("/")[-1]
        return basename.split(".")[0]

    rows = np.array(parse_filelist(splits_dir / filename, split_char='|'))
    frame = pd.DataFrame(rows, columns=["id", "text"])
    frame["id"] = frame["id"].apply(_bare_id)
    return frame

train_df = get_split_df("train.txt")
valid_df = get_split_df("valid.txt")
test_df = get_split_df("test.txt")

# Sanity check: after joining the train split with the metadata on "id", the
# split's "text" must equal the metadata's "norm_transcript" for every row.
# The joined frame gets a real name instead of `_`, which IPython uses for the
# previous cell's output and stops updating once user code assigns to it.
train_with_meta = train_df.merge(df, on="id", how="left")
print("splits 'text' is metadata 'norm_transcript': ",
      np.all(train_with_meta["norm_transcript"] == train_with_meta["text"]))

# Total number of samples over the three splits, computed once for the ratios.
n_total = len(train_df) + len(valid_df) + len(test_df)
print(f"Train samples: {len(train_df)}, Validation samples: {len(valid_df)}, Test samples: {len(test_df)}")
print(f"Train ratio: {len(train_df) / n_total:.3f}, "
      f"Validation ratio: {len(valid_df) / n_total:.3f}, "
      f"Test ratio: {len(test_df) / n_total:.3f}")

splits 'text' is metadata 'norm_transcript':  True
Train samples: 11947, Validation samples: 95, Test samples: 488
Train ratio: 0.953, Validation ratio: 0.008, Test ratio: 0.039


In [171]:
def good_bad_df(split_df):
    """
    Transcribe the samples from the dataframe to ARPAbet and partition them.

    Each row's "text" is converted with ``text_to_arpabet`` (using the
    notebook-level ``dictionary`` and the "english_cleaners_v2" cleaner) and
    then passed through ``check_arpabet`` with ``remove_punctuation=True``.

    Args:
        split_df: DataFrame with at least the columns "id" and "text".

    Returns:
        A tuple ``(good_samples_df, bad_samples_df)``:
        1. good_samples_df: samples for which ``check_arpabet`` returned a value
        2. bad_samples_df: samples for which ``check_arpabet`` returned None
        Both frames always have the columns "id", "text" and "arpabets",
        even when they contain no rows.
    """
    columns = ["id", "text", "arpabets"]
    # Loop-invariant, so built once instead of on every iteration.
    cleaner_names = ["english_cleaners_v2"]
    # Example input for manual debugging (inline ARPAbet in braces):
    # text = "Turn left on {HH AW1 S S T AH0 N} Street."

    good_samples = []
    bad_samples = []
    # Iterate the two columns directly: this avoids shadowing the builtin `id`
    # and the chained `.loc[idx]["id"]` lookup, which breaks on duplicate labels.
    for sample_id, text in zip(split_df["id"], split_df["text"]):
        arpabets = text_to_arpabet(text, dictionary, cleaner_names)
        arpabets = check_arpabet(arpabets, remove_punctuation=True)
        sample = {"id": sample_id, "text": text, "arpabets": arpabets}
        if arpabets is None:
            bad_samples.append(sample)
        else:
            good_samples.append(sample)

    # Passing `columns` keeps the schema stable when one of the lists is empty.
    good_samples_df = pd.DataFrame(good_samples, columns=columns)
    bad_samples_df = pd.DataFrame(bad_samples, columns=columns)
    return good_samples_df, bad_samples_df

# Split every filelist into samples that converted to ARPAbet and samples that did not.
train_good_df, train_bad_df = good_bad_df(train_df)
valid_good_df, valid_bad_df = good_bad_df(valid_df)
test_good_df, test_bad_df = good_bad_df(test_df)

# Display label -> (converted samples, full split), in reporting order.
good_by_split = {
    "Train": (train_good_df, train_df),
    "Validation": (valid_good_df, valid_df),
    "Test": (test_good_df, test_df),
}

# Per split: number of converted samples and the fraction of the split they represent.
for split_label, (good_part, whole_split) in good_by_split.items():
    print(f"{split_label} good samples: {len(good_part)}, "
          f"    {split_label} conversion rate: {len(good_part) / len(whole_split):.3f}")

# Share of each split among all converted samples.
n_valid = sum(len(kept) for kept, _whole in good_by_split.values())
print(", ".join(f"{name} ratio: {len(kept) / n_valid:.3f}"
                for name, (kept, _whole) in good_by_split.items()))

Train good samples: 9892,     Train conversion rate: 0.828
Validation good samples: 76,     Validation conversion rate: 0.800
Test good samples: 398,     Test conversion rate: 0.816
Train ratio: 0.954, Validation ratio: 0.007, Test ratio: 0.038


In [175]:
# Preview the first ten training samples that passed the ARPAbet check.
train_good_df.head(10)

Unnamed: 0,id,text,arpabets
0,LJ050-0234,It has used other Treasury law enforcement age...,"[{IH1 T}, {HH AE1 Z}, {Y UW1 Z D}, {AH1 DH ER0..."
1,LJ050-0207,Although Chief Rowley does not complain about ...,"[{AO2 L DH OW1}, {CH IY1 F}, {R OW1 L IY0}, {D..."
2,LJ048-0203,The three officers confirm that their primary ...,"[{DH AH0}, {TH R IY1}, {AO1 F AH0 S ER0 Z}, {K..."
3,LJ003-0182,"The tried and the untried, young and old, were...","[{DH AH0}, {T R AY1 D}, {AH0 N D}, {DH AH0}, {..."
4,LJ044-0166,"According to Marina Oswald, he thought that wo...","[{AH0 K AO1 R D IH0 NG}, {T UW1}, {M ER0 IY1 N..."
5,LJ019-0208,The proposal made was to purchase some fifty t...,"[{DH AH0}, {P R AH0 P OW1 Z AH0 L}, {M EY1 D},..."
6,LJ021-0146,I shall seek assurances of the making and main...,"[{AY1}, {SH AE1 L}, {S IY1 K}, {AH0 SH UH1 R A..."
7,LJ014-0083,"which, having possessed herself of the murdere...","[{W IH1 CH}, {HH AE1 V IH0 NG}, {P AH0 Z EH1 S..."
8,LJ035-0121,This is the period during which Oswald would h...,"[{DH IH1 S}, {IH1 Z}, {DH AH0}, {P IH1 R IY0 A..."
9,LJ049-0118,Enactment of this statute would mean that the ...,"[{EH0 N AE1 K T M AH0 N T}, {AH1 V}, {DH IH1 S..."


In [184]:
def write_split_file(filepath, splits_df):
    """
    Write a split DataFrame as a ``|``-separated filelist.

    Each row becomes one line of the form ``DUMMY/<id>.wav|<text>``.

    Args:
        filepath: destination path (str or Path); an existing file is overwritten.
        splits_df: DataFrame with at least the columns "id" and "text".

    The file is written as UTF-8 so that transcripts with non-ASCII characters
    do not depend on the platform's default locale encoding.
    """
    lines = [
        f"DUMMY/{sample_id}.wav|{text}\n"
        for sample_id, text in zip(splits_df["id"], splits_df["text"])
    ]
    with open(filepath, "w", encoding="utf-8") as file:
        file.writelines(lines)
    print(f"Filelist written to {filepath}")


# Persist the filtered splits next to the original filelists, in train/valid/test order.
for split_filename, good_split_df in (
    ("train_v0.txt", train_good_df),
    ("valid_v0.txt", valid_good_df),
    ("test_v0.txt", test_good_df),
):
    write_split_file(splits_dir / split_filename, good_split_df)

Filelist written to /home/anli/Desktop/art-tts/src/resources/filelists/ljspeech/train_v0.txt
Filelist written to /home/anli/Desktop/art-tts/src/resources/filelists/ljspeech/valid_v0.txt
Filelist written to /home/anli/Desktop/art-tts/src/resources/filelists/ljspeech/test_v0.txt
