In [296]:
import pandas as pd
import numpy as np
from pathlib import Path
from utils import parse_filelist
from text import cmudict

data_dir = Path.cwd() / "../LJ_samples"
splits_dir = Path.cwd() / "resources/filelists/ljspeech"
cmudict_path = 'resources/cmu_dictionary'

dictionary = cmudict.CMUDict(cmudict_path)

# Create new filelists (arpabet convertible samples)

In [297]:
#metadat.csv
filepaths_and_text = parse_filelist(data_dir / "metadata.csv", split_char='|')
df = pd.DataFrame(np.array(filepaths_and_text), columns=["id", "transcript", "norm_transcript"])
for idx in range(10):
    id = df.iloc[idx]["id"]
    if df.loc[idx, "transcript"] != df.loc[idx, "norm_transcript"]:
        print(f"ID: {id}, idx: {idx}")
        print(f"Original: {df.loc[idx, 'transcript']}")
        print(f"Normalized: {df.loc[idx, 'norm_transcript']}")
df.head()

ID: LJ001-0007, idx: 6
Original: the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,
Normalized: the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,


Unnamed: 0,id,transcript,norm_transcript
0,LJ001-0001,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ..."
1,LJ001-0002,in being comparatively modern.,in being comparatively modern.
2,LJ001-0003,For although the Chinese took impressions from...,For although the Chinese took impressions from...
3,LJ001-0004,"produced the block books, which were the immed...","produced the block books, which were the immed..."
4,LJ001-0005,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...


In [298]:
def get_split_df(filename:str="train.txt"):
    filepaths_and_text = parse_filelist(splits_dir / filename, split_char='|')
    split_df = pd.DataFrame(np.array(filepaths_and_text), columns=["id", "text"])
    split_df["id"] = split_df["id"].apply(lambda x: x.split("/")[-1].split(".")[0])
    return split_df

train_df = get_split_df("train.txt")
valid_df = get_split_df("valid.txt")
test_df = get_split_df("test.txt")

_ = train_df.merge(df, on="id", how="left")
print("splits 'text' is metadata 'norm_transcript': ", np.all(_["norm_transcript"] == _["text"]))

print(f"Train samples: {len(train_df)}, Validation samples: {len(valid_df)}, Test samples: {len(test_df)}")
print(f"Train ratio: {len(train_df) / (len(train_df) + len(valid_df) + len(test_df)):.3f}, "
      f"Validation ratio: {len(valid_df) / (len(train_df) + len(valid_df) + len(test_df)):.3f}, "
      f"Test ratio: {len(test_df) / (len(train_df) + len(valid_df) + len(test_df)):.3f}")

splits 'text' is metadata 'norm_transcript':  True
Train samples: 11947, Validation samples: 95, Test samples: 488
Train ratio: 0.953, Validation ratio: 0.008, Test ratio: 0.039


In [299]:
def good_bad_df(split_df):
    """
    Transcribe the samples from the dataframe to ARPabet
    and return two dataframes:
    1. good_samples_df: samples with valid ARPAbet
    2. bad_samples_df: samples with invalid ARPAbet
    """
    good_samples = []
    bad_samples = []
    for idx in split_df.index:
        id = split_df.loc[idx]["id"]
        text = split_df.loc[idx, "text"]
        #text = "Turn left on {HH AW1 S S T AH0 N} Street."
        cleaner_names=["english_cleaners_v2"]
        arpabets = text_to_arpabet(text, dictionary, cleaner_names)
        arpabets = check_arpabet(arpabets, remove_punctuation=True)
        if arpabets is None:
            bad_samples.append({"id": id,
                                "text": text,
                                "arpabets": arpabets})
        else:
            good_samples.append({"id": id,
                                "text": text,
                                "arpabets": arpabets})
    good_samples_df = pd.DataFrame(good_samples)
    bad_samples_df = pd.DataFrame(bad_samples)
    return good_samples_df, bad_samples_df

train_good_df, train_bad_df = good_bad_df(train_df)
valid_good_df, valid_bad_df = good_bad_df(valid_df)
test_good_df, test_bad_df = good_bad_df(test_df)
print(f"Train good samples: {len(train_good_df)}, \
    Train conversion rate: {len(train_good_df) / len(train_df):.3f}")
print(f"Validation good samples: {len(valid_good_df)}, \
    Validation conversion rate: {len(valid_good_df) / len(valid_df):.3f}")
print(f"Test good samples: {len(test_good_df)}, \
    Test conversion rate: {len(test_good_df) / len(test_df):.3f}")

n_valid = (len(train_good_df) + len(valid_good_df) + len(test_good_df))
print(f"Train ratio: {len(train_good_df) / n_valid:.3f}, "
      f"Validation ratio: {len(valid_good_df) / n_valid:.3f}, "
      f"Test ratio: {len(test_good_df) / n_valid:.3f}")

Train good samples: 9892,     Train conversion rate: 0.828
Validation good samples: 76,     Validation conversion rate: 0.800
Test good samples: 398,     Test conversion rate: 0.816
Train ratio: 0.954, Validation ratio: 0.007, Test ratio: 0.038


In [300]:
train_good_df.head(10)

Unnamed: 0,id,text,arpabets
0,LJ050-0234,It has used other Treasury law enforcement age...,"[{IH1 T}, {HH AE1 Z}, {Y UW1 Z D}, {AH1 DH ER0..."
1,LJ050-0207,Although Chief Rowley does not complain about ...,"[{AO2 L DH OW1}, {CH IY1 F}, {R OW1 L IY0}, {D..."
2,LJ048-0203,The three officers confirm that their primary ...,"[{DH AH0}, {TH R IY1}, {AO1 F AH0 S ER0 Z}, {K..."
3,LJ003-0182,"The tried and the untried, young and old, were...","[{DH AH0}, {T R AY1 D}, {AH0 N D}, {DH AH0}, {..."
4,LJ044-0166,"According to Marina Oswald, he thought that wo...","[{AH0 K AO1 R D IH0 NG}, {T UW1}, {M ER0 IY1 N..."
5,LJ019-0208,The proposal made was to purchase some fifty t...,"[{DH AH0}, {P R AH0 P OW1 Z AH0 L}, {M EY1 D},..."
6,LJ021-0146,I shall seek assurances of the making and main...,"[{AY1}, {SH AE1 L}, {S IY1 K}, {AH0 SH UH1 R A..."
7,LJ014-0083,"which, having possessed herself of the murdere...","[{W IH1 CH}, {HH AE1 V IH0 NG}, {P AH0 Z EH1 S..."
8,LJ035-0121,This is the period during which Oswald would h...,"[{DH IH1 S}, {IH1 Z}, {DH AH0}, {P IH1 R IY0 A..."
9,LJ049-0118,Enactment of this statute would mean that the ...,"[{EH0 N AE1 K T M AH0 N T}, {AH1 V}, {DH IH1 S..."


In [301]:
def write_split_file(filepath, splits_df):
    lines = []
    for row in splits_df[["id", "text"]].values:
        id = row[0]
        text = row[1]
        line = f"DUMMY/{id}.wav|{text}\n"
        lines.append(line)
    with open(filepath, "w") as file:
        file.writelines(lines)
    print(f"Filelist written to {filepath}")


#write_split_file(splits_dir / "train_v0.txt", train_good_df)
#write_split_file(splits_dir / "valid_v0.txt", valid_good_df)
#write_split_file(splits_dir / "test_v0.txt", test_good_df)

# ARPabet to IPA ternary traits

In [302]:
train_df = get_split_df("train_v0.txt")
valid_df = get_split_df("valid_v0.txt")
test_df = get_split_df("test_v0.txt")

traits_list = ["syl", "son", "cons", "cont", "delrel", "lat",
                "nas", "strid", "voi", "sg", "cg", "ant", "cor",
                "distr", "lab", "hi", "lo", "back", "round",
                "velaric", "tense", "long", "hitone", "hireg"]
N_traits = len(traits_list)
emb_dim = N_traits + 1    # add a dim for one hot punctuation/pause token
space_tok = np.zeros((1,emb_dim))
space_tok[0,-1] = 1
print(f"Number of traits: {N_traits} Emb dim: {emb_dim}")

Number of traits: 24 Emb dim: 25


In [303]:
id = 'LJ001-0001'
text = train_df.loc[train_df["id"] == id, "text"].values[0]
text

'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'

In [None]:
from typing import Optional, List
from text.cmudict import CMUDict
from text import _clean_text
from text import _punctuation_list

import re

_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
_composed_re = re.compile(r"\b[a-zA-Z]+(?:-[a-zA-Z]+)+\b")  # composed words with dashes

def text_to_ipa(
    text: str,
    dictionary: Optional[CMUDict] = None,
    cleaner_names: List[str] = ["english_cleaners_v2"],
    remove_punctuation: bool = False,
) -> str:
    """
    Convert text to IPA characters sequence.
    """
    arp_list = text_to_arpabet(text, dictionary, cleaner_names)
    arp_list = check_arpabet(arp_list, remove_punctuation=remove_punctuation)
    if arp_list is None:
        print(f"Unable to convert to ARPAbet : {text}")
        return None
    else:
        # convert ARPAbet to IPA
        ipawords_list = [get_ipa_from_arp(w) for w in arp_list]
        return ipawords_list

def ipa_to_ternary(
    ipawords_list: List[str],
)-> np.ndarray:
    ternary_seq = []
    for word_ipa in ipawords_list:
        if ft.validate_word(word_ipa):
            emb_arr = ft.word_array(traits_list, word_ipa) #shape: (n_chars, N_traits)
            ternary_seq.append(np.pad(emb_arr, ((0, 0), (0, 1)), mode='constant', constant_values=0))
        elif word_ipa in _punctuation_list:
            ternary_seq.append(space_tok)
        else:
            print(f"Word not found in panphon: {word_ipa}")
            continue
    return np.concatenate(ternary_seq, axis=0)

############################
## text to ARPAbet functions
############################

def get_arpabet_dash(
    word: str,
    dictionary: Optional[CMUDict] = None,
) -> str:
    """
    Get ARPAbet transcription for a word, handling dashed composed words.
    More specifically, if the word contains a dash, and is not in the dictionary,
    split the word at the dash and get ARPAbet for each part.
    The parts are then joined with a space
    """
    word_arpabet = dictionary.lookup(word)
    if word_arpabet is not None:
        return "{" + word_arpabet[0] + "}"
    elif _composed_re.match(word):
        words = word.split("-")
        words_arpabet = [get_arpabet_dash(w, dictionary) for w in words]
        return " ".join(words_arpabet)
    else:
        return word


def text_to_arpabet(
    text: str,
    dictionary: Optional[CMUDict] = None,
    cleaner_names: List[str] = ["english_cleaners_v2"],
):
    """
    Convert text to ARPAbet words list.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
        text: input text
        dictionary: CMU dictionary
        cleaner_names: list of cleaner names
    Returns:
        ARPAbet words list (List[{"ARP1 ARP2 ...ARPN}" or "PUNC"])
    """
    arp_words = []
    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            clean_text = _clean_text(text, cleaner_names)
            clean_text = [
                get_arpabet_dash(w, dictionary) for w in clean_text.split(" ")
            ]
            arp_words += clean_text
            break
        else:
            arp_words += text_to_arpabet(m.group(1), dictionary, cleaner_names)
            arp_words += ["{" + m.group(2) + "}"]
            text = m.group(3)
    return arp_words


def check_arpabet(
    arp_words: List[str],
    remove_punctuation: bool = False,
) -> List[str] | None:
    """
    Check if all words are ARPabet encoded (or punctuation).
    If not, return None.

    Args:
        arp_words: list of words
        remove_punctuation: if True, remove punctuation from the list
    Returns:
        list of words ("{ARP1 ARP2 ...ARPN}", or "PUNC") if all are valid
        None otherwise
    """
    mask_arp = [elem.startswith("{") and elem.endswith("}") for elem in arp_words]
    mask_punc = [elem in _punctuation_list for elem in arp_words]
    mask_invalid = [not (arp or punct) for arp, punct in zip(mask_arp, mask_punc)]
    if any(mask_invalid):
        return None
    elif remove_punctuation:
        return [elem for elem in arp_words if elem not in _punctuation_list]
    else:
        return arp_words

###########################
#### ARPAbet to IPA functions
###########################

def get_ipa_from_arp(
        arp_seq: str
        )-> str | None:
    """
    Get IPA transcription for an ARPabet sequence (format "{ARP1 ARP2 ...ARPN}").
    Handles punctuation words as well (".", ",", ...) by returning them as is.
    SHOULD BE CALLED AFTER check_arpabet() to ensure the ARPAbet sequence is valid.
    If the ARPAbet sequence is not valid, return None.

    Args:
        arp_seq: ARPAbet sequence or punctuation string
    Returns:
        IPA transcription : str or None if not found
                            ex : "pɹɪntɪŋ"
    """
    def arpchar_to_ipa(arp: str) -> str | None:
        """
        Get IPA transcription for an ARPAbet character.
        Try to find the original ARPAbet character. If not found,
        fallback to the ARPAbet character without stress markers
        """
        if arp in arpabet2ipa:
            return arpabet2ipa[arp]
        else:
            arp = arp.replace("1", "").replace("2", "").replace("0", "")
            return arpabet2ipa[arp]
    
    if arp_seq.startswith("{") and arp_seq.endswith("}"):
        arp_seq = arp_seq[1:-1].split(" ")
        ipa_seq = [arpchar_to_ipa(arp) for arp in arp_seq]
        return "".join(ipa_seq)
    elif arp_seq in _punctuation_list:
        return arp_seq
    else:
        print("Invalid ARPAbet sequence, should be checked with check_arpabet()")
        return None

In [341]:
ipawords_list = text_to_ipa(text, dictionary, cleaner_names=["english_cleaners_v2"], remove_punctuation=False)
ipawords_list_ = text_to_ipa(text, dictionary, cleaner_names=["english_cleaners_v2"], remove_punctuation=True)
print(ipawords_list_)

['pɹɪntɪŋ', 'ɪn', 'ðə', 'oʊnli', 'sɛns', 'wɪð', 'wɪtʃ', 'wi', 'ɑɹ', 'æt', 'pɹɛzənt', 'kənsɜ˞nd', 'dɪfə˞z', 'fɹʌm', 'moʊst', 'ɪf', 'nɑt', 'fɹʌm', 'ɔl', 'ðə', 'ɑɹts', 'ənd', 'kɹæfts', 'ɹɛpɹəzɛntəd', 'ɪn', 'ðə', 'ɛksəbɪʃən']


In [None]:
import panphon
from utils import intersperse

ft = panphon.FeatureTable()

def ipa_to_ternary(
    ipawords_list: List[str],
)-> np.ndarray:
    ternary_seq = []
    for word_ipa in ipawords_list:
        if ft.validate_word(word_ipa):
            emb_arr = ft.word_array(traits_list, word_ipa) #shape: (n_chars, N_traits)
            ternary_seq.append(np.pad(emb_arr, ((0, 0), (0, 1)), mode='constant', constant_values=0))
        elif word_ipa in _punctuation_list:
            ternary_seq.append(space_tok)
        else:
            print(f"Word not found in panphon: {word_ipa}")
            continue
    return np.concatenate(ternary_seq, axis=0)

In [349]:
ipawords_list = text_to_ipa("qv dfs", dictionary, cleaner_names=["english_cleaners_v2"], remove_punctuation=False)

Unable to convert to ARPAbet : qv dfs


In [350]:
caca = ipa_to_ternary(ipawords_list)
pipi = ipa_to_ternary(intersperse(ipawords_list, " "))

TypeError: 'NoneType' object is not iterable

In [348]:
caca.shape, pipi.shape

((113, 25), (143, 25))

In [333]:
ternary_seq = []
for word_ipa in ipawords_list:
    if ft.validate_word(word_ipa):
        emb_arr = ft.word_array(traits_list, word_ipa) #shape: (n_chars, N_traits)
        ternary_seq.append(np.pad(emb_arr, ((0, 0), (0, 1)), mode='constant', constant_values=0))
    elif word_ipa in _punctuation_list:
        ternary_seq.append(space_tok)
    else:
        print(f"Word not found in panphon: {word_ipa}")
        #ternary_seq.append(space_tok)
        #ternary_seq.append(np.zeros((1, emb_dim)))
        continue
np.concatenate(ternary_seq, axis=0)

array([[-1., -1.,  1., ...,  0.,  0.,  0.],
       [-1.,  1., -1., ...,  0.,  0.,  0.],
       [ 1.,  1., -1., ...,  0.,  0.,  0.],
       ...,
       [-1., -1.,  1., ...,  0.,  0.,  0.],
       [ 1.,  1., -1., ...,  0.,  0.,  0.],
       [-1.,  1.,  1., ...,  0.,  0.,  0.]], shape=(113, 25))

In [None]:
import panphon
from utils import intersperse
from text.cleaners import _punctuation_list
from text.arpabet import arpabet2ipa

ft = panphon.FeatureTable()

#'ɚ' and 'ɝ' were not recognized by panphon, we replaced them with 'ɜ˞' and 'ə˞' respectively
# in the arpabet2ipa dictionary
print(ft.validate_word('ɚ'))
print(ft.validate_word('ɝ'))
print(ft.validate_word('ɜ˞'))
print(ft.validate_word('ə˞'))

for k, v in arpabet2ipa.items():
    if not ft.validate_word(v):
        print(v)

False
False
True
True


In [None]:
ternary_seq = []
for word_ipa in ipa_seq:
    if ft.validate_word(word_ipa):
        emb_arr = ft.word_array(traits_list, word_ipa) #shape: (n_chars, N_traits)
        ternary_seq.append(np.pad(emb_arr, ((0, 0), (0, 1)), mode='constant', constant_values=0))
    elif word_ipa in _punctuation_list:
        ternary_seq.append(space_tok)
    else:
        print(f"Word not found in panphon: {word_ipa}")
        #ternary_seq.append(space_tok)
        #ternary_seq.append(np.zeros((1, emb_dim)))
        continue
np.concatenate(ternary_seq, axis=0)

In [165]:
traits_list = ["syl", "son", "cons", "cont", "delrel", "lat",
                "nas", "strid", "voi", "sg", "cg", "ant", "cor",
                "distr", "lab", "hi", "lo", "back", "round",
                "velaric", "tense", "long", "hitone", "hireg"]

In [166]:
ft.word_array(traits_list, u'bɝ')

array([[-1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1,  1, -1,  0,  1, -1,
        -1, -1, -1, -1,  0, -1,  0,  0]])

4

In [44]:
#for idx in train_df.index[:1]:
#    text = train_df.iloc[idx]["text"]
#    text_to_arpabet(text, dictionary, cleaner_names=["english_cleaners_v2"])

# Audio to art features

In [18]:
from sparc import load_model

print("Available train audio samples")
train_audio_ids = list(set(train_df["id"]).intersection(set([f"LJ001-000{i}" for i in range(1,10)])))
print(train_audio_ids)

Available train audio samples
['LJ001-0006', 'LJ001-0008', 'LJ001-0009', 'LJ001-0005', 'LJ001-0001', 'LJ001-0002', 'LJ001-0007', 'LJ001-0004']


In [19]:
for id in train_audio_ids[:1]:
    audio_path = data_dir / f"{id}.wav"
    if not audio_path.exists():
        print(f"Audio file {audio_path} does not exist")
    else:
        print(f"Audio file {audio_path} exists")
        #play_audio(audio_path)

Audio file /home/anli/Desktop/art-tts/src/../LJ_samples/LJ001-0006.wav exists
