In [57]:
import pandas as pd
import numpy as np
from pathlib import Path
from paths import DATA_DIR
from voxcommunis.io import read_alignment, read_manifest
from voxcommunis.data import (PanPhonInventory, FeatureTokenizer, PhoneticFeatureDataset,
                              SAMPLE_RATE, ALIGNMENT_FREQ, MODEL_FREQ, SUBSAMPLE
)
from voxcommunis.decoder import UniqueSegmentFeature, UniversalUniqueSegmentFeature, FeatureDecoder
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

vc_dir = Path(DATA_DIR) / "VoxCommunis"

split = "train"
alignments_dir = vc_dir / split / "alignments"
manifests_dir = vc_dir / split / "manifests"

In [43]:
def create_submanifest(lang, src_filepath, new_filepath, max_duration=3600):
    manifest = read_manifest(src_filepath)
    # Get the prefix from the first line of the manifest file
    with open(src_filepath) as f:
        lines = f.readlines()
    prefix = Path(lines[0].strip().split("\t")[0])
    # Initialize with the prefix line
    new_lines = [str(prefix) + '\n']

    #shuffle the samples
    sample_ids = list(manifest.keys())
    random.shuffle(sample_ids)

    # get 1h of samples if possible
    tot_duration = 0
    while (tot_duration < max_duration) and sample_ids:
        sample_id = sample_ids.pop()
        full_path, frames = manifest[sample_id]
        tot_duration += int(frames) / SAMPLE_RATE # convert frames to seconds
        new_line = str(full_path.relative_to(prefix)) + f"\t{frames}\n"
        new_lines.append(new_line)
    print(f"For lang {lang}, {len(new_lines)} samples, total duration: {int(tot_duration // 3600)} h {int((tot_duration % 3600) // 60)} min")
    # Write the new manifest file
    with open(new_filepath, "w") as f:
        f.writelines(new_lines)

# Create multilingual 1h/lang manifests

In [None]:
split = "train"
src_manifests_dir = vc_dir / split / "manifests"
langs = list(set(e.stem for e in list(src_manifests_dir.glob("*.tsv"))))

dest_manifests_dir = vc_dir / "train-1h" / "manifests"
dest_manifests_dir.mkdir(exist_ok=True, parents=True)
max_duration = 3600  # 1 hour in seconds

for lang in langs:
    src_filepath = src_manifests_dir / f"{lang}.tsv"
    new_filepath = dest_manifests_dir / f"{lang}.tsv"
    #create_submanifest(lang, src_filepath, new_filepath, max_duration=max_duration)

In [45]:
split = "dev"
src_manifests_dir = vc_dir / split / "manifests"
langs = list(set(e.stem for e in list(src_manifests_dir.glob("*.tsv"))))

dest_manifests_dir = vc_dir / "dev-1h" / "manifests"
dest_manifests_dir.mkdir(exist_ok=True, parents=True)
max_duration = 1200  # 20 minutes

for lang in langs:
    src_filepath = src_manifests_dir / f"{lang}.tsv"
    new_filepath = dest_manifests_dir / f"{lang}.tsv"
    #create_submanifest(lang, src_filepath, new_filepath,
    #                    max_duration=max_duration)

In [47]:
split = "test"
src_manifests_dir = vc_dir / split / "manifests"
langs = list(set(e.stem for e in list(src_manifests_dir.glob("*.tsv"))))

dest_manifests_dir = vc_dir / "test-1h" / "manifests"
dest_manifests_dir.mkdir(exist_ok=True, parents=True)
max_duration = 1200  # 20 minutes

for lang in langs:
    src_filepath = src_manifests_dir / f"{lang}.tsv"
    new_filepath = dest_manifests_dir / f"{lang}.tsv"
    #create_submanifest(lang, src_filepath, new_filepath,
    #                    max_duration=max_duration)

# Create monolingual 20h manifests

In [None]:
split = "train"
src_manifests_dir = vc_dir / split / "manifests"

dest_manifests_dir = vc_dir / "train-20h" / "manifests"
dest_manifests_dir.mkdir(exist_ok=True, parents=True)
max_duration = 20 * 3600  # 20 hour in seconds

langs = ['fr', 'eu', 'it', 'sw', 'hu', 'zh-CN', 'ru']
for lang in langs:
    src_filepath = src_manifests_dir / f"{lang}.tsv"
    new_filepath = dest_manifests_dir / f"{lang}.tsv"
    #create_submanifest(lang, src_filepath, new_filepath, max_duration=max_duration)

In [None]:
split = "dev"
src_manifests_dir = vc_dir / split / "manifests"

dest_manifests_dir = vc_dir / "dev-20h" / "manifests"
dest_manifests_dir.mkdir(exist_ok=True, parents=True)
max_duration = 2 * 3600  # 2 hour in seconds

langs = ['fr', 'eu', 'it', 'sw', 'hu', 'zh-CN', 'ru']
for lang in langs:
    src_filepath = src_manifests_dir / f"{lang}.tsv"
    new_filepath = dest_manifests_dir / f"{lang}.tsv"
    #create_submanifest(lang, src_filepath, new_filepath,
    #                   max_duration=max_duration)

For lang fr, 1240 samples, total duration: 2 h 0 min
For lang eu, 1223 samples, total duration: 2 h 0 min
For lang it, 1269 samples, total duration: 2 h 0 min
For lang sw, 1284 samples, total duration: 2 h 0 min
For lang hu, 1363 samples, total duration: 2 h 0 min
For lang zh-CN, 1331 samples, total duration: 2 h 0 min
For lang ru, 1337 samples, total duration: 2 h 0 min


In [49]:
split = "test"
src_manifests_dir = vc_dir / split / "manifests"

dest_manifests_dir = vc_dir / "test-20h" / "manifests"
dest_manifests_dir.mkdir(exist_ok=True, parents=True)
max_duration = 2 * 3600  # 20 hour in seconds

langs = ['fr', 'eu', 'it', 'sw', 'hu', 'zh-CN', 'ru']
for lang in langs:
    src_filepath = src_manifests_dir / f"{lang}.tsv"
    new_filepath = dest_manifests_dir / f"{lang}.tsv"
    create_submanifest(lang, src_filepath, new_filepath,
                                max_duration=max_duration)

For lang fr, 1238 samples, total duration: 2 h 0 min
For lang eu, 1184 samples, total duration: 2 h 0 min
For lang it, 1200 samples, total duration: 2 h 0 min
For lang sw, 1302 samples, total duration: 2 h 0 min
For lang hu, 1291 samples, total duration: 2 h 0 min
For lang zh-CN, 1216 samples, total duration: 2 h 0 min
For lang ru, 1302 samples, total duration: 2 h 0 min


# Create alignments for the new manifests

In [50]:
def create_subalignments(manifest_fp, src_ali_fp, dest_ali_fp):
    manifest = read_manifest(manifest_fp)
    alignment = read_alignment(src_ali_fp)
    new_lines = []
    for sample_id in manifest.keys():
        if sample_id in alignment:
            new_line = f"{sample_id}\t"
            new_line += ' '.join(alignment[sample_id].split(' ')) + '\n'
            new_lines.append(new_line)
        else:
            print(f"Warning: {sample_id} not in alignment file {src_ali_fp}")
            continue
        
    with open(dest_ali_fp, "w") as f:
        f.writelines(new_lines)

In [56]:
import os

manifests_dir = vc_dir / "test-1h" / "manifests"
alignments_dir = vc_dir / "test-1h" / "alignments"
os.makedirs(alignments_dir, exist_ok=True)
src_alignments_dir = vc_dir / "test" / "alignments"
for man in manifests_dir.glob("*.tsv"):
    lang = man.stem
    #print(f"Processing language: {lang}")
    #create_subalignments(manifests_dir / f"{lang}.tsv",
    #                     src_alignments_dir / f"{lang}.align",
    #                     alignments_dir / f"{lang}.align")
    

# Global info

In [None]:
split = "test"
src_manifests_dir = vc_dir / split / "manifests"
langs = list(set(e.stem for e in list(src_manifests_dir.glob("*.tsv"))))

durations = []
for lang in langs:
    manifest = read_manifest(src_manifests_dir / f"{lang}.tsv")
    sample_ids = list(manifest.keys())

    tot_duration = 0
    while sample_ids:
        sample_id = sample_ids.pop()
        full_path, frames = manifest[sample_id]
        tot_duration += int(frames) / SAMPLE_RATE # convert frames to seconds
    durations.append(tot_duration)

In [None]:
hours_df = pd.DataFrame(np.array([langs, durations]).T, columns=["lang", "duration"])
hours_df["duration"] = hours_df["duration"].astype(float) / 3600  # convert to hours
hours_df.sort_values(by="duration", ascending=False, inplace=True)
hours_df.head(30)

In [None]:
def get_manifest_df(fp):
    with open(fp, "r") as f:
        lines = f.readlines()
    lines = [line.strip() for line in lines]
    dataset_prefix = lines[0]
    lines = [line.split("\t") for line in lines[1:]]
    manifest_df = pd.DataFrame(lines, columns=["wav_fp", "frames"])
    manifest_df["sample_id"] = manifest_df["wav_fp"].apply(lambda x: Path(x).stem)
    manifest_df["wav_fp"] = manifest_df["wav_fp"].apply(lambda x: dataset_prefix + '/' + x)
    manifest_df = manifest_df[["sample_id", "wav_fp", "frames"]]
    return manifest_df

manifest_df = get_manifest_df(manifests_dir / f"{lang}.tsv")

## Phonemes

In [None]:
import pickle
from panphon import FeatureTable
from text.converters import traits_list

ft = FeatureTable()
raw_inventories_dir = vc_dir / "raw_inventories"
clean_inventories_dir = vc_dir / "clean_inventories"

In [None]:
phoneme_superset = set()
langs = list(set(e.stem for e in list(raw_inventories_dir.glob("*.pickle"))))
for lang in langs:
    raw_inventory = pickle.load(open(raw_inventories_dir / f"{lang}.pickle", "rb"))
    clean_inventory = pickle.load(open(clean_inventories_dir / f"{lang}.pickle", "rb"))
    phoneme_superset.update(clean_inventory)

In [None]:
len(phoneme_superset)

In [None]:
panphon_inventory = PanPhonInventory()

for e in list(raw_inventory):
    if e != "SIL":
        e_bis = panphon_inventory.convert_to_ipa(e)[0]
        assert ft.word_array(traits_list, e_bis).shape[0] == 1, f"Error in {e} -> {e_bis}"

In [None]:
import panphon

ft = panphon.FeatureTable()

In [None]:
len(ft.segments[0][1].numeric())

In [None]:
len(ft.segments)

In [None]:
uusf = UniversalUniqueSegmentFeature(sum_diphthong=True)
len(uusf.unique_segments)


## Phoneme Decoder

# Manipulate data

In [58]:
from voxcommunis.io import read_manifest, read_alignment
from paths import DATA_DIR
from pathlib import Path
import numpy as np

import torch

from voxcommunis.decoder import (UniversalUniqueSegmentFeature,
                                 UniqueSegmentFeature,
                                 FeatureDecoder
                                )
from voxcommunis.data import FeatureTokenizer, PhoneticFeatureDataset, SUBSAMPLE, PanPhonInventory
from voxcommunis.utils import unique_consecutive


vc_dir = Path(DATA_DIR) / "VoxCommunis"

lang = "it"
manifest_fp = vc_dir / "train-20h" / "manifests" / f"{lang}.tsv"
alignment_fp = vc_dir / "train" / "alignments" / f"{lang}.align"
encoded_dir = vc_dir / "encoded_audio_multi" / lang
emasrc_dir = encoded_dir / "emasrc"
spk_emb_dir = encoded_dir / "spk_preemb"

manifest = read_manifest(manifest_fp)
alignment = read_alignment(alignment_fp)


uusf = UniversalUniqueSegmentFeature(sum_diphthong=True)
usf = UniqueSegmentFeature(uusf.unique_segments, sum_diphthong=True)
fd = FeatureDecoder(sum_diphthong=True)

In [None]:
sample_ids = list(manifest.keys())
sample_id = sample_ids[0]

emasrc = np.load(emasrc_dir / f"{sample_id}.npy")[:,:14]
spk_preemb = np.load(spk_emb_dir / f"{sample_id}.npy")
segments = alignment[sample_id].split(" ")

#process phonemes
tokenizer = FeatureTokenizer(fd)
segments_sub = segments[::SUBSAMPLE]
ipa_phones, counts = unique_consecutive(segments_sub, return_counts=True)
feature_tensor, phones = tokenizer.encode(ipa_phones, counts)
sil_trait = (feature_tensor == 0).all(axis=1) * 2 - 1 # 1 for sil, -1 for non-sil
feature_tensor25 = torch.concat([feature_tensor, sil_trait.unsqueeze(1)], dim=1)

In [24]:
manifests_dir = vc_dir / "train-20h" / "manifests"
alignments_dir = vc_dir / "train-20h" / "alignments"

truc = PhoneticFeatureDataset(
    manifest_path=manifests_dir,
    alignment_path=alignments_dir,
    feature_tokenizer=tokenizer,
    separate_files=True,
)
phon_features, phones = truc[0]
print(phon_features.shape, len(phones))

torch.Size([61, 26]) 61
