In [1]:
import json
import os

In [2]:
# Directory whose files are each loaded as one JSON document; the file stem
# is used as the accent key in `acc_results`.
acc_folder = "../../data/en/acc_split"
# Directory whose files are read line by line with one JSON object per line;
# file stems are split on "_" into a prefix and an accent key.
train_folder = "../../data/en/train_split"

In [3]:
# Per-accent statistics: for every file in `acc_folder`, count its records and
# add up their "duration" fields.
# Result shape: {accent: {"n_samples": int, "dur": summed duration / 3600}}
# (hours, assuming "duration" is stored in seconds -- TODO confirm).
acc_results = dict()
for file in os.listdir(acc_folder):
    # The file name without its extension is the accent key.
    acc = os.path.splitext(file)[0]
    # Use a context manager so the handle is closed as soon as the document is
    # parsed instead of being left open until garbage collection.
    with open(os.path.join(acc_folder, file)) as f:
        data = json.load(f)
    acc_results[acc] = {
        "n_samples": len(data),
        "dur": round(sum(obj["duration"] for obj in data) / 3600, 2),
    }


In [4]:
# Show the per-accent sample counts and summed durations computed above.
acc_results

{'de': {'n_samples': 40897, 'dur': 79.01},
 'uk': {'n_samples': 134126, 'dur': 178.34},
 'hk': {'n_samples': 4260, 'dur': 5.78},
 'us': {'n_samples': 382626, 'dur': 520.61},
 'sg': {'n_samples': 3365, 'dur': 4.71},
 'au': {'n_samples': 51108, 'dur': 71.39},
 'ni': {'n_samples': 5968, 'dur': 7.96},
 'in': {'n_samples': 99613, 'dur': 148.05},
 'ca': {'n_samples': 59342, 'dur': 85.63},
 'za': {'n_samples': 8374, 'dur': 11.58},
 'nz': {'n_samples': 11877, 'dur': 15.75},
 'ph': {'n_samples': 5105, 'dur': 7.36},
 'ie': {'n_samples': 9461, 'dur': 12.98},
 'sc': {'n_samples': 15474, 'dur': 24.46}}

In [5]:
# Per-accent, per-prefix statistics from the line-delimited JSON files in
# `train_folder`.
# Result shape: {accent: {prefix: {"n_samples": int, "dur": summed duration / 3600}}}
# (hours, assuming "duration" is stored in seconds -- TODO confirm).
#
# Processing is best-effort: an entry that cannot be parsed or read is reported
# and skipped, and it leaves no (empty) accent entry behind in `train_results`.
train_results = dict()
for file in os.listdir(train_folder):
    fname = os.path.join(train_folder, file)
    try:
        # The stem must be exactly "<prefix>_<accent>"; any other shape raises
        # ValueError on unpacking and the entry is skipped below.
        prepend, acc = os.path.splitext(file)[0].split("_")
        dur, n_samples = 0, 0
        # Context manager closes the handle even if a line fails to parse.
        with open(fname) as f:
            for line in f:
                n_samples += 1
                dur += json.loads(line)["duration"]
    except (OSError, ValueError, KeyError, TypeError) as exc:
        # OSError: unreadable entry (e.g. a directory inside the folder).
        # ValueError: unexpected file name, invalid JSON or undecodable text.
        # KeyError / TypeError: a record without a usable "duration" field.
        print(f"skipping {fname}: {exc!r}")
        continue
    # Register the accent only once the whole file has been read successfully.
    train_results.setdefault(acc, dict())[prepend] = {
        "n_samples": n_samples,
        "dur": round(dur / 3600, 2)
    }


In [6]:
# Show the per-accent, per-prefix sample counts and summed durations computed above.
train_results

{'ni': {'test': {'n_samples': 5968, 'dur': 7.96}},
 'uk': {'train': {'n_samples': 107301, 'dur': 142.74},
  'test': {'n_samples': 26825, 'dur': 35.6}},
 'us': {'test': {'n_samples': 32402, 'dur': 44.13},
  'train': {'n_samples': 129608, 'dur': 176.48},
  'pretrain': {'n_samples': 220616, 'dur': 300.0}},
 'au': {'test': {'n_samples': 51108, 'dur': 71.39}},
 'nz': {'test': {'n_samples': 11877, 'dur': 15.75}},
 'ca': {'train': {'n_samples': 47474, 'dur': 68.57},
  'test': {'n_samples': 11868, 'dur': 17.06}},
 'hk': {'test': {'n_samples': 4260, 'dur': 5.78}},
 'in': {'test': {'n_samples': 19923, 'dur': 29.61},
  'train': {'n_samples': 79690, 'dur': 118.44}},
 'sc': {'test': {'n_samples': 15474, 'dur': 24.46}},
 'sg': {'test': {'n_samples': 3365, 'dur': 4.71}},
 'de': {'train': {'n_samples': 32718, 'dur': 63.19},
  'test': {'n_samples': 8179, 'dur': 15.82}},
 'ph': {'test': {'n_samples': 5105, 'dur': 7.36}},
 'ie': {'test': {'n_samples': 9461, 'dur': 12.98}},
 'za': {'test': {'n_samples': 8

In [7]:
# Compute the distribution of data across accents.
# `hours[i]` is the sum of the "dur" values of every entry stored under the
# i-th accent of `train_results` (same order as iterating the dict).
hours = [
    sum(split["dur"] for split in splits.values())
    for splits in train_results.values()
]

# The grand total is loop-invariant, so compute it once.
total_hours = sum(hours)

# One line per accent: name, hours (1 decimal), share of the total (2 decimals).
for acc, acc_hours in zip(train_results, hours):
    # Guard against an all-zero total so the cell does not raise ZeroDivisionError.
    share = acc_hours / total_hours if total_hours else 0.0
    print(acc, round(acc_hours, 1), round(share, 2))

ni 8.0 0.01
uk 178.3 0.15
us 520.6 0.44
au 71.4 0.06
nz 15.8 0.01
ca 85.6 0.07
hk 5.8 0.0
in 148.1 0.13
sc 24.5 0.02
sg 4.7 0.0
de 79.0 0.07
ph 7.4 0.01
ie 13.0 0.01
za 11.6 0.01
