In [37]:
import csv

meta_table = []

with open('meta_table.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        meta_table += [row]

lang_to_sizes = {}
orig_lang = {}

for name, size_str, _, _, lang in meta_table[1:]:
    # Manually handle chinese
    if lang in ["zh-TW", "zh-HK", "zht", "zh-tw"]:
        orig_lang[name] = lang
        lang = "zht"
    if lang in ["zh", "zhs", "zh-CN", "zh-cn"]:
        orig_lang[name] = lang
        lang = "zhs"
    if lang.startswith('zh') and lang not in ["zhs", "zht"]:
        raise ValueError(f"Chinese language needs to be either zhs or zht. Got {lang}")
    
    # Skip nigercongo as it was agregated in a single dataset.
    if lang.startswith("nigercongo"):
        continue

    if name in [
        "s2orc",
        "the_pile_enron_emails",
        "twitter_covid19_indonesia_sentiment_analysis",
        "lot_of_indic_tweets",
        "the_pile_stack_exchange",
        "multilingual_knowledge_questions_answers",
        "a_million_news_headlines_abc_australia",  # pseudocrawl
        "india_news_headlines_dataset",
    ]:
        continue

    if name == "opus100" and lang in [
        "en",
        "zh",
        "fr",
        "es",
    ]:
        continue

    size = int(size_str)
    if size > 0:
        lang_to_sizes[lang] = lang_to_sizes.get(lang, []) + [(name, size)]

lang_to_sizes["ar"] += [
    ("pseudocrawl-filtered_595_mawdoo3_com", int(2.2e9)),
]
lang_to_sizes["en"] += [
    ("s2orc_ai2_pdf_parses", int(273e9)),
    ('pseudocrawl-filtered_689_www_abc_net_au', 2.80e9),
    ('pseudocrawl-filtered_510_timesofindia_indiatimes_com', 2.14e9),
    ('pseudocrawl-filtered_497_www_straitstimes_com', 1.39e9),
    ('pseudocrawl-filtered_534_www_nairaland_com', 0.23e9),
    ('pseudocrawl-filtered_638_globalvoices_org', 0.74e9),
    ('pseudocrawl-filtered_498_www_channelnewsasia_com', 0.51e9),
    ('pseudocrawl-filtered_339_www_actasanitaria_com', 0.05e9),
    ('pseudocrawl-filtered_304_www_semana_com', 0.43e9),
    ('pseudocrawl-filtered_501_theindependent_sg', 0.32e9),
    ('pseudocrawl-filtered_159_www_postcrescent_com', 0.23e9),
    ('pseudocrawl-filtered_487_thesmartlocal_com', 0.23e9),
    ('pseudocrawl-filtered_696_www_oercommons_org', 0.14e9),
    ('pseudocrawl-filtered_470_forums_hardwarezone_com_sg', 0.14e9),
    ('pseudocrawl-filtered_395_www_evwind_es', 0.15e9),
    ('pseudocrawl-filtered_548_remezcla_com', 0.13e9),
    ('pseudocrawl-filtered_485_blog_moneysmart_sg', 0.12e9),
    ('pseudocrawl-filtered_619_www_qut_edu_au', 0.09e9),
    ('pseudocrawl-filtered_499_www_today_com_news', 0.11e9),
    ('pseudocrawl-filtered_492_www_vivawoman_net', 0.11e9),
    ('pseudocrawl-filtered_483_alvinology_com', 0.08e9),
    ('pseudocrawl-filtered_502_www_ricemedia_co', 0.08e9),
    ('pseudocrawl-filtered_488_dailyvanity_sg', 0.05e9),
    ('pseudocrawl-filtered_500_www_asiaone_com_singapore', 0.06e9),
]
lang_to_sizes["es"] += [
    ('pseudocrawl-filtered_100_www_aporrea_org', 3.44e9),
    ('pseudocrawl-filtered_396_www_eldiario_es', 2.99e9),
    ('pseudocrawl-filtered_341_es_cointelegraph_com', 2.74e9),
    ('pseudocrawl-filtered_20_www_clarin_com', 2.65e9),
    ('pseudocrawl-filtered_424_www_lavanguardia_com', 2.19e9),
    ('pseudocrawl-filtered_63_www_lanacion_com_ar', 2.15e9),
    ('pseudocrawl-filtered_255_elcomercio_pe', 1.97e9),
    ('pseudocrawl-filtered_333_www_elmundo_es', 1.83e9),
    ('pseudocrawl-filtered_349_www_eltiempo_com', 1.76e9),
    ('pseudocrawl-filtered_211_www_elcomercio_com', 1.60e9),
    ('pseudocrawl-filtered_267_www_elperiodico_com_es', 1.55e9),
    ('pseudocrawl-filtered_198_www_eleconomista_es', 1.39e9),
    ('pseudocrawl-filtered_181_noticiassin_com', 1.46e9),
    ('pseudocrawl-filtered_409_www_proceso_com_mx', 1.39e9),
    ('pseudocrawl-filtered_354_www_lagaceta_com_ar', 1.44e9),
    ('pseudocrawl-filtered_263_www_lasexta_com', 1.40e9),
    ('pseudocrawl-filtered_641_es_globalvoices_org', 0.82e9),
    ('pseudocrawl-filtered_146_www_perfil_com', 1.31e9),
    ('pseudocrawl-filtered_215_www_lainformacion_com', 1.38e9),
    ('pseudocrawl-filtered_518_www_elcolombiano_com', 1.02e9),
    ('pseudocrawl-filtered_189_www_eleconomista_com_mx', 1.24e9),
    ('pseudocrawl-filtered_429_cadenaser_com', 1.30e9),
    ('pseudocrawl-filtered_103_www_elmostrador_cl', 1.21e9),
    ('pseudocrawl-filtered_237_www_cronista_com', 0.94e9),
    ('pseudocrawl-filtered_248_www_telesurtv_net', 1.02e9),
    ('pseudocrawl-filtered_136_valenciaplaza_com', 1.04e9),
    ('pseudocrawl-filtered_245_www_noticiasdenavarra_com', 0.94e9),
    ('pseudocrawl-filtered_71_www_rtve_es', 0.66e9),
    ('pseudocrawl-filtered_229_www_expansion_com', 0.50e9),
    ('pseudocrawl-filtered_58_www_levante_emv_com', 0.85e9),
    ('pseudocrawl-filtered_157_www_elsoldemexico_com_mx', 0.89e9),
    ('pseudocrawl-filtered_299_www_lne_es', 0.81e9),
    ('pseudocrawl-filtered_79_www_laopiniondemurcia_es', 0.78e9),
    ('pseudocrawl-filtered_67_www_elpais_cr', 0.63e9),
    ('pseudocrawl-filtered_86_www_motorpasion_com', 0.71e9),
    ('pseudocrawl-filtered_373_www_farodevigo_es', 0.71e9),
    ('pseudocrawl-filtered_249_www_telecinco_es', 0.75e9),
    ('pseudocrawl-filtered_287_www_cibercuba_com', 0.73e9),
    ('pseudocrawl-filtered_288_www_marca_com', 0.65e9),
    ('pseudocrawl-filtered_675_www_elespectador_com', 0.59e9),
    ('pseudocrawl-filtered_213_www_hola_com', 0.61e9),
    ('pseudocrawl-filtered_209_misionesonline_net', 0.67e9),
    ('pseudocrawl-filtered_430_www_eldiario_ec', 0.64e9),
    ('pseudocrawl-filtered_381_www_cuartopoder_es', 0.66e9),
    ('pseudocrawl-filtered_324_gestion_pe', 0.64e9),
    ('pseudocrawl-filtered_203_www_que_es', 0.62e9),
    ('pseudocrawl-filtered_226_www_ole_com_ar', 0.55e9),
    ('pseudocrawl-filtered_405_www_emol_com', 0.62e9),
    ('pseudocrawl-filtered_90_peru_com', 0.47e9),
    ('pseudocrawl-filtered_257_www_diaridetarragona_com', 0.59e9),
    ('pseudocrawl-filtered_392_www_muypymes_com', 0.45e9),
    ('pseudocrawl-filtered_317_diariocorreo_pe', 0.55e9),
    ('pseudocrawl-filtered_253_www_debate_com_mx', 0.50e9),
    ('pseudocrawl-filtered_359_www_efeverde_com', 0.54e9),
    ('pseudocrawl-filtered_28_www_fayerwayer_com', 0.52e9),
    ('pseudocrawl-filtered_78_www_listindiario_com', 0.53e9),
    ('pseudocrawl-filtered_256_www_laprovincia_es', 0.54e9),
    ('pseudocrawl-filtered_232_tn_com_ar', 0.48e9),
    ('pseudocrawl-filtered_165_www_ticbeat_com', 0.55e9),
    ('pseudocrawl-filtered_401_www_elperiodicodemexico_com', 0.55e9),
    ('pseudocrawl-filtered_277_www_entornointeligente_com', 0.36e9),
    ('pseudocrawl-filtered_130_www_elperiodicomediterraneo_com', 0.49e9),
    ('pseudocrawl-filtered_44_ladiaria_com_uy', 0.48e9),
    ('pseudocrawl-filtered_220_www_vanguardia_com_mx', 0.48e9),
    ('pseudocrawl-filtered_376_www_elpopular_com_ar', 0.49e9),
    ('pseudocrawl-filtered_169_www_el_carabobeno_com', 0.45e9),
    ('pseudocrawl-filtered_315_lasillavacia_com', 0.42e9),
    ('pseudocrawl-filtered_320_www_paginasiete_bo', 0.46e9),
    ('pseudocrawl-filtered_422_www_formulatv_com', 0.31e9),
    ('pseudocrawl-filtered_116_www_latribuna_hn', 0.37e9),
    ('pseudocrawl-filtered_431_www_elperiodicoextremadura_com', 0.42e9),
    ('pseudocrawl-filtered_91_www_diario26_com', 0.43e9),
    ('pseudocrawl-filtered_325_www_laprensa_hn', 0.34e9),
    ('pseudocrawl-filtered_53_www_expreso_ec', 0.39e9),
    ('pseudocrawl-filtered_207_elimpulso_com', 0.24e9),
    ('pseudocrawl-filtered_404_www_telam_com_ar', 0.39e9),
    ('pseudocrawl-filtered_276_radio_uchile_cl', 0.32e9),
    ('pseudocrawl-filtered_172_www_rionegro_com_ar', 0.36e9),
    ('pseudocrawl-filtered_167_www_ambientum_com', 0.36e9),
    ('pseudocrawl-filtered_21_www_elperiodicodearagon_com', 0.36e9),
    ('pseudocrawl-filtered_250_www_cooperativa_cl', 0.34e9),
    ('pseudocrawl-filtered_34_www_losandes_com_ar', 0.35e9),
    ('pseudocrawl-filtered_231_ojo_pe', 0.32e9),
    ('pseudocrawl-filtered_417_www_radiolaprimerisima_com', 0.31e9),
    ('pseudocrawl-filtered_246_www_eldiarionuevodia_com_ar', 0.34e9),
    ('pseudocrawl-filtered_406_www_americaeconomia_com', 0.31e9),
    ('pseudocrawl-filtered_125_www_noticiasde_es', 0.24e9),
    ('pseudocrawl-filtered_223_www_eltambor_es', 0.33e9),
    ('pseudocrawl-filtered_23_www_elconfidencialdigital_com', 0.30e9),
    ('pseudocrawl-filtered_286_www_nacion_com', 0.30e9),
    ('pseudocrawl-filtered_182_correodelsur_com', 0.26e9),
    ('pseudocrawl-filtered_367_elcorreoweb_es', 0.31e9),
    ('pseudocrawl-filtered_118_www_elheraldo_hn', 0.27e9),
    ('pseudocrawl-filtered_294_www_laopinion_com_co', 0.28e9),
    ('pseudocrawl-filtered_386_www_prensalibre_com', 0.27e9),
    ('pseudocrawl-filtered_233_www_dinero_com', 0.27e9),
    ('pseudocrawl-filtered_254_diario_mx', 0.27e9),
    ('pseudocrawl-filtered_374_www_talcualdigital_com', 0.25e9),
    ('pseudocrawl-filtered_219_www_aguasresiduales_info', 0.23e9),
    ('pseudocrawl-filtered_420_www_retema_es', 0.24e9),
    ('pseudocrawl-filtered_244_www_df_cl', 0.20e9),
    ('pseudocrawl-filtered_32_www_elexpresso_com', 0.21e9),
    ('pseudocrawl-filtered_153_financialfood_es', 0.23e9),
    ('pseudocrawl-filtered_158_www_diariodeleon_es', 0.22e9),
    ('pseudocrawl-filtered_56_www_eluniverso_com', 0.23e9),
    ('pseudocrawl-filtered_62_www_lapagina_com_sv', 0.20e9),
    ('pseudocrawl-filtered_280_salamancartvaldia_es', 0.20e9),
    ('pseudocrawl-filtered_30_www_radiocable_com', 0.21e9),
]
lang_to_sizes["eu"] += [
    ('pseudocrawl-filtered_635_www_berria_eus', 0.23e9),
    ('pseudocrawl-filtered_637_www_argia_eus', 0.15e9),
    ('pseudocrawl-filtered_506_goiena_eus', 0.11e9),
    ('pseudocrawl-filtered_563_ahotsak_eus', 0.08e9),
]
lang_to_sizes["fr"] += [
    ("hal_archives_ouvertes", int(87e9)),
    ("pseudocrawl-filtered_550_www_lemonde_fr", int(5e9)),
    ("pseudocrawl-filtered_530_www_mediapart_fr", int(0.8e9)),
    ("pseudocrawl-filtered_599_fr_globalvoices_org", int(0.5e9)),
]
lang_to_sizes["id"] += [
    ('pseudocrawl-filtered_549_www_cnnindonesia_com', 0.54e9),
    ('pseudocrawl-filtered_572_tirto_id', 0.62e9),
    ('pseudocrawl-filtered_545_www_detik_com', 0.31e9),
    ('pseudocrawl-filtered_512_kumparan_com', 0.21e9),
]
lang_to_sizes["indic-hi"] += [
    ('pseudocrawl-filtered_667_www_bhaskar_com', 1.45e9),
    ('pseudocrawl-filtered_515_www_aajtak_in', 0.49e9),
]
lang_to_sizes["pt"] += [
    ('pseudocrawl-filtered_672_pt_globalvoices_org', 0.22e9),
]
lang_to_sizes["zhs"] += [
    ('pseudocrawl-filtered_503_www_zaobao_com_sg', 0.28e9),
    ('pseudocrawl-filtered_674_ai_baidu_com', 0.06e9),
]


nigercongo_sizes = [
    ('sw', 264036370),
    ('yo', 70039249),
    ('rw', 42145941),
    ('xh', 14467934),
    ('ig', 12520386),
    ('zu', 8959591),
    ('sn', 7489361),
    ('lg', 4538572),
    ('wo', 3526807),
    ('rn', 3216020),
    ('fon', 1973909),
    ('nso', 1932754),
    ('ln', 1631248),
    ('tn', 1533108),
    ('tw', 1241938),
    ('ny', 1194349),
    ('st', 754055),
    ('ts', 725380),
    ('ak', 670157),
    ('bm', 385450),
    ('ki', 362345),
    ('tum', 179586),
]

for ln, ct in nigercongo_sizes:
    lang_to_sizes[f"nigercongo-{ln}"] = [(f"aggregated", ct)]

for lang in lang_to_sizes:
    lang_to_sizes[lang] = sorted(lang_to_sizes[lang], key=lambda x:x[1], reverse=True)
    
lang_to_sizes

{'ar': [('openiti', 17401602240),
  ('arabic_billion_words', 16870246181),
  ('openiti_proc', 14213998620),
  ('uncorpus', 5173513262),
  ('open_subtitles', 3691899354),
  ('wikipedia', 2924624512),
  ('multi_un_2', 2670175122),
  ('pseudocrawl-filtered_595_mawdoo3_com', 2200000000),
  ('wikisource_filtered', 1256182810),
  ('tashkeela', 1081110237),
  ('sanad', 581959127),
  ('ksucca', 460038217),
  ('brad_2', 396674847),
  ('osac', 238021605),
  ('kalimat', 94725838),
  ('ted_talks_iwslt', 93467133),
  ('opus100', 92706048),
  ('wiktionary_filtered', 56547744),
  ('labr', 38192464),
  ('wikinews_filtered', 37951837),
  ('habibi', 30257750),
  ('wikibooks_filtered', 14120300),
  ('wikiquote_filtered', 12225315),
  ('qedcorpus', 10622059),
  ('arabench', 7837368),
  ('wikiversity_filtered', 5353500)],
 'ca': [('catalan_textual_corpus', 10890745832),
  ('catalan_general_crawling', 2689414189),
  ('wikipedia', 1822564157),
  ('tecla', 322436076),
  ('catalan_government_crawling', 2582274

In [38]:
lang_aggreg = sorted(
    [(ln, sum([ct for n, ct in ls])) for ln, ls in lang_to_sizes.items()],
    key=lambda x:x[1],
    reverse=True,
)

for ln, ct in lang_aggreg:
    print(f"{ln:9s} - {ct / 1e9:.3f} GB")

en        - 797.914 GB
zhs       - 214.224 GB
code      - 188.094 GB
fr        - 126.752 GB
es        - 99.346 GB
ar        - 69.654 GB
id        - 52.742 GB
vi        - 36.370 GB
indic-hi  - 29.491 GB
pt        - 24.853 GB
indic-bn  - 20.755 GB
ca        - 16.585 GB
indic-ta  - 11.417 GB
indic-ml  - 5.985 GB
indic-te  - 5.747 GB
indic-kn  - 4.767 GB
indic-ne  - 4.090 GB
indic-ur  - 3.806 GB
indic-mr  - 3.531 GB
indic-pa  - 2.996 GB
indic-gu  - 2.581 GB
eu        - 2.324 GB
indic-or  - 1.271 GB
zht       - 0.946 GB
indic-as  - 0.315 GB
nigercongo-sw - 0.264 GB
nigercongo-yo - 0.070 GB
nigercongo-rw - 0.042 GB
nigercongo-xh - 0.014 GB
nigercongo-ig - 0.013 GB
nigercongo-zu - 0.009 GB
nigercongo-sn - 0.007 GB
nigercongo-lg - 0.005 GB
nigercongo-wo - 0.004 GB
nigercongo-rn - 0.003 GB
nigercongo-fon - 0.002 GB
nigercongo-nso - 0.002 GB
nigercongo-ln - 0.002 GB
nigercongo-tn - 0.002 GB
nigercongo-tw - 0.001 GB
nigercongo-ny - 0.001 GB
nigercongo-st - 0.001 GB
nigercongo-ts - 0.001 GB
nigerc

In [39]:
# Thomas: My implementation of alpha sampling
from typing import List, Tuple
alpha = 0.3

# Compute alpha sampling on languages https://arxiv.org/pdf/1901.07291.pdf
def alpha_probabilities(language_sizes: List[Tuple[str, int]], alpha: float) -> List[Tuple[str, float]]:
    # canonical probabilities
    canonical_normalisation_factor = sum([elt[1] for elt in language_sizes])
    canonical_probabilities = [(elt[0], elt[1]/canonical_normalisation_factor) for elt in language_sizes]
    
    # alpha 
    alpha_normalisation_factor = sum([elt[1] ** alpha for elt in canonical_probabilities])
    alpha_probabilities = [(elt[0], elt[1] ** alpha /alpha_normalisation_factor) for elt in canonical_probabilities]
    return alpha_probabilities, canonical_probabilities

alpha_probabilities, canonical_probabilities = alpha_probabilities(lang_aggreg, alpha)
for (lang, alpha), (_, canon) in zip(alpha_probabilities, canonical_probabilities):
    print(f"{lang:15s} - {alpha:.2e}")

en              - 1.09e-01 4.62e-01
zhs             - 7.32e-02 1.24e-01
code            - 7.04e-02 1.09e-01
fr              - 6.25e-02 7.34e-02
es              - 5.81e-02 5.75e-02
ar              - 5.23e-02 4.03e-02
id              - 4.81e-02 3.05e-02
vi              - 4.30e-02 2.11e-02
indic-hi        - 4.04e-02 1.71e-02
pt              - 3.84e-02 1.44e-02
indic-bn        - 3.63e-02 1.20e-02
ca              - 3.40e-02 9.60e-03
indic-ta        - 3.04e-02 6.61e-03
indic-ml        - 2.50e-02 3.47e-03
indic-te        - 2.47e-02 3.33e-03
indic-kn        - 2.34e-02 2.76e-03
indic-ne        - 2.23e-02 2.37e-03
indic-ur        - 2.18e-02 2.20e-03
indic-mr        - 2.14e-02 2.04e-03
indic-pa        - 2.03e-02 1.73e-03
indic-gu        - 1.94e-02 1.49e-03
eu              - 1.88e-02 1.35e-03
indic-or        - 1.57e-02 7.36e-04
zht             - 1.44e-02 5.48e-04
indic-as        - 1.03e-02 1.82e-04
nigercongo-sw   - 9.81e-03 1.53e-04
nigercongo-yo   - 6.59e-03 4.06e-05
nigercongo-rw   - 5.66e-03 2

In [55]:
# Find which dataset constrains the sampling
def get_constained_language(language_sizes: List[Tuple[str, int]], alpha_probabilities: List[Tuple[str,float]]) -> int:
    selected_index = 0
    found_constrained_language_index = False
    while not found_constrained_language_index:
        selected_size = language_sizes[selected_index][1]
        selected_alpha = alpha_probabilities[selected_index][1]
        found_constrained_language_index = True
        for index, ((lang1, alpha), (lang2, size)) in enumerate(zip(alpha_probabilities, language_sizes)):
            assert lang1 == lang2
            expected_size = int(selected_size * alpha / selected_alpha)
            if expected_size > size:
                selected_index = index
                found_constrained_language_index = False
                break
    return selected_index

constrained_language_index = get_constained_language(lang_aggreg, alpha_probabilities)
print(lang_aggreg[constrained_language_index])

# Sanity check all the sizes are smaller than the expected one
constrained_size = lang_aggreg[constrained_language_index][1]
constrained_alpha = alpha_probabilities[constrained_language_index][1]
constrained_ratio = constrained_size / constrained_alpha
for (lang1, alpha), (lang2, size) in zip(alpha_probabilities, lang_aggreg):
    assert lang1 == lang2
    expected_size = alpha * constrained_ratio
    assert expected_size <= size

('nigercongo-tum', 179586)


In [66]:
# Get accumulated dataset give current alpha ratio
alpha_dataset = [(lang, alpha * constrained_ratio) for (lang, alpha)in alpha_probabilities]
current_size = sum([size for (_, size) in alpha_dataset])

for (lang1, size), (lang2, original_size) in zip(alpha_dataset, lang_aggreg):
    print(f"{lang:15s} - {size / 1e9:.3f} GB - percentage conserved: {size / original_size} - Original size: {original_size / 1e9:.3f} GB")

print(current_size)

nigercongo-tum  - 0.018 GB - percentage conserved: 2.2213723649024766e-05 - Original size: 797.914 GB
nigercongo-tum  - 0.012 GB - percentage conserved: 5.5767781074433704e-05 - Original size: 214.224 GB
nigercongo-tum  - 0.011 GB - percentage conserved: 6.108408190702272e-05 - Original size: 188.094 GB
nigercongo-tum  - 0.010 GB - percentage conserved: 8.052351225631972e-05 - Original size: 126.752 GB
nigercongo-tum  - 0.009 GB - percentage conserved: 9.54963864775861e-05 - Original size: 99.346 GB
nigercongo-tum  - 0.009 GB - percentage conserved: 0.00012244139178653886 - Original size: 69.654 GB
nigercongo-tum  - 0.008 GB - percentage conserved: 0.0001487583331056574 - Original size: 52.742 GB
nigercongo-tum  - 0.007 GB - percentage conserved: 0.00019295967857551043 - Original size: 36.370 GB
nigercongo-tum  - 0.007 GB - percentage conserved: 0.00022346404596399236 - Original size: 29.491 GB
nigercongo-tum  - 0.006 GB - percentage conserved: 0.00025189715841525596 - Original size: 2

In [87]:
# Clip the dataset so that high ressource have the same amount of data
target_size = int(800e9)
target_size = int(160000000)
assert target_size < current_size, f"Current size: {current_size}, and target: {target_size}, {current_size / target_size}"

import numpy as np

# Find breaking point, ie index at which cumulative sum exceeds current_size
increasing_alpha_lang = alpha_dataset[::-1]
tmp_size = 0
previous_size = 0
for i,(_, size) in enumerate(increasing_alpha_lang):
    # We simulate if we add `size` for each language that still have data left.
    tmp_size += i * (size - previous_size)
    previous_size = size
    if tmp_size > target_size:
        breaking_index = i
        clipped_size = size - (tmp_size - target_size) / i

print(clipped_size)
clipped_alpha_dataset = [(lang, min(size, clipped_size)) for (lang, size) in alpha_dataset]
norm = sum([size for (_, size) in clipped_alpha_dataset])
for (lang, size)in clipped_alpha_dataset:
    print(f"{lang:15s} - {size / norm * 800:.1f} GB")

print(f"Target size: {target_size}. Computed size: {sum([size for (_, size) in clipped_alpha_dataset])}")

6641118.405033341
en              - 40.2 GB
zhs             - 40.2 GB
code            - 40.2 GB
fr              - 40.2 GB
es              - 40.2 GB
ar              - 40.2 GB
id              - 40.2 GB
vi              - 40.2 GB
indic-hi        - 39.9 GB
pt              - 37.9 GB
indic-bn        - 35.9 GB
ca              - 33.6 GB
indic-ta        - 30.0 GB
indic-ml        - 24.7 GB
indic-te        - 24.4 GB
indic-kn        - 23.1 GB
indic-ne        - 22.1 GB
indic-ur        - 21.6 GB
indic-mr        - 21.1 GB
indic-pa        - 20.1 GB
indic-gu        - 19.2 GB
eu              - 18.6 GB
indic-or        - 15.5 GB
zht             - 14.2 GB
indic-as        - 10.2 GB
nigercongo-sw   - 9.7 GB
nigercongo-yo   - 6.5 GB
nigercongo-rw   - 5.6 GB
nigercongo-xh   - 4.1 GB
nigercongo-ig   - 3.9 GB
nigercongo-zu   - 3.5 GB
nigercongo-sn   - 3.3 GB
nigercongo-lg   - 2.9 GB
nigercongo-wo   - 2.7 GB
nigercongo-rn   - 2.6 GB
nigercongo-fon  - 2.2 GB
nigercongo-nso  - 2.2 GB
nigercongo-ln   - 2.1 GB
nigerco

In [29]:
target_size = int(800e9)
# target_size = int(600e9)

num_dsets = len(lang_aggreg)

ratios = {}
with_samples = []

# lang_aggreg: List[Tuple[str, int]]
prev_size = 0
current_size = 0
for i, (name, size) in enumerate(sorted(lang_aggreg, key=lambda x:x[1])):
    if current_size < target_size:
        add_size = min(size - prev_size, (target_size - current_size) // (num_dsets - i) + 1)
        current_size += add_size * (num_dsets - i)
        prev_size += add_size
    with_samples += [(name, size, prev_size)]
    
lang_to_target = {}
for ln, _, keep_size in sorted(with_samples, key=lambda x:x[1], reverse=True):
    lang_to_target[ln] = keep_size
    print(f"{ln:10s} - {keep_size / 1e9:.3f} GB")
    
print(f"{sum([elt[1] for elt in with_samples])/1e9:.3f}")

# Quentin version of whatever the previous loop does
prev_size = 0
current_size = 0
with_samples = []
for i, (name, size) in enumerate(sorted(lang_aggreg, key=lambda x:x[1])):
    if prev_size < target_size:
        add_size = min(size, (target_size - current_size) // (num_dsets - i) + 1)
        current_size += add_size * (num_dsets - i)
        prev_size += add_size
    with_samples += [(name, size, prev_size)]

lang_to_target = {}
for ln, _, keep_size in sorted(with_samples, key=lambda x:x[1], reverse=True):
    lang_to_target[ln] = keep_size
    print(f"{ln:10s} - {keep_size / 1e9:.3f} GB")

print(f"{sum([elt[1] for elt in with_samples])/1e9:.3f}")

en         - 99.996 GB
zhs        - 99.996 GB
code       - 99.996 GB
fr         - 99.996 GB
es         - 99.346 GB
ar         - 69.654 GB
id         - 52.742 GB
vi         - 36.370 GB
indic-hi   - 29.491 GB
pt         - 24.853 GB
indic-bn   - 20.755 GB
ca         - 16.585 GB
indic-ta   - 11.417 GB
indic-ml   - 5.985 GB
indic-te   - 5.747 GB
indic-kn   - 4.767 GB
indic-ne   - 4.090 GB
indic-ur   - 3.806 GB
indic-mr   - 3.531 GB
indic-pa   - 2.996 GB
indic-gu   - 2.581 GB
eu         - 2.324 GB
indic-or   - 1.271 GB
zht        - 0.946 GB
indic-as   - 0.315 GB
nigercongo-sw - 0.264 GB
nigercongo-yo - 0.070 GB
nigercongo-rw - 0.042 GB
nigercongo-xh - 0.014 GB
nigercongo-ig - 0.013 GB
nigercongo-zu - 0.009 GB
nigercongo-sn - 0.007 GB
nigercongo-lg - 0.005 GB
nigercongo-wo - 0.004 GB
nigercongo-rn - 0.003 GB
nigercongo-fon - 0.002 GB
nigercongo-nso - 0.002 GB
nigercongo-ln - 0.002 GB
nigercongo-tn - 0.002 GB
nigercongo-tw - 0.001 GB
nigercongo-ny - 0.001 GB
nigercongo-st - 0.001 GB
nigercongo

In [9]:
tokenization_ratios = []

total_size = 0

for lang, sizes in lang_to_sizes.items():
    ln_target_size = lang_to_target[lang]
    current_size = 0
    prev_size = 0
    num_dsets = len(sizes)
    for i, (name, size) in enumerate(sorted(sizes, key=lambda x:x[1])):
        if current_size < ln_target_size:
            add_size = min(size - prev_size, (ln_target_size - current_size) // (num_dsets - i) + 1)
            current_size += add_size * (num_dsets - i)
            prev_size += add_size
        ln = orig_lang.get(name, lang) if lang == "zh" else lang
        tokenization_ratios += [
            {
                'dataset_path': f"bigscience-catalogue-lm-data/lm_{ln}_{name}",
                'is_catalogue': True,
                'ratio': prev_size / target_size
            }]
    total_size += current_size

for dct in sorted(tokenization_ratios, key=lambda dct: dct['ratio'], reverse=True):
    print(f"{dct['ratio']:.2e} - {dct['dataset_path'].split('/')[1]}")

1.12e-01 - lm_zhs_wudaocorpora
8.81e-02 - lm_code_github-no-gpl
7.53e-02 - lm_fr_hal_archives_ouvertes
6.03e-02 - lm_id_indo4b
3.74e-02 - lm_vi_binhvq_news_corpus
3.68e-02 - lm_code_stackexchange
2.24e-02 - lm_pt_brwac
2.18e-02 - lm_ar_openiti
2.11e-02 - lm_ar_arabic_billion_words
1.79e-02 - lm_indic-bn_bangla_lm
1.78e-02 - lm_ar_openiti_proc
1.77e-02 - lm_indic-hi_indic_nlp_corpus
1.61e-02 - lm_fr_wikisource_filtered
1.36e-02 - lm_ca_catalan_textual_corpus
1.21e-02 - lm_indic-hi_iitb_english_hindi_corpus
1.10e-02 - lm_en_wikipedia
1.10e-02 - lm_en_open_subtitles
1.10e-02 - lm_en_project_gutenberg
1.10e-02 - lm_en_the_pile_uspto
1.10e-02 - lm_en_s2orc_ai2_abstracts
1.10e-02 - lm_en_pmc
1.10e-02 - lm_en_arxiv
1.10e-02 - lm_en_s2orc_ai2_pdf_parses
9.56e-03 - lm_indic-ta_indic_nlp_corpus
8.83e-03 - lm_es_open_subtitles
7.29e-03 - lm_en_no_code_stackexchange
6.67e-03 - lm_fr_uncorpus
6.47e-03 - lm_ar_uncorpus
6.25e-03 - lm_fr_pseudocrawl-filtered_550_www_lemonde_fr
5.81e-03 - lm_es_uncorpu

In [10]:
orig_lang

{'du_reader': 'zh',
 'multi_un_2': 'zh',
 'open_subtitles': 'zh',
 'opus100': 'zh',
 'project_gutenberg': 'zh',
 'ted_talks_iwslt': 'zh',
 'uncorpus': 'zh',
 'wudaocorpora': 'zh',
 'multilingual_knowledge_questions_answers': 'zh-TW',
 'qedcorpus': 'zht',
 'bloom': 'zh',
 'wikipedia': 'zh-tw',
 'wikivoyage_filtered': 'zh',
 'wiktionary_filtered': 'zh',
 'wikiversity_filtered': 'zh',
 'wikinews_filtered': 'zh',
 'wikibooks_filtered': 'zh',
 'wikiquote_filtered': 'zh'}

In [11]:
import json
json.dump(tokenization_ratios, open("bigscience_data_v1_ratios_800GB_zh_aggregated.json", "w", encoding="utf-8"), indent=2)

In [12]:
lang_to_target_alphas = {}

tok_total = int(30e9)

lang_to_target_alpha_ratios = {}
for ln, keep_size in lang_to_target.items():
    lang_to_target_alpha_ratios[ln] = (keep_size / target_size) ** 0.3

alpha_tot_ratios = sum(lang_to_target_alpha_ratios.values())
for ln, r in lang_to_target_alpha_ratios.items():
    ln_ratio = r / alpha_tot_ratios
    print(f"{ln:15s} - {ln_ratio:.2e} %")
    lang_to_target_alphas[ln] = ln_ratio * tok_total

en              - 6.34e-02 %
zhs             - 6.34e-02 %
code            - 6.34e-02 %
fr              - 6.34e-02 %
es              - 6.33e-02 %
ar              - 5.69e-02 %
id              - 5.24e-02 %
vi              - 4.68e-02 %
indic-hi        - 4.40e-02 %
pt              - 4.18e-02 %
indic-bn        - 3.96e-02 %
ca              - 3.70e-02 %
indic-ta        - 3.31e-02 %
indic-ml        - 2.73e-02 %
indic-te        - 2.69e-02 %
indic-kn        - 2.55e-02 %
indic-ne        - 2.43e-02 %
indic-ur        - 2.38e-02 %
indic-mr        - 2.33e-02 %
indic-pa        - 2.21e-02 %
indic-gu        - 2.12e-02 %
eu              - 2.05e-02 %
indic-or        - 1.71e-02 %
zht             - 1.57e-02 %
indic-as        - 1.13e-02 %
nigercongo-sw   - 1.07e-02 %
nigercongo-yo   - 7.18e-03 %
nigercongo-rw   - 6.16e-03 %
nigercongo-xh   - 4.47e-03 %
nigercongo-ig   - 4.28e-03 %
nigercongo-zu   - 3.87e-03 %
nigercongo-sn   - 3.67e-03 %
nigercongo-lg   - 3.16e-03 %
nigercongo-wo   - 2.93e-03 %
nigercongo-rn 

In [106]:
tokenization_ratios = []

total_size = 0

for lang, sizes in lang_to_sizes.items():
    ln_target_size = lang_to_target_alphas[lang]
    current_size = 0
    prev_size = 0
    num_dsets = len(sizes)
    for i, (name, size) in enumerate(sorted(sizes, key=lambda x:x[1])):
        if current_size < ln_target_size:
            add_size = min(size - prev_size, (ln_target_size - current_size) // (num_dsets - i) + 1)
            current_size += add_size * (num_dsets - i)
            prev_size += add_size
        tokenization_ratios += [
            {
                'dataset_path': f"bigscience-catalogue-lm-data/lm_{lang}_{name}",
                'is_catalogue': True,
                'ratio': prev_size / target_size
            }]
    total_size += current_size

for dct in sorted(tokenization_ratios, key=lambda dct: dct['ratio'], reverse=True):
    print(f"{dct['ratio']:.2e} - {dct['dataset_path'].split('/')[1]}")

1.16e-03 - lm_code_stackexchange
1.16e-03 - lm_code_github-no-gpl
8.89e-04 - lm_indic-ne_unsupervised_cross_lingual_representation_learning_at_scale
5.72e-04 - lm_zh-tw_wikipedia
5.72e-04 - lm_zh-cn_wikipedia
3.30e-04 - lm_nigercongo-sw_lm_nigercongo-sw_aggregated
2.89e-04 - lm_zh_wiktionary_filtered
2.89e-04 - lm_zh_open_subtitles
2.89e-04 - lm_zh_multi_un_2
2.89e-04 - lm_zh_uncorpus
2.89e-04 - lm_zh_du_reader
2.89e-04 - lm_zh_wudaocorpora
2.77e-04 - lm_indic-as_wikisource_filtered
2.46e-04 - lm_eu_wikipedia
2.46e-04 - lm_eu_bsbasque
2.21e-04 - lm_indic-hi_wikipedia
2.21e-04 - lm_indic-hi_wikisource_filtered
2.21e-04 - lm_indic-hi_samanantar
2.21e-04 - lm_indic-hi_iitb_english_hindi_corpus
2.21e-04 - lm_indic-hi_indic_nlp_corpus
2.12e-04 - lm_indic-bn_opus100
2.12e-04 - lm_indic-bn_wikipedia
2.12e-04 - lm_indic-bn_samanantar
2.12e-04 - lm_indic-bn_indic_nlp_corpus
2.12e-04 - lm_indic-bn_wikisource_filtered
2.12e-04 - lm_indic-bn_bangla_lm
2.12e-04 - lm_indic-kn_wikipedia
2.12e-04 - lm

In [107]:
import json
json.dump(tokenization_ratios, open("bigscience_tokenization_ratios_alpha_30GB.json", "w", encoding="utf-8"), indent=2)

In [108]:
tokenization_ratios = []

total_size = 0

for lang, sizes in lang_to_sizes.items():
    ln_target_size = lang_to_target_alphas[lang]
    current_size = 0
    prev_size = 0
    num_dsets = len(sizes)
    for i, (name, size) in enumerate(sorted(sizes, key=lambda x:x[1])):
        if current_size < ln_target_size:
            add_size = min(size - prev_size, (ln_target_size - current_size) // (num_dsets - i) + 1)
            current_size += add_size * (num_dsets - i)
            prev_size += add_size
        tokenization_ratios += [
            {
                'dataset_path': f"bigscience-catalogue-lm-data/lm_{lang}_{name}",
                'is_catalogue': True,
                'ratio': prev_size / size
            }]
    total_size += current_size

for dct in sorted(tokenization_ratios, key=lambda dct: dct['ratio'], reverse=True):
    print(f"{dct['ratio']:.2e} - {dct['dataset_path'].split('/')[1]}")

1.00e+00 - lm_ar_multilingual_knowledge_questions_answers
1.00e+00 - lm_ar_wikiversity_filtered
1.00e+00 - lm_ar_arabench
1.00e+00 - lm_ar_qedcorpus
1.00e+00 - lm_ar_wikiquote_filtered
1.00e+00 - lm_ar_wikibooks_filtered
1.00e+00 - lm_ar_habibi
1.00e+00 - lm_ar_wikinews_filtered
1.00e+00 - lm_ar_labr
1.00e+00 - lm_ar_wiktionary_filtered
1.00e+00 - lm_ca_xquad_ca
1.00e+00 - lm_ca_sts_ca
1.00e+00 - lm_ca_vilaquad
1.00e+00 - lm_ca_viquiquad
1.00e+00 - lm_ca_wikimedia_filtered
1.00e+00 - lm_ca_enriched_conllu_ancora_for_ml_training
1.00e+00 - lm_ca_teca
1.00e+00 - lm_ca_wikiquote_filtered
1.00e+00 - lm_ca_ted_talks_iwslt
1.00e+00 - lm_ca_wikibooks_filtered
1.00e+00 - lm_ca_wikinews_filtered
1.00e+00 - lm_ca_open_subtitles
1.00e+00 - lm_ca_tv3_parla
1.00e+00 - lm_ca_opus100
1.00e+00 - lm_ca_parlament_parla
1.00e+00 - lm_ca_ca_text_corpus
1.00e+00 - lm_ca_wikisource_filtered
1.00e+00 - lm_en_book_dash_books
1.00e+00 - lm_en_multilingual_knowledge_questions_answers
1.00e+00 - lm_en_scielo
1.0

In [109]:
json.dump(tokenization_ratios, open("bigscience_tokenization_ratios_alpha_30GB_sampling_format.json", "w", encoding="utf-8"), indent=2)

In [None]:
tokenization_ratios = []

per_lang_tot = {}

total_size = 0

for lang, sizes in lang_to_sizes.items():
    target_size = min(int(1.5e11), sum([x[1] for x in sizes]))
    current_size = 0
    prev_size = 0
    num_dsets = len(sizes)
    with_samples = []
    for i, (name, size) in enumerate(sorted(sizes, key=lambda x:x[1])):
        if current_size < target_size:
            add_size = min(size - prev_size, (target_size - current_size) // (num_dsets - i) + 1)
            current_size += add_size * (num_dsets - i)
            prev_size += add_size
        with_samples += [(name, size, prev_size, prev_size/size)]
        tokenization_ratios += [
            {
                'dataset_path': f"bigscience-catalogue-lm-data/lm_{lang}_{name}",
                'is_catalogue': True,
                'ratio': prev_size / size
            }]
    total_size += current_size
    per_lang_tot[lang] = per_lang_tot.get(lang, 0) + current_size
    print(lang, f"{current_size / 1e9:.3f}")

tokenization_ratios += [{
    'dataset_path': f"bigscience-catalogue-lm-data/tokenization_nigercongo",
    'is_catalogue': True,
    'ratio': 1.
}]
per_lang_tot["nigercongo"] = 600000000


print(f"------ Total: {total_size / 1e9:.3f} GB")
tokenization_ratios

In [85]:
import re

id_re = re.compile(r'\b[A-Za-z]*(?:[-\.]*\d){6,}\b')

In [98]:
st = """
the war of 1914-18 was called the Great War
"""

print(re.sub(id_re, "<ID>", st))


the war of <ID> was called the Great War

