# Clone only one seed of the dataset

If you want to load a different seed, this is where you need to change the id. Be careful: the files for a single seed range in size from 2.5K to 48G (compressed).

Run the following commands in your terminal once to set up the repo:
```
huggingface-cli login

git config --global credential.helper store
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/bigscience-catalogue-data/pseudo_crawl

cd pseudo_crawl/
```

You can then download the seeds one at a time (for example here with 102) with:

```
%env SEED_ID=102
!GIT_TRACE=1 git lfs pull -I "*seed_id=$SEED_ID/text__html/*"
```

Specifically, for the Niger-Congo languages processed in this notebook:
```
for SEED_ID in 622 698 699 700 701 702 703 704 705 706
do
    GIT_TRACE=1 git lfs pull -I "*seed_id=$SEED_ID/text__html/*"
done
```

# Import Python packages

In [1]:
import os
import datasets
import json

from datasets import load_dataset, Features
from pprint import pprint
from tqdm import tqdm

## Run some utilities to set the features of our dataset

`features` is copy-pasted from `dataset_info.json`

In [2]:
# `null` is bound to None so that the JSON pasted below evaluates as a Python literal
# without any edits.
null = None
# Column schema of the crawl records, in the serialized form used by `dataset_info.json`:
# each leaf is a {"dtype", "id", "_type"} dict, a one-element list wraps the schema of
# list-valued columns, and nested dicts describe struct-like columns.
features = {
    "HtmlPreprocessor_error": {
      "dtype": "int64",
      "id": null,
      "_type": "Value"
    },
    "HtmlPreprocessor_error_comment": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "content_languages": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "content_mime_detected": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "depth": {
      "dtype": "int16",
      "id": null,
      "_type": "Value"
    },
    "download_exception": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "external_urls": [
      {
        "dtype": "string",
        "id": null,
        "_type": "Value"
      }
    ],
    "fetch_redirect": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "fetch_status": {
      "dtype": "int32",
      "id": null,
      "_type": "Value"
    },
    "fetch_time": {
      "dtype": "timestamp[ns]",
      "id": null,
      "_type": "Value"
    },
    "html_error": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "html_footer": [
      {
        "dtype": "string",
        "id": null,
        "_type": "Value"
      }
    ],
    "html_head": [
      {
        "dtype": "string",
        "id": null,
        "_type": "Value"
      }
    ],
    "html_str": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "html_title": [
      {
        "dtype": "string",
        "id": null,
        "_type": "Value"
      }
    ],
    "metadata_html": [
      {
        "char_end_idx": {
          "dtype": "int64",
          "id": null,
          "_type": "Value"
        },
        "char_start_idx": {
          "dtype": "int64",
          "id": null,
          "_type": "Value"
        },
        "html_attrs": {
          "attrs": [
            {
              "dtype": "string",
              "id": null,
              "_type": "Value"
            }
          ],
          "values": [
            {
              "dtype": "string",
              "id": null,
              "_type": "Value"
            }
          ]
        },
        "key": {
          "dtype": "string",
          "id": null,
          "_type": "Value"
        },
        "relative_end_pos": {
          "dtype": "int64",
          "id": null,
          "_type": "Value"
        },
        "relative_start_pos": {
          "dtype": "int64",
          "id": null,
          "_type": "Value"
        },
        "type": {
          "dtype": "string",
          "id": null,
          "_type": "Value"
        },
        "value": {
          "dtype": "string",
          "id": null,
          "_type": "Value"
        }
      }
    ],
    "seed_id": {
      "dtype": "int32",
      "id": null,
      "_type": "Value"
    },
    "text": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "url": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "url_host_name": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "url_host_registered_domain": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "url_host_tld": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "url_surtkey": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "warc_filename": {
      "dtype": "string",
      "id": null,
      "_type": "Value"
    },
    "warc_record_length": {
      "dtype": "int32",
      "id": null,
      "_type": "Value"
    },
    "warc_record_offset": {
      "dtype": "int32",
      "id": null,
      "_type": "Value"
    }
  }

In [3]:
def convert_types(features):
    """Recursively turn a serialized feature schema into `datasets` feature objects.

    Args:
        features: A node of the schema. A dict containing "_type" is a leaf and is
            instantiated as `getattr(datasets, node["_type"])(node["dtype"])`; any
            other dict is converted value by value; a list is converted element by
            element.

    Returns:
        The same nesting of dicts and lists with every leaf replaced by the
        corresponding `datasets` feature object.

    Raises:
        TypeError: If a node is neither a dict nor a list. Previously such a node
            made the function return None implicitly, which silently planted a
            None inside the resulting schema.
    """
    if isinstance(features, dict):
        if "_type" in features:
            # Leaf: look the feature class up by name and build it from its dtype.
            return getattr(datasets, features["_type"])(features["dtype"])
        return {key: convert_types(value) for key, value in features.items()}
    if isinstance(features, list):
        return [convert_types(value) for value in features]
    raise TypeError(
        f"Unsupported feature specification of type {type(features).__name__}: {features!r}"
    )

In [4]:
# Convert the pasted schema to feature objects and wrap it in `Features` in one step,
# so `final_features` is only ever bound to the finished `Features` instance.
final_features = Features(convert_types(features))
final_features

{'HtmlPreprocessor_error': Value(dtype='int64', id=None),
 'HtmlPreprocessor_error_comment': Value(dtype='string', id=None),
 'content_languages': Value(dtype='string', id=None),
 'content_mime_detected': Value(dtype='string', id=None),
 'depth': Value(dtype='int16', id=None),
 'download_exception': Value(dtype='string', id=None),
 'external_urls': [Value(dtype='string', id=None)],
 'fetch_redirect': Value(dtype='string', id=None),
 'fetch_status': Value(dtype='int32', id=None),
 'fetch_time': Value(dtype='timestamp[ns]', id=None),
 'html_error': Value(dtype='string', id=None),
 'html_footer': [Value(dtype='string', id=None)],
 'html_head': [Value(dtype='string', id=None)],
 'html_str': Value(dtype='string', id=None),
 'html_title': [Value(dtype='string', id=None)],
 'metadata_html': [{'char_end_idx': Value(dtype='int64', id=None),
   'char_start_idx': Value(dtype='int64', id=None),
   'html_attrs': {'attrs': [Value(dtype='string', id=None)],
    'values': [Value(dtype='string', id=Non

In [5]:
def get_meta_dict(page):
    """Return the subset of a page record that is kept as metadata.

    Args:
        page: Mapping holding at least the keys 'url_host_name', 'html_title',
            'url', 'content_languages' and 'seed_id'.

    Returns:
        A new dict with exactly those five keys, in that order.

    Raises:
        KeyError: If `page` lacks one of the keys.
    """
    meta_keys = ('url_host_name', 'html_title', 'url', 'content_languages', 'seed_id')
    return {key: page[key] for key in meta_keys}

In [6]:
def filter_lines(article, skip_dict):
    """Split an article into the lines to keep and the lines to drop.

    Every line is stripped of surrounding whitespace before it is looked up in
    `skip_dict`; a line goes to the dropped side when its entry is truthy.

    Args:
        article: Text whose lines are separated by '\n'.
        skip_dict: Mapping from stripped line to a truthy flag for lines to drop.

    Returns:
        A `(kept, skipped)` pair of strings, each the stripped lines of that group
        joined with '\n' in their original order.
    """
    kept_lines, skipped_lines = [], []
    for raw_line in article.split('\n'):
        stripped = raw_line.strip()
        bucket = skipped_lines if skip_dict.get(stripped, False) else kept_lines
        bucket.append(stripped)
    return "\n".join(kept_lines), "\n".join(skipped_lines)

In [7]:
def process_page(page, skip_dict):
    """Build the output record for one page.

    Args:
        page: Page record; must provide the metadata keys read by
            `get_meta_dict` as well as a "text" entry.
        skip_dict: Mapping of lines to drop, as consumed by `filter_lines`.

    Returns:
        A dict with "meta" (the result of `get_meta_dict`) and "text" (the kept
        part of the page text after `filter_lines`).
    """
    page_meta = get_meta_dict(page)
    # Only the kept lines are exported; the dropped lines are discarded here.
    kept_text = filter_lines(page["text"], skip_dict)[0]
    return {"meta": page_meta, "text": kept_text}

In [8]:
def make_dataset_line_counts(seed_id, sample_size=10000, max_threshold=100):
    """Load one seed and find the lines that repeat often enough to be boilerplate.

    Args:
        seed_id: Seed whose files are read from
            `pseudo_crawl/seed_id=<seed_id>/text__html/*.jsonl.gz`.
        sample_size: Number of leading pages of the 'train' split used to count
            lines (defaults to the original 10000).
        max_threshold: Upper bound for the skip threshold (defaults to the
            original 100).

    Returns:
        A `(dset, line_counts, skip_dict)` tuple: the loaded dataset, a dict
        mapping each stripped line to its number of occurrences in the sampled
        pages, and a dict mapping every line whose count exceeds the threshold
        to True.
    """
    dset = load_dataset(
        "json",
        data_files = [f"pseudo_crawl/seed_id={seed_id}/text__html/*.jsonl.gz"],
        features = final_features,
    )
    sample_texts = dset['train'][:sample_size]["text"]
    line_counts = {}
    for article in tqdm(sample_texts):
        for line in article.split("\n"):
            stripped = line.strip()
            line_counts[stripped] = line_counts.get(stripped, 0) + 1
    # Threshold is 1% of the sampled pages, capped at `max_threshold`. With the
    # default arguments this equals the previous min(100, num_rows // 100).
    # The floor of 1 is new: for seeds with fewer than 100 pages the threshold
    # used to be 0, which marked every line as boilerplate and emptied all texts.
    thres_skip = max(1, min(max_threshold, len(sample_texts) // 100))
    skip_dict = {line: True for line, ct in line_counts.items() if ct > thres_skip}
    return dset, line_counts, skip_dict

# Make Dataset for 622: Global Voices Swahili

In [9]:
# Load seed 622 together with its line counts and the derived skip list.
ds_gws, linecounts_gws, skip_dict_gws = make_dataset_line_counts(622)

# Preview the filter on one page: word count of the kept text, the kept text itself,
# two separator rows, then the lines that were dropped.
keep, skip = filter_lines(ds_gws['train'][1]["text"], skip_dict_gws)
print(len(keep.split()), keep, "==========", "==========", skip, sep="\n")

Using custom data configuration default-17f2c3adf84dbe9d
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-17f2c3adf84dbe9d/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 10000/10000 [00:00<00:00, 26491.37it/s]

255
Uandishi wa Habari za Kiraia · Oktoba, 2015
Habari kuhusu Uandishi wa Habari za Kiraia kutoka Oktoba, 2015
Kampeni ya ‘Alaa Aachiwe’ Yashika Kasi Mwaka Mmoja Baada ya Kufungwa Kwake
Alaa Abd El Fattah ametumikia mwaka mmoja kwa sababu ya uanaharakati wake. Amebakisha miaka minne. Watumiaji wa mitandao wanapiga kelele wanapoadhimisha mwaka mmoja wa kifungo...
Filamu ya Difret Inayosimulia Mila ya ‘Kumteka’ Mwanamke Kulazimisha Ndoa Nchini Ethiopia
Neno "Difret" lina maana ya "ujasiri" katika lugha ya ki-Amariki. Ni filamu mpya yenye jina hilo ikisimulia mkasa wa msichana wa ki-Ethiopia aliyetekwa na wanaume...
Wa-Tanzania Wamkumbuka Baba wa Taifa Lao kwa Alama ya #DearNyerere
"#DearNyerere, enzi zako, umaarufu ulitokana na matendo mazuri kwa nchi yako, lakini siku hizi ni idadi ya wafuasi kwneye mitandao ya Instagram na Twita."
Wanaharakati Waomba Ulinzi kwa Makabila Yanayopinga Uchimbaji Madini Nchini Ufilipino
"Wao ndio waasisi wa tamaduni zetu za kipekee za sanaa. Mauji dhidi yao




In [10]:
# Display the record built for the sample page: selected metadata plus the filtered text.
process_page(ds_gws['train'][1], skip_dict_gws)

{'meta': {'url_host_name': 'sw.globalvoices.org',
  'html_title': ['<title>Uandishi wa Habari za Kiraia · Oktoba, 2015 · Global Voices in Swahili</title>\n'],
  'url': 'https://sw.globalvoices.org/category/topics/citizen-media/?m=201510',
  'content_languages': 'swa',
  'seed_id': 622},
 'text': 'Uandishi wa Habari za Kiraia · Oktoba, 2015\nHabari kuhusu Uandishi wa Habari za Kiraia kutoka Oktoba, 2015\nKampeni ya ‘Alaa Aachiwe’ Yashika Kasi Mwaka Mmoja Baada ya Kufungwa Kwake\nAlaa Abd El Fattah ametumikia mwaka mmoja kwa sababu ya uanaharakati wake. Amebakisha miaka minne. Watumiaji wa mitandao wanapiga kelele wanapoadhimisha mwaka mmoja wa kifungo...\nFilamu ya Difret Inayosimulia Mila ya ‘Kumteka’ Mwanamke Kulazimisha Ndoa Nchini Ethiopia\nNeno "Difret" lina maana ya "ujasiri" katika lugha ya ki-Amariki. Ni filamu mpya yenye jina hilo ikisimulia mkasa wa msichana wa ki-Ethiopia aliyetekwa na wanaume...\nWa-Tanzania Wamkumbuka Baba wa Taifa Lao kwa Alama ya #DearNyerere\n"#DearNyere

In [11]:
# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_sw_pseudocrawl_globalvoices_swahili.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_gws['train']):
        f.write(json.dumps(process_page(article, skip_dict_gws)) + "\n")

100%|██████████| 18847/18847 [02:35<00:00, 121.48it/s]


# Make Dataset for 698: BBC Swahili

In [12]:
# Load seed 698 and its skip list; the raw line counts are not needed afterwards.
ds_bbcs, _, skip_dict_bbcs = make_dataset_line_counts(698)

# Preview the filter on one page: word count of the kept text, the kept text itself,
# two separator rows, then the lines that were dropped.
keep, skip = filter_lines(ds_bbcs['train'][1]["text"], skip_dict_bbcs)
print(len(keep.split()), keep, "==========", "==========", skip, sep="\n")

# Display the record that would be exported for that page.
process_page(ds_bbcs['train'][1], skip_dict_bbcs)

Using custom data configuration default-e563a0985753899d
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-e563a0985753899d/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 5194/5194 [00:00<00:00, 78429.25it/s]

872
Brexit: Uingereza haitatoa viza kwa wafanyakazi wasiokuwa na ujuzi
Haki miliki ya picha PA Media
Wafanyakazi wasiokuwa na ujuzi hawatapata viza chini ya mpango mpya wa uhamiaji wa serikali baada ya Uingereza kujiondoa katika Muungano wa Ulaya.
Serikali inahamasiha waajiri kuacha kutegemea wafanyakazi wa kulipwa mshahara wa chini kutoka Ulaya na kuwekeza zaidi katika kudumisha wafanyakazi na kuendelezea teknolojia.
Wizara ya mambo ya ndani imesema Ulaya na raia wa nchi ambazo sio wanachama wa Muungano huo watakuwa wanachukuliwa walio katika daraja moja baada ya usafiri huru kati ya Uingereza na Muungano wa Ulaya kufikia ukomo wake Desemba 31.
Makampuni yanasema sheria kali kama hizo zitafanya iwe vigumu kuvutia wafanyakazi.
Lakini waziri wa mambo ya ndani Priti Patel amesema utekelezaji wa mfumo mpya kutamaanisha watu wenye weledi tu ndio watakaoingia Uingereza.
Jinsi mambo yatakavyobadilika baada ya Brexit
Wakuzaji wa maua walio na wasiwasi kuhusu Brexit
Makubaliano ya Marekani, Ch




{'meta': {'url_host_name': 'www.bbc.com',
  'html_title': ['<title>Brexit: Uingereza haitatoa viza kwa wafanyakazi wasiokuwa na ujuzi - BBC News Swahili</title>\n'],
  'url': 'https://www.bbc.com/swahili/habari-51555700',
  'content_languages': 'swa',
  'seed_id': 698},
 'text': 'Brexit: Uingereza haitatoa viza kwa wafanyakazi wasiokuwa na ujuzi\nHaki miliki ya picha PA Media\nWafanyakazi wasiokuwa na ujuzi hawatapata viza chini ya mpango mpya wa uhamiaji wa serikali baada ya Uingereza kujiondoa katika Muungano wa Ulaya.\nSerikali inahamasiha waajiri kuacha kutegemea wafanyakazi wa kulipwa mshahara wa chini kutoka Ulaya na kuwekeza zaidi katika kudumisha wafanyakazi na kuendelezea teknolojia.\nWizara ya mambo ya ndani imesema Ulaya na raia wa nchi ambazo sio wanachama wa Muungano huo watakuwa wanachukuliwa walio katika daraja moja baada ya usafiri huru kati ya Uingereza na Muungano wa Ulaya kufikia ukomo wake Desemba 31.\nMakampuni yanasema sheria kali kama hizo zitafanya iwe vigumu ku

In [13]:
# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_sw_pseudocrawl_bbc_swahili.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_bbcs['train']):
        f.write(json.dumps(process_page(article, skip_dict_bbcs)) + "\n")

100%|██████████| 5194/5194 [00:16<00:00, 324.50it/s]


# Make Dataset for 699: BBC gahuza (Kinyarwanda and Kirundi)

In [14]:
# Load seed 699 and its skip list; the raw line counts are not needed afterwards.
ds_bbcg, _, skip_dict_bbcg = make_dataset_line_counts(699)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_bbcg['train'][1]["text"],
    skip_dict_bbcg
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_bbcg['train'][1], skip_dict_bbcg))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_rw_pseudocrawl_bbc_gahuza.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_bbcg['train']):
        f.write(json.dumps(process_page(article, skip_dict_bbcg)) + "\n")

Using custom data configuration default-b2b8257d45a2cc6a
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-b2b8257d45a2cc6a/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 7222/7222 [00:00<00:00, 101845.73it/s]
  1%|▏         | 93/7222 [00:00<00:16, 434.39it/s]

132
Ishirahamwe riharanira agateka ka zina muntu, Amensty International, ryasavye ishirahamwe ry'akarere k'uburengero bwa Afrika, ECOWAS, kwirukana Gambia ibintu nibitahinduka.
Amnesty ivuga ko leta ya Gambia yakoze ico yita ihohoterwa ry'abanyepolitike batavuga rumwe na leta imbere y'amatora yo mu kwezi kwa cumi na kabiri.
Ngo abashigikiye abatavuga rumwe na leta barenga mirongo itanu barindiriye gucibwa urubanza, nayo abashika mirongo itatu na batandatu bapfunzwe ata rubanza baracibwa.
Umwe mu banyepolitike batavuga rumwe na leta yaguye mu munyororo nyuma yo gukubagurwa.
Mu kwezi guheze umukuru w'igihugu, Yahya Jammeh, yavuze ko atazotegeka ko haba itohoza, avuga ko ari ibisanzwe ko impfungwa zipfira mu munyororo.
Young yitwariye Trump kubera akoresha umuziki wiwe atabimusavye
Iminota 45 iraheze
Iminota 54 iraheze
Liban mu kigandaro nyuma y'isanganya ry'umuriro ryatewe n'ibiturika
Inkuru y'umukinnyi wo muri Algeria utarigeze ahabwa ikarita na rimwe
Insiguro y'isanamu,
Beirut: Kuki 'n

100%|██████████| 7222/7222 [00:16<00:00, 426.19it/s]


# Make Dataset for 700: BBC Igbo

In [15]:
# Load seed 700 and its skip list; the raw line counts are not needed afterwards.
ds_bbci, _, skip_dict_bbci = make_dataset_line_counts(700)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_bbci['train'][1]["text"],
    skip_dict_bbci
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_bbci['train'][1], skip_dict_bbci))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_ig_pseudocrawl_bbc_igbo.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_bbci['train']):
        f.write(json.dumps(process_page(article, skip_dict_bbci)) + "\n")

Using custom data configuration default-c00146a6b107e236
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-c00146a6b107e236/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 1473/1473 [00:00<00:00, 62917.12it/s]
  2%|▏         | 33/1473 [00:00<00:04, 326.22it/s]

251
Ule Jamb ejidela ọtụtụ ndị mmadụ rue ọtụtụ afọ tupu ha abanye mahadum
Obi mapụwakwa gị makana ngalaba ahụ na-ahụ maka ụle eji abanye mahadum sị na ha ga-amalite nyocha ga-esi 2009 ruo 2019.
Onye is mgbasa ozi Jamb bụ Fabian Benjamin sị Onye asambodo Jamb ya bụ adịbogorja ga-agbara aka.
Jamb: Ohere ọ dịri mmadụ 200,000 na mahadum?
UTME 2018: Ule Jamb ebidola
Gombe ebe onyeisi ndi uweojii bụ Mohammed Adamu si na ha ga-eme nyocha banyere etu Ihe karịrị mmadụ iri na-enwe anụrị ncheta mbilite n'ọnwụ Jesu jiri nwụ mgbe ụgbọala Onye uweoji nya bara n'etiti ha.
Akụgbụru onye uweojii ahụ na onye Civil Defence o bu mgbe ndị mmadụ wakporo ha ka ihe mgberede a mechara.
Ọkaikpe ọzọ esepụla aka n'ihe banyere ikpe asambodo Rochas Okorocha na-arịọ ụlọikpe ka ọ manye Inec inye ya.
Ọkaikpe Okon Abang sị na ọ bụ akwụkwọ Okorocha degara onyeisi ya mere ya ji sepu aka.
Chelsea asorula na ndị na-aracha piọm na Premier League dịka ọkpụ abụọ ọkpụ ha na Burnley gbara ụnyaahụ nyere ha ohere ịrịgọ bụru ndi n

100%|██████████| 1473/1473 [00:04<00:00, 303.60it/s]


# Make Dataset for 701: BBC Yoruba

In [16]:
# Load seed 701 and its skip list; the raw line counts are not needed afterwards.
ds_bbcy, _, skip_dict_bbcy = make_dataset_line_counts(701)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_bbcy['train'][1]["text"],
    skip_dict_bbcy
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_bbcy['train'][1], skip_dict_bbcy))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_yo_pseudocrawl_bbc_yoruba.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_bbcy['train']):
        f.write(json.dumps(process_page(article, skip_dict_bbcy)) + "\n")

Using custom data configuration default-b6cdd17753a8c35c
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-b6cdd17753a8c35c/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 2614/2614 [00:00<00:00, 56232.39it/s]
  2%|▏         | 60/2614 [00:00<00:08, 294.67it/s]

468
Oyo Land Revocation: Ajimobi gbé Makinde lọ ilé ẹjọ́ lórí ilẹ̀ rẹ̀ tí ìjọba gbẹ́sẹ̀ lé
Gomina ana nipinlẹ Oyo, Sẹnatọ Abiola Ajimobi ti gbe Gomina Seyi Makinde ati awọn eeyan mii ninu ijọba rẹ lọ ile ẹjọ, lẹyin ti wọn gbẹsẹle ilẹ rẹ nilu Ibadan.
Ṣaaju ni iroyin kan jade pe, ijọba ipinlẹ Oyo gbẹsẹ le awọn ilẹ kan lagbegbe Agodi ati Jericho, nilu Ibadan nitori awọn idi kan.
Ta ni wọn yóò búra fún bíi gómínà Bayelsa lónìí, olùdíje PDP ni àbí olórí ilé aṣòfin?
Ẹ̀ṣọ́ Amotekun ní láti ṣiṣẹ́ pọ̀ pẹ̀lú ọlọ́pàá agbègbè- Ọ̀gá ọlọ́pàá Naijiria
Gbọingbọin lọmọ Nàìjíríà wà lẹ́yìn iléẹjọ́ gíga jùlọ lórí ìdájọ́ ìbò Bayelsa- Atiku
Àwọn ọmọ onílẹ̀ lọ ṣe àmúṣẹ àṣẹ iléẹjọ́ ní Soka, làwọn ‘Jàǹdùkú’ fi dá wọn lọ̀nà - Iléeṣẹ́ ọlọ́pàá Ọyọ
Bàbá àti ìyá mi kò fẹ́ kí n ṣe eré tíátà - Wale Akorede Okunnu
Ẹ̀yin òṣèrè tíátà, ẹ ní ṣùúrù de àsìkò Ọlọ́run, ẹ máṣe kánjú kọjá kádàrá - Madam Ṣajẹ
Ọjọ kẹwa oṣu keji ọdun 2020 ni aṣẹ jade pe, ki wọn gbẹsẹ le awọn ilẹ ọhun nitori pe lilo rẹ lodi si anfani awọn ara ilu a

100%|██████████| 2614/2614 [00:08<00:00, 294.36it/s]


# Make Dataset for 702: Global Voices Yoruba

In [17]:
# Load seed 702 and its skip list; the raw line counts are not needed afterwards.
ds_gvy, _, skip_dict_gvy = make_dataset_line_counts(702)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_gvy['train'][1]["text"],
    skip_dict_gvy
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_gvy['train'][1], skip_dict_gvy))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_yo_pseudocrawl_globalvoices_yoruba.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_gvy['train']):
        f.write(json.dumps(process_page(article, skip_dict_gvy)) + "\n")

Using custom data configuration default-a1c957ba40f9f92e
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-a1c957ba40f9f92e/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 1184/1184 [00:00<00:00, 35405.57it/s]
  2%|▏         | 24/1184 [00:00<00:04, 239.35it/s]

33
Ẹrẹ́nà , 2019
Ìròyìn láti Ẹrẹ́nà , 2019
Tweets by @DigiAfricanLang Bẹ̀rẹ̀ láti ogúnjọ́ oṣù Ẹrẹ́nà títí di òpin ọdún-un 2019, àwọn ajàfẹ́tọ̀ọ́ èdè ilẹ̀ Adúláwọ̀ ní orí ẹ̀rọ ayárabíàṣá á máa ṣe...
Fò sí àkóónú pàtàkì gangan
Ẹrẹ́nà 2021 1 àtẹ̀jáde
Ọ̀wàrà 2020 1 àtẹ̀jáde
Ọ̀wẹwẹ̀ 2020 1 àtẹ̀jáde
Ògún 2020 3 àwọn àtẹ̀jáde
Òkúdù 2020 1 àtẹ̀jáde
Èbìbì 2020 2 àwọn àtẹ̀jáde
Igbe 2020 6 àwọn àtẹ̀jáde
Ẹrẹ́nà 2020 4 àwọn àtẹ̀jáde
Èrèlé 2020 1 àtẹ̀jáde
Ṣẹẹrẹ 2020 5 àwọn àtẹ̀jáde
Ọ̀pẹ 2019 1 àtẹ̀jáde
Bẹlu 2019 1 àtẹ̀jáde
Ọ̀wàrà 2019 4 àwọn àtẹ̀jáde
Ọ̀wẹwẹ̀ 2019 2 àwọn àtẹ̀jáde
Ògún 2019 3 àwọn àtẹ̀jáde
Agẹmọ 2019 1 àtẹ̀jáde
Òkúdù 2019 3 àwọn àtẹ̀jáde
Èbìbì 2019 4 àwọn àtẹ̀jáde
Igbe 2019 2 àwọn àtẹ̀jáde
Ẹrẹ́nà 2019 4 àwọn àtẹ̀jáde
Èrèlé 2019 11 àwọn àtẹ̀jáde
Ṣẹẹrẹ 2019 7 àwọn àtẹ̀jáde
Ọ̀pẹ 2018 3 àwọn àtẹ̀jáde
Fún ìgbà àkọ́kọ́ ní orílẹ̀-èdèe Brazil, ọmọ ìbílẹ̀ lóbìnrin di ọmọ ìgbìmọ̀ ìjọba
Joenia ni ọmọ ìbílẹ̀ lóbìnrin àkọ́kọ́ tí ó kẹ́kọ̀ọ́ gboyè nínú ẹ̀kọ́ ìmọ̀ òfin ní Brazil, òun sì ni agbẹjọ́rò 

100%|██████████| 1184/1184 [00:05<00:00, 208.39it/s]


# Make Dataset for 703: Global Voices Igbo

In [18]:
# Load seed 703 and its skip list; the raw line counts are not needed afterwards.
ds_gvi, _, skip_dict_gvi = make_dataset_line_counts(703)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_gvi['train'][1]["text"],
    skip_dict_gvi
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_gvi['train'][1], skip_dict_gvi))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_ig_pseudocrawl_globalvoices_igbo.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_gvi['train']):
        f.write(json.dumps(process_page(article, skip_dict_gvi)) + "\n")

Using custom data configuration default-18857a5af7ec6eff
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-18857a5af7ec6eff/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 328/328 [00:00<00:00, 66492.59it/s]
 10%|▉         | 32/328 [00:00<00:00, 317.79it/s]

2
Stories Ahụike
Skip to main content
Ahụike
Agbamụmbọ dịgitalụ
Agha na ọgbaaghara
Agụmaakwụkwọ
Ahụike
Arogasị
Asụsụ
Inwe ikikere nke ndị mmadụ
Iwu
Mbata na ọpụpụ
Mgbasaozi ndị Amaala
Mmekọrịta mba ụwa niile
Ndị Ntorobịa na agbọghọbịa
Ngagharị iwe
Ọchịchị
Politiksi
Ụmụ nwaanyị na genda
Naijeria emela mpụ idina mmadu n'ike nsogbu a ga elebara anya ngwa-ngwa.
Na Naijeria, Idina mmadu n'ike na mpụ ndi ọzọ aghọla omenala, nkea mere ojiri bụrụ ihe sịrị ike ikpọpu ndi omekome a ikpe igbara isi...
Foto Igbo Wịkipidiya Nke Izu a [Ndi Arọ]
Ndi Arọ bụ mpaghara ndi Igbo ama-ama, otu njirimara ha na gboo bụ igba ahia ohu.
Menu
About Us
Contact
Contributors
Donate
Lingua
What Is Global Voices?
Global Voices Manifesto
Mmekorita
Sponsors
Global Voices is supported by the efforts of our volunteer contributors, foundations, donors and mission-related services. For more information please read our Fundraising Ethics Policy.
Donate »
This site is licensed as Creative Commons Attribution 3.0 Some Rights R

100%|██████████| 328/328 [00:00<00:00, 335.30it/s]


# Make Dataset for 704: Deutsche Welle Swahili

In [19]:
# Load seed 704 and its skip list; the raw line counts are not needed afterwards.
ds_dws, _, skip_dict_dws = make_dataset_line_counts(704)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_dws['train'][1]["text"],
    skip_dict_dws
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_dws['train'][1], skip_dict_dws))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_sw_pseudocrawl_deutschewelle_swahili.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_dws['train']):
        f.write(json.dumps(process_page(article, skip_dict_dws)) + "\n")

Using custom data configuration default-9b95cb8fa1ad6bdb
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-9b95cb8fa1ad6bdb/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 5898/5898 [00:00<00:00, 22732.46it/s]
  0%|          | 14/5898 [00:00<00:43, 136.68it/s]

706
Juhudi za kuunusuru Umoja wa Ulaya Magazetini
Juhudi za kuunusuru Umoja wa Ulaya usidhibitiwe na vyama vya siasa kali za mrengo wa kulia na uamuzi wa kocha wa timu ya taifa ya Ujerumani Joachim Löw kuwafuta vigogo watatu wa timu bingwa ya Bayern Munich Magazetini
Tunaanza na juhudi za kuuokoa Umoja wa ulaya. Gazeti la "Mannheimer Morgen "linaandika: "Katika mataifa wanachama wa umoja wa Ulaya zimeibuka hisia za kuunga mkono Umoja wa ulaya, lakini hisia hizo hazitoshi ikiwa vyama vya siasa kali za mrengo wa kulia na vile vinavyopigania uzalendo ndivyo vinavyowahimiza wafuasi wao. Wapiga kura wanaweza kujifunza kwa kuchukua mifano ya Uingereza na Marekani na kutambua kupinga kitu pekee haitoshi, watu wanabidi wapaze sauti ikilazimika na kupania kwaajili ya kile wanachokipigania.
Rais wa Ufaransa Emmanuel Macron ameonyesha mfano, linaandika gazeti la mji mkuu "Berliner Morgenpost": "Rais wa Ufaransa ameifungua kwa kishindo kampeni ya uchaguzi wa Ulaya. Anajitambulisha kuwa kiongozi wa

100%|██████████| 5898/5898 [00:43<00:00, 135.96it/s]


# Make Dataset for 705: Mwananchi (Swahili)

In [20]:
# Load seed 705 and its skip list; the raw line counts are not needed afterwards.
ds_mwa, _, skip_dict_mwa = make_dataset_line_counts(705)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_mwa['train'][1]["text"],
    skip_dict_mwa
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_mwa['train'][1], skip_dict_mwa))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_sw_pseudocrawl_mwananchi.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_mwa['train']):
        f.write(json.dumps(process_page(article, skip_dict_mwa)) + "\n")

Using custom data configuration default-47e9c04d6d7126a4
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-47e9c04d6d7126a4/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 6662/6662 [00:00<00:00, 56795.10it/s]
  0%|          | 28/6662 [00:00<00:23, 276.94it/s]

572
Umewahi kujiuliza nafasi ya Tanzania ingekuwa wapi katika historia ya ukombozi wa bara la Afrika bila Mwalimu Julius Nyerere?
Je, akina Nelson Mandela, Thabo Mbeki, Jacob Zuma na wengine wangemiliki hati za kusafiria za Tanzania kwa nyakati tofauti ili kupata urahisi wa kusafiri nchi mbalimbali kusaka kuungwa mkono harakati zao kupata ukombozi dhidi ya utawala wa kibaguzi Afrika Kusini?
Mwaka 1962, kama Tanzania isingekuwa na Mwalimu Nyerere, kiongozi wa upinzani, Kenya, Raila Odinga, angepata wapi nafasi ya kwenda Ujerumani masomoni? Baada ya kunyimwa fursa ya kusafiri na Waingereza, Raila alikimbilia Tanganyika, Nyerere alimuwezesha. Mwalimu alikuwa kimbilio la kila Mwafrika aliyeonewa na wakoloni.
Wapigania uhuru wa Msumbiji, wangesafiri vipi duniani kwa kutumia hati za kusafiria za Tanzania, kama nchi isingekuwa na Mwalimu? Suala si kutoa hati, bali uthubutu wa kufanya hivyo. Mwaimu aliwathibitishia wakoloni kuwa hakuwa mwoga. Alipenda uhuru wa Waafrika kuliko hofu ya vitisho v

100%|██████████| 6662/6662 [00:21<00:00, 313.14it/s]


# Make Dataset for 706: Voice of America Swahili

In [21]:
# Load seed 706 and its skip list; the raw line counts are not needed afterwards.
ds_voas, _, skip_dict_voas = make_dataset_line_counts(706)

# Preview the filter on one page: word count and text of the kept lines, then the
# lines that were dropped.
keep, skip = filter_lines(
    ds_voas['train'][1]["text"],
    skip_dict_voas
)
print(len(keep.split()))
print(keep)
print("==========")
print("==========")
print(skip)

pprint(process_page(ds_voas['train'][1], skip_dict_voas))

# Export one JSON record per page. The context manager closes the file even if a
# page fails to serialize part-way through the loop.
# json.dumps keeps its default ensure_ascii=True, so non-ASCII characters are
# written as \uXXXX escapes.
with open("lm_sw_pseudocrawl_voiceofamerica_swahili.jsonl", "w", encoding = "utf-8") as f:
    for article in tqdm(ds_voas['train']):
        processed_dict = process_page(article, skip_dict_voas)
        # Pages whose filtered text is 100 characters or shorter are not written.
        if len(processed_dict["text"]) > 100:
            f.write(json.dumps(processed_dict) + "\n")

Using custom data configuration default-649b950778c27b7e
Reusing dataset json (/home/yjernite/.cache/huggingface/datasets/json/default-649b950778c27b7e/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




100%|██████████| 10000/10000 [00:00<00:00, 78057.67it/s]
  0%|          | 62/104752 [00:00<05:44, 304.02it/s]

0

Upatikanaji viungo
Video
Habari
Matangazo Yetu
Alfajiri 0300 UTC
VOA Express 1330 UTC
Jioni 1630 UTC
Kwa Undani 1800 UTC
Makala Maalum
Login / Register
Zaidi
VOA Swahili Audio Tube
Kipindi Kipya
Matangazo Yetu
Radio Zaidi
Alfajiri 0300 UTC
Jioni 1330 UTC
VOA Express 1630 UTC
Kwa Undani 1800 UTC
Embed
Shirikiana kwenye Facebook
Shirikiana kwenye Twitter
Matangazo ya nusu saa kuhusu habari za mapema asubuhi pamoja na habari za michezo.
Pleya
Shirikisha
Makundi
Afya
Zaidi
Hali ya HIV/Ukimwi Afrika
Zaidi
Makala Maalum
Zaidi
Uchaguzi Kenya 2013
Kesi za Wakenya Mbele ya ICC
Zaidi
Fainali za Kombe la Afrika 2015
CAN Mashabiki wa Jamhuri ya Kidemokrasia ya Kongo washerehekea ushindi wa timu yao
Zaidi
Ziara ya Rais Obama Kenya, Julai 23-26
Wafanyabiashara wadogo wanatumaini ziara ya Obama kuinua biashara zao
Sehemu ya pili: Marekani inasimamia kuonyesha uvumilivu kwa jamii za mashoga
Zaidi
Mkutano wa baraza kuu la Umoja wa Mataifa
Waziri wa mambo ya nje wa Tanzania akiwasilisha hati zake kat

100%|██████████| 104752/104752 [07:14<00:00, 240.94it/s]
