In [1]:
import json

import dask.dataframe as dd
import dask.bag as db

from dask.distributed import Client, progress

In [2]:
# Spin up a local cluster: four single-threaded workers, each capped at 2GB.
client = Client(
    n_workers=4,
    threads_per_worker=1,
    memory_limit='2GB',
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:43499  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.00 GB


### Load files

Load the Wikipedia articles that were extracted to JSON with WikiExtractor (https://github.com/attardi/wikiextractor).

In [3]:
import os

# Directories that hold the extracted JSON files.
dirs = ['text/AA', 'text/AB', 'text/AC']

file_paths = []

for dir_ in dirs:
    # os.listdir returns entries in arbitrary order; sort them so the file
    # list is identical on every run and on every machine.
    file_paths.extend(os.path.join(dir_, name) for name in sorted(os.listdir(dir_)))

'Total {} files'.format(len(file_paths))

'Total 278 files'

### Set up the function for splitting lines

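`split_lines` takes one article `dict` (with `title`, `url` and `text` keys) and splits its text into a list of sentence records (`text` plus `meta`) for the Prodigy JSON format.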

In [4]:
import re

# Sentence delimiters: '.', '!', '?' and the danda '।', together with any
# whitespace immediately around them.
PATTERN = re.compile(r'\s*[\.\!\?।]\s*')

def split_lines(document):
    """Split one article into sentence records.

    Args:
        document: Mapping with 'title', 'url' and 'text' keys.

    Returns:
        A list of ``{'text': sentence, 'meta': {'source': ...}}`` dicts, one
        per non-empty sentence, in the order the sentences appear in the
        text. All records share the same ``meta`` dict object.

    Raises:
        KeyError: If 'title', 'url' or 'text' is missing from ``document``.
    """
    meta = {'source': '{} - {}'.format(document['title'], document['url'])}
    # PATTERN.split yields empty fragments (e.g. after a trailing delimiter or
    # between consecutive delimiters) and may leave whitespace at the very
    # start/end of the text; strip each fragment and drop the empty ones so
    # no blank records are emitted.
    fragments = (fragment.strip() for fragment in PATTERN.split(document['text']))
    return [{'text': sentence, 'meta': meta} for sentence in fragments if sentence]
    

In [5]:
# Parse each input line as one JSON article and split it into sentence records.
# split_lines returns a list per article, so flatten the bag before
# serialising: every record then lands on its own line (JSONL) instead of one
# JSON array per article.
documents = (
    db.read_text(file_paths)
    .map(json.loads)
    .map(split_lines)
    .flatten()
    .map(json.dumps)
)

In [6]:
# Run the pipeline and write the results under data/; the '*' in the pattern
# is replaced by a numeric index. The list of written paths is shown as the
# cell output.
written_paths = documents.to_textfiles('data/*.jsonl')
written_paths

['/home/aniruddha/Projects/bangla_model/data/000.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/001.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/002.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/003.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/004.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/005.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/006.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/007.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/008.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/009.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/010.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/011.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/012.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/013.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/014.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/015.jsonl',
 '/home/aniruddha/Projects/bangla_model/data/016.jsonl',
 '/home/aniruddha/Projects/bang