In [1]:
import json
import os

import pysbd
from tqdm import tqdm

from utils import load_dataset

In [None]:
# Sequential sentence segmentation: for each dataset, split every record's
# 'body' into sentences and write the enriched records out as one JSON array.
segmenter = pysbd.Segmenter(language="en", clean=False)

output_dir = 'data/sentence_segmented/'
# The output directory may not exist on a fresh checkout; open() would fail.
os.makedirs(output_dir, exist_ok=True)

# Load a dataset
for dataset in ['Astro_Reviews.json', 'Earth_Science_Reviews.json', 'Planetary_Reviews.json']:
    print(f"Processing {dataset}...")
    records = load_dataset('data/json/' + dataset)

    output_path = output_dir + dataset
    # Stream into a sibling ".partial" file and only move it into place once
    # the array has been closed. A run that is interrupted part-way (these
    # loops take hours) then never leaves truncated, invalid JSON at
    # output_path, and any previous complete output stays intact.
    partial_path = output_path + '.partial'
    with open(partial_path, 'w') as f:
        f.write('[')  # Start the JSON array
        for i, record in enumerate(tqdm(records)):
            if i:
                f.write(',')  # Separator before every object except the first
            record['body_sentences'] = segmenter.segment(record['body'])
            f.write(json.dumps(record))
        f.write(']')  # End the JSON array
    os.replace(partial_path, output_path)
    

Processing Astro_Reviews.json...
data/json/Astro_Reviews.json: 996/1000 have all required keys


100%|██████████| 996/996 [1:07:07<00:00,  4.04s/it]


Processing Earth_Science_Reviews.json...
data/json/Earth_Science_Reviews.json: 994/1000 have all required keys


 95%|█████████▌| 945/994 [2:05:38<06:30,  7.98s/it]  


KeyboardInterrupt: 

#### Multiprocessing version (segments records in parallel across CPU cores)

In [None]:
import json
from tqdm import tqdm
import os
import pysbd
from utils import load_dataset
from concurrent.futures import ProcessPoolExecutor, as_completed

# Module-level English segmenter; process_record looks it up as a global.
segmenter = pysbd.Segmenter(language="en", clean=False)


def process_record(record):
    """Attach sentence-level segmentation to a single record.

    Args:
        record: Mapping (presumably a dict from ``load_dataset``) with a
            'body' entry holding the text to split.

    Returns:
        The same ``record`` object, now carrying a 'body_sentences' entry
        set to whatever ``segmenter.segment`` returned for the body.
    """
    body_text = record['body']
    sentences = segmenter.segment(body_text)
    record['body_sentences'] = sentences
    return record


# Parallel sentence segmentation: same output as the sequential cell, but the
# per-record work is spread over a process pool.
#
# NOTE(review): ProcessPoolExecutor needs worker processes to be able to
# import the submitted function. With a "spawn" start method (macOS/Windows
# defaults) a function defined in a notebook cell may not be importable by
# the workers; if that happens, move process_record into a .py module.
output_dir = 'data/sentence_segmented/'
# The output directory may not exist on a fresh checkout; open() would fail.
os.makedirs(output_dir, exist_ok=True)

# Load a dataset
for dataset in ['Astro_Reviews.json', 'Earth_Science_Reviews.json', 'Planetary_Reviews.json']:
    print(f"Processing {dataset}...")
    records = load_dataset('data/json/' + dataset)

    # executor.map yields results in input order (as_completed does not), so
    # the written array keeps the dataset's original record order and matches
    # what the sequential version produces.
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        results = list(tqdm(executor.map(process_record, records),
                            total=len(records)))

    output_path = output_dir + dataset
    # Only touch the filesystem once every record has been processed, and
    # publish via rename, so an interrupted run cannot leave a truncated file
    # at output_path or clobber a previous complete output.
    partial_path = output_path + '.partial'
    with open(partial_path, 'w') as f:
        f.write('[')  # Start the JSON array
        f.write(','.join(json.dumps(record) for record in results))
        f.write(']')  # End the JSON array
    os.replace(partial_path, output_path)