In [3]:
import json
import pysbd
from tqdm import tqdm
from utils import load_dataset

In [None]:
# Module-level pysbd sentence segmenter for English text, constructed with
# clean=False. It is built once here and reused for every record below.
segmenter = pysbd.Segmenter(language="en", clean=False)

def merge_short_sentences(sentences, threshold=60):
    """
    Merge sentences shorter than `threshold` characters into their neighbours.

    Sentences are concatenated left to right (no separator is inserted) until
    the accumulated text reaches `threshold` characters, at which point it is
    emitted as one merged sentence. Text left over at the end that is still
    too short is appended to the last emitted sentence; if nothing has been
    emitted yet (e.g. a single short sentence, or every sentence is short),
    the leftover becomes the only element of the result.

    Args:
        sentences: list of sentence strings, in document order. The list is
            not modified.
        threshold: minimum length, in characters, of an emitted sentence.

    Returns:
        A new list of merged sentence strings. An empty input yields an
        empty list.
    """
    merged_sentences = []
    pending = ""  # short text still waiting to be joined with what follows
    for sentence in sentences:
        pending += sentence
        if len(pending) >= threshold:
            merged_sentences.append(pending)
            pending = ""

    # Handle a short tail: fold it back into the previous merged sentence so
    # no text is dropped, or keep it on its own when there is no previous one.
    if pending:
        if merged_sentences:
            merged_sentences[-1] += pending
        else:
            merged_sentences.append(pending)
    return merged_sentences

In [None]:
# Segment every dataset sequentially and stream the records to disk as one
# JSON array per dataset.
for dataset in ['Astro_Reviews.json', 'Earth_Science_Reviews.json', 'Planetary_Reviews.json']:
    print(f"Processing {dataset}...")
    records = load_dataset('data/json/' + dataset)
    last_index = len(records) - 1

    with open('data/sentence_segmented/' + dataset, 'w') as f:
        # The array brackets and separators are written by hand so that each
        # record can be flushed as soon as it has been segmented.
        f.write('[')
        for i, record in enumerate(tqdm(records)):
            sentences = segmenter.segment(record['body'])
            record['body_sentences'] = sentences
            f.write(json.dumps(record))
            is_last = i >= last_index
            if not is_last:
                f.write(',')
        f.write(']')
    

Processing Astro_Reviews.json...
data/json/Astro_Reviews.json: 996/1000 have all required keys


100%|██████████| 996/996 [1:07:07<00:00,  4.04s/it]


Processing Earth_Science_Reviews.json...
data/json/Earth_Science_Reviews.json: 994/1000 have all required keys


 95%|█████████▌| 945/994 [2:05:38<06:30,  7.98s/it]  


KeyboardInterrupt: 

#### Multiprocessor version

In [None]:
import json
from tqdm import tqdm
import os
import pysbd
from utils import load_dataset
from concurrent.futures import ProcessPoolExecutor, as_completed

# Module-level pysbd sentence segmenter for English text, constructed with
# clean=False. process_record below calls it for every record.
segmenter = pysbd.Segmenter(language="en", clean=False)


def merge_short_sentences(sentences, threshold=60):
    """
    Merge sentences shorter than `threshold` characters into their neighbours.

    Sentences are concatenated left to right (no separator is inserted) until
    the accumulated text reaches `threshold` characters, at which point it is
    emitted as one merged sentence. Text left over at the end that is still
    too short is appended to the last emitted sentence; if nothing has been
    emitted yet (e.g. a single short sentence, or every sentence is short),
    the leftover becomes the only element of the result.

    Args:
        sentences: list of sentence strings, in document order. The list is
            not modified.
        threshold: minimum length, in characters, of an emitted sentence.

    Returns:
        A new list of merged sentence strings. An empty input yields an
        empty list.
    """
    merged_sentences = []
    pending = ""  # short text still waiting to be joined with what follows
    for sentence in sentences:
        pending += sentence
        if len(pending) >= threshold:
            merged_sentences.append(pending)
            pending = ""

    # Handle a short tail: fold it back into the previous merged sentence so
    # no text is dropped, or keep it on its own when there is no previous one.
    if pending:
        if merged_sentences:
            merged_sentences[-1] += pending
        else:
            merged_sentences.append(pending)
    return merged_sentences

def process_record(record):
    """
    Segment a record's body into sentences and attach the merged result.

    The text under record['body'] is split with the module-level `segmenter`,
    the resulting sentences are passed through merge_short_sentences (default
    threshold), and the outcome is stored under record['body_sentences'].

    Returns:
        The same record object that was passed in, updated in place.
    """
    body_sentences = merge_short_sentences(segmenter.segment(record['body']))
    record['body_sentences'] = body_sentences
    return record


# Segment and merge every dataset in parallel, one task per record.
#
# NOTE: with the "spawn" start method (the default on macOS and Windows) the
# worker processes must be able to import `process_record`. A function defined
# in a notebook cell cannot be unpickled there and the pool dies with
# "Can't get attribute 'process_record' on <module '__main__'>". Move
# process_record and the helpers it uses into an importable module before
# running this cell on such a platform.
for dataset in ['Astro_Reviews.json', 'Earth_Science_Reviews.json', 'Planetary_Reviews.json']:
    print(f"Processing {dataset}...")
    records = load_dataset('data/json/' + dataset)

    # as_completed yields futures in completion order, so remember each
    # future's input position and store its result there: the output then
    # keeps the same record order as the input dataset.
    results = [None] * len(records)
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        future_to_index = {executor.submit(process_record, record): index
                           for index, record in enumerate(records)}
        for future in tqdm(as_completed(future_to_index), total=len(future_to_index)):
            results[future_to_index[future]] = future.result()

    # The file is opened only after every worker has succeeded, so a failed
    # run no longer leaves a truncated, invalid JSON file behind.
    # NOTE(review): this is a scratch path and every dataset overwrites the
    # same file; the per-dataset destination used by the serial version is
    # 'data/sentence_segmented/' + dataset.
    with open('data/testing.json', 'w') as f:
        f.write('[')  # Start the JSON array
        for i, record in enumerate(results):
            if i:
                f.write(',')  # Add a comma between JSON objects
            f.write(json.dumps(record))
        f.write(']')  # End the JSON array

Processing Astro_Reviews.json...
data/json/Astro_Reviews.json: 996/1000 have all required keys


Process SpawnProcess-1:
Traceback (most recent call last):
  File "/Users/benjaminbasseri/miniforge3/envs/citeline/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/benjaminbasseri/miniforge3/envs/citeline/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benjaminbasseri/miniforge3/envs/citeline/lib/python3.11/concurrent/futures/process.py", line 249, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/benjaminbasseri/miniforge3/envs/citeline/lib/python3.11/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_record' on <module '__main__' (built-in)>
Process SpawnProcess-2:
Traceback (most recent call last):


OSError: handle is closed

In [None]:
# Re-read each sentence-segmented dataset, merge its short sentences and write
# the result to the "segmented and merged" directory as one JSON array.
for dataset in ['Astro_Reviews.json', 'Earth_Science_Reviews.json', 'Planetary_Reviews.json']:
    # Bind the loaded records to a name: the records that get merged must be
    # the very same objects that are serialised below, not whatever `records`
    # happened to be left in the kernel by an earlier cell.
    records = load_dataset('data/sentence_segmented/' + dataset)
    for record in records:
        record['body_sentences'] = merge_short_sentences(record['body_sentences'])

    with open('data/sentence_segmented_and_merged/' + dataset, 'w') as f:
        f.write('[')
        for i, record in enumerate(records):
            if i:
                f.write(',')  # separator goes before every record but the first
            f.write(json.dumps(record))
        f.write(']')

    print(f"Dataset {dataset} has been checked.")

12