In [6]:
"""
This script provides utilities for processing large JSONL files:
1. split_jsonl(): Splits a large JSONL file into numbered smaller files, each holding at most a given number of records
2. slide_window(): Splits long text into sentence-aligned chunks using a sliding window, overlapping consecutive chunks to maintain context
3. process_jsonl(): Applies slide_window() to the 'content' field of every record in a JSONL file and writes one output record per chunk

The main purpose is to handle large text data and prepare it for further processing or model training
by ensuring each chunk stays within token limits while preserving context through overlapping.
"""

import json
import nltk

# Make sure the NLTK 'punkt' tokenizer data is available before any of the
# nltk.tokenize calls further down run. This is a top-level statement, so it
# executes (and prints its download status) every time the file/cell is run.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

def _write_jsonl_chunk(path, records):
    """Write ``records`` to ``path`` as JSONL, one JSON document per line."""
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )


def split_jsonl(input_file, output_prefix, chunk_size):
    """Split a JSONL file into numbered files of at most ``chunk_size`` records.

    Output files are named ``{output_prefix}_1.jsonl``,
    ``{output_prefix}_2.jsonl``, ... and are written in input order. The
    input is streamed line by line, so at most ``chunk_size`` records are
    held in memory at once.

    Args:
        input_file: Path of the UTF-8 encoded JSONL file to read.
        output_prefix: Path prefix used to build the output file names.
        chunk_size: Maximum number of records per output file; must be >= 1.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # A non-positive size would never trigger a flush and would silently
    # buffer the whole input into a single output file.
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size!r}')

    data = []
    file_number = 1
    with open(input_file, 'r', encoding='utf-8') as in_file:
        for line in in_file:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads raise.
            if not line.strip():
                continue
            data.append(json.loads(line))
            if len(data) >= chunk_size:
                _write_jsonl_chunk(f'{output_prefix}_{file_number}.jsonl', data)
                data = []
                file_number += 1

    # Flush the final, partially filled chunk.
    if data:
        _write_jsonl_chunk(f'{output_prefix}_{file_number}.jsonl', data)

def slide_window(content, max_tokens=7000, overlap_tokens=200):
    """Split ``content`` into overlapping chunks cut on sentence boundaries.

    Sentences are obtained with ``nltk.tokenize.sent_tokenize`` and measured
    with ``nltk.tokenize.word_tokenize``. Sentences are accumulated until
    adding the next one would exceed ``max_tokens``; the chunk is then
    emitted and its trailing sentences (at most ``overlap_tokens`` tokens in
    total) are repeated at the start of the next chunk to preserve context.

    Args:
        content: The text to split.
        max_tokens: Token budget for a single chunk.
        overlap_tokens: Upper bound on the tokens carried over from the end
            of one chunk into the beginning of the next.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. The list is empty when ``content`` has no sentences.

    Note:
        Sentences are never split. A single sentence longer than
        ``max_tokens`` is emitted as its own oversized chunk; every other
        chunk stays within ``max_tokens``.
    """
    sentences = nltk.tokenize.sent_tokenize(content)
    chunks = []
    current_chunk = []    # sentences of the chunk being built
    current_lengths = []  # token count of each sentence in current_chunk
    current_length = 0    # sum(current_lengths)

    for sentence in sentences:
        sentence_length = len(nltk.tokenize.word_tokenize(sentence))
        if current_chunk and current_length + sentence_length > max_tokens:
            # The incoming sentence does not fit: emit what we have so far.
            chunks.append(' '.join(current_chunk))

            # Walk backwards over the emitted chunk to pick the overlap.
            # The budget is capped so that overlap + incoming sentence still
            # fits in max_tokens; this also guarantees the overlap is
            # strictly shorter than the chunk just emitted.
            budget = min(overlap_tokens, max_tokens - sentence_length)
            keep = 0
            overlap_length = 0
            for sent_length in reversed(current_lengths):
                if overlap_length + sent_length > budget:
                    break
                overlap_length += sent_length
                keep += 1

            # Reuse the already tokenized sentences and their cached lengths
            # instead of re-joining and re-tokenizing the overlap text.
            start = len(current_chunk) - keep
            current_chunk = current_chunk[start:]
            current_lengths = current_lengths[start:]
            current_length = overlap_length

        current_chunk.append(sentence)
        current_lengths.append(sentence_length)
        current_length += sentence_length

    # Emit the final chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def process_jsonl(input_file, output_file, max_tokens=7000, overlap_tokens=200):
    """Split the 'content' of every JSONL record with a sliding window.

    Each non-blank line of ``input_file`` is parsed as JSON, its ``'content'``
    value is passed to ``slide_window``, and every resulting chunk is written
    to ``output_file`` as its own ``{"content": ...}`` record. Any other
    fields of the input record are not copied to the output.

    Args:
        input_file: Path of the UTF-8 encoded JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).
        max_tokens: Forwarded to ``slide_window``.
        overlap_tokens: Forwarded to ``slide_window``.

    Raises:
        KeyError: If a record has no ``'content'`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            processed_contents = slide_window(
                data['content'],
                max_tokens=max_tokens,
                overlap_tokens=overlap_tokens,
            )
            outfile.writelines(
                json.dumps({'content': content_chunk}, ensure_ascii=False) + '\n'
                for content_chunk in processed_contents
            )

# Demo 1: break one JSONL file into several smaller ones
input_file = './data/input.jsonl'  # JSONL file to be split
output_prefix = './data/output'  # outputs are named <prefix>_<n>.jsonl
chunk_size = 200
split_jsonl(
    input_file=input_file,
    output_prefix=output_prefix,
    chunk_size=chunk_size,
)

# Demo 2: cut long 'content' fields into overlapping chunks
input_jsonl = './data/source.jsonl'
output_jsonl = './data/processed.jsonl'
process_jsonl(input_file=input_jsonl, output_file=output_jsonl)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gatsby\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gatsby\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
