In [26]:
import bz2
import json
import os
import re
import tarfile

In [28]:
def clean_text(text):
    """Strip HTML tags and turn backslash-escaped double quotes into plain quotes.

    Args:
        text: String that may contain ``<...>`` markup and occurrences of a
            backslash immediately followed by a double quote.

    Returns:
        The input with every ``<...>`` span removed and every
        backslash-plus-quote pair replaced by a single double-quote character.
    """
    # Non-greedy match removes each tag on its own, keeping the text that sits
    # between an opening and a closing tag.
    without_tags = re.sub(r'<.*?>', '', text)
    # Unescape the quote rather than dropping it, so quoted passages keep
    # their quotation marks.
    return without_tags.replace('\\"', '"')

In [29]:
def process_json_object(json_obj):
    """Flatten one article object into an id/title/combined-text record.

    Args:
        json_obj: Mapping with the keys ``'id'``, ``'title'`` and ``'text'``.
            ``'text'`` is iterated as a sequence of paragraphs, each of which
            is itself a sequence of strings.

    Returns:
        A dict with ``"id"`` and ``"title"`` copied from the input and
        ``"combine_text"`` holding the title, a space, and the cleaned body.
    """
    # Join the strings of each paragraph first, then join the paragraphs.
    paragraph_strings = []
    for paragraph in json_obj['text']:
        paragraph_strings.append(" ".join(paragraph))
    body = clean_text(" ".join(paragraph_strings))

    record = {"id": json_obj['id'], "title": json_obj['title']}
    record["combine_text"] = record["title"] + " " + body
    return record

In [40]:
def process_bz2_files_in_subfolder(subfolder_path, combined_output_path):
    """Process every .bz2 file in a folder and write the results as one JSON array.

    Each ``.bz2`` file directly inside ``subfolder_path`` is read as text with
    one JSON object per line. Every object is passed through
    ``process_json_object`` and the results are written to
    ``combined_output_path`` as a single indented JSON list.

    Args:
        subfolder_path: Directory whose ``.bz2`` files are read (not recursive).
        combined_output_path: Destination JSON file. Its parent directory is
            created if it does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the folder cannot be listed or a file cannot be read/written.
    """
    combined_output = []
    # os.listdir returns entries in arbitrary order; sorting makes the record
    # order in the output reproducible across runs and machines.
    for filename in sorted(os.listdir(subfolder_path)):
        if not filename.endswith('.bz2'):
            continue
        bz2_file_path = os.path.join(subfolder_path, filename)
        # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
        # the locale default, matching the UTF-8 used for the output file.
        with bz2.open(bz2_file_path, 'rt', encoding='utf-8') as file:
            for line in file:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                combined_output.append(process_json_object(json.loads(line)))

    # A bare file name has an empty dirname, and os.makedirs('') raises, so
    # only create the parent when there is one.
    output_parent = os.path.dirname(combined_output_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Write combined output to a single file.
    with open(combined_output_path, 'w', encoding='utf-8') as output_file:
        json.dump(combined_output, output_file, ensure_ascii=False, indent=4)

In [42]:
def extract_and_process_tar_bz2(input_tar_bz2_path, output_base_directory):
    """Extract a .tar.bz2 archive and write one combined JSON file per directory in it.

    The archive is unpacked into ``output_base_directory``. For every directory
    entry recorded in the archive, ``process_bz2_files_in_subfolder`` is run on
    the extracted directory and its output goes to
    ``<output_base_directory>/processed_wiki_json/<dirname>_combined.json``.

    Args:
        input_tar_bz2_path: Path to the bzip2-compressed tar archive.
        output_base_directory: Directory that receives the extracted tree and
            the ``processed_wiki_json`` output folder.

    Raises:
        tarfile.TarError: If the archive cannot be read, or if the running
            Python supports extraction filters and a member is rejected
            (for example a path that escapes the destination directory).
    """
    output_directory = os.path.join(output_base_directory, 'processed_wiki_json')
    # Create the output folder up front so it exists before any result is written.
    os.makedirs(output_directory, exist_ok=True)

    with tarfile.open(input_tar_bz2_path, "r:bz2") as tar:
        # Unfiltered extraction lets an archive write outside the destination
        # (absolute paths, "..", links). Use the 'data' filter when this Python
        # provides it; older interpreters fall back to plain extraction.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=output_base_directory, filter="data")
        else:
            tar.extractall(path=output_base_directory)
        directory_names = [member.name for member in tar.getmembers() if member.isdir()]

    # NOTE: the output file is named after the directory's basename only, so
    # two archive directories with the same basename write to the same file.
    for directory_name in directory_names:
        subfolder_name = os.path.basename(directory_name)
        subfolder_path = os.path.join(output_base_directory, directory_name)
        combined_output_path = os.path.join(output_directory, f"{subfolder_name}_combined.json")
        process_bz2_files_in_subfolder(subfolder_path, combined_output_path)

In [43]:


# Archive to unpack, and the directory that receives both the extracted tree
# and the combined JSON output. Both paths are relative to the working directory.
input_tar_bz2_path = 'enwiki-20171001-pages-meta-current-withlinks-processed.tar.bz2'
output_base_directory = 'Preprocessed_json_objects'

# Run the extract-and-combine pipeline on the archive.
extract_and_process_tar_bz2(
    input_tar_bz2_path=input_tar_bz2_path,
    output_base_directory=output_base_directory,
)
