In [31]:
import bz2
import json
import re

In [32]:
def clean_text(text):
    """Return ``text`` with HTML-style tags and double-quote characters removed.

    Every substring matching the non-greedy pattern ``<.*?>`` is deleted
    first.  Because ``.`` does not match a newline by default, a tag that
    spans a line break is left in place.  Afterwards every ``"`` character
    is dropped; quotes are deleted outright, not substituted.
    """
    tag_free = re.sub('<.*?>', '', text)
    # Delete (not replace) all double-quote characters.
    return tag_free.replace('"', '')

def _build_combined_record(json_obj):
    """Reduce one parsed article to its ``id``, ``title`` and ``combine_text``.

    ``json_obj['text']`` is iterated as a sequence of paragraphs, each of
    which is itself a sequence of strings.  All strings are joined with
    single spaces, passed through ``clean_text`` and prefixed with the
    (uncleaned) title.

    Raises:
        KeyError: if ``id``, ``title`` or ``text`` is missing.
    """
    title = json_obj['title']
    body = " ".join(" ".join(paragraph) for paragraph in json_obj['text'])
    return {
        "id": json_obj['id'],
        "title": title,
        "combine_text": title + " " + clean_text(body),
    }


def preprocess_and_save_json_objects(input_file_path, output_file_path):
    """Convert a bz2-compressed JSON-lines dump into a slimmed-down JSON-lines file.

    Each non-blank line of the input is parsed as one JSON object and written
    to the output as ``{"id", "title", "combine_text"}``, one object per
    line, with non-ASCII characters kept as-is.

    Args:
        input_file_path: path to the bz2-compressed, UTF-8 encoded input.
        output_file_path: path of the UTF-8 text file to (over)write.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``id``, ``title`` or ``text``.
    """
    # Open the input first: if it is missing, we fail before the output file
    # is created or truncated, so an existing result is not wiped.
    with bz2.open(input_file_path, "rt", encoding='utf-8') as source, \
            open(output_file_path, 'w', encoding='utf-8') as sink:
        for line in source:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing in json.loads.
            if not line.strip():
                continue
            record = _build_combined_record(json.loads(line))
            json.dump(record, sink, ensure_ascii=False)
            sink.write('\n')

In [33]:
# Source dump (bz2-compressed, one JSON object per line) and destination file.
input_file_path, output_file_path = 'wiki_00.bz2', 'Processed_wiki.json'

# Run the conversion, then report where the result was written.
preprocess_and_save_json_objects(input_file_path, output_file_path)
print(f"Processed JSON objects have been saved to {output_file_path}")


Processed JSON objects have been saved to Processed_wiki.json
