In [None]:
import json                 # JSON module (not used directly, but kept for clarity)
import os                   # OS utilities for file paths and directories

input_file = "/Users/deepdata/Downloads/Yelp JSON/yelp_dataset/yelp_academic_dataset_review.json"
# Absolute path to the large Yelp JSON file (one JSON object per line)

output_prefix = "split_file_"
# Prefix used for naming output split files (e.g., split_file_1.json)

num_files = 10
# Number of smaller files the large JSON file will be split into

output_dir = "/Users/deepdata/Downloads/yelp_splits"
# Directory where the split files will be saved (must be writable)


def count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*.

    The file is streamed line by line, so it is never loaded into memory
    as a whole.
    """
    with open(path, "r", encoding="utf8") as f:
        return sum(1 for _ in f)


def split_file_by_lines(input_file, output_dir, output_prefix, num_files,
                        total_lines=None):
    """Split *input_file* into *num_files* consecutive line-based parts.

    Parts are written to ``<output_dir>/<output_prefix><k>.json`` for
    ``k = 1 .. num_files``. Every input line ends up in exactly one part:
    when the line count is not divisible by *num_files*, the first
    ``total_lines % num_files`` parts each receive one extra line, so no
    trailing records are dropped.

    Args:
        input_file: Path of the text file to split (one record per line).
        output_dir: Existing, writable directory for the parts.
        output_prefix: File-name prefix of each part.
        num_files: Number of parts to create; must be at least 1.
        total_lines: Line count of *input_file* if already known;
            counted with ``count_lines`` when omitted.

    Returns:
        A list with the number of lines written to each part, in order.

    Raises:
        ValueError: If *num_files* is smaller than 1.
    """
    if num_files < 1:
        raise ValueError(f"num_files must be at least 1, got {num_files}")

    if total_lines is None:
        total_lines = count_lines(input_file)

    # base: lines every part gets; extra: how many leading parts get one more.
    base, extra = divmod(total_lines, num_files)

    written_per_file = []
    with open(input_file, "r", encoding="utf8") as source:
        # A single pass over the input: each part continues where the
        # previous one stopped.
        for index in range(num_files):
            quota = base + 1 if index < extra else base
            output_filename = os.path.join(
                output_dir, f"{output_prefix}{index + 1}.json"
            )

            written = 0
            with open(output_filename, "w", encoding="utf8") as out_file:
                while written < quota:
                    line = source.readline()
                    if not line:
                        # Input ended early (e.g. it shrank after counting).
                        break
                    out_file.write(line)
                    written += 1

            written_per_file.append(written)

    return written_per_file


if __name__ == "__main__":
    # The guard is true both for a notebook cell and for a direct script
    # run, so the split still executes there; it only keeps an import of
    # this code from touching the hard-coded paths.
    os.makedirs(output_dir, exist_ok=True)
    # Create the output directory if it does not already exist

    total_lines = count_lines(input_file)
    # Count total number of lines (each line is one JSON record)

    lines_per_file = total_lines // num_files
    # Minimum number of lines per split file (before distributing the remainder)

    print(f"Total lines: {total_lines}, Lines per file: {lines_per_file}")
    # Display line count information for verification

    if total_lines % num_files:
        # Only reported when the split is uneven
        print(
            f"Remainder: the first {total_lines % num_files} file(s) "
            "receive one extra line"
        )

    split_file_by_lines(
        input_file, output_dir, output_prefix, num_files, total_lines
    )

    print("✅ JSON file successfully split into smaller parts!")
    # Final success message after splitting is complete


Total lines: 6990280, Lines per file: 699028
✅ JSON file successfully split into smaller parts!
