# START HERE: Make development file
This notebook copies the first `RAW_SAMPLE_SIZE` posts from the 4 GB data lake (a single JSON file) into a smaller file for development.

In [None]:
import ijson
import json
import os

## Configuration

In [None]:
# Total number of posts in the dataset (i.e. every post). Upper bound for any
# sample size -- do not change this.
MAX_SIZE = 55_255

In [None]:
# Number of posts to copy into the development file; may not exceed MAX_SIZE.
RAW_SAMPLE_SIZE = 25000
if MAX_SIZE < RAW_SAMPLE_SIZE:
    raise ValueError(f"RAW_SAMPLE_SIZE cannot be greater than {MAX_SIZE}")

# Input: the full raw file that the sample is read from.
RAW_DATA_PATH = "json/local/posts-11-13-2024.json"
# Output: the development sample; the sample size is part of the file name.
RAW_DATA_OUTPUT_PATH = f"json/local/raw_data_DEVELOPMENT_{RAW_SAMPLE_SIZE}.json"

## Make development input file
Pull out the first `RAW_SAMPLE_SIZE` samples, unedited.

In [None]:
from itertools import islice

# Read only the first RAW_SAMPLE_SIZE items of the raw file.
# islice stops pulling from the parser as soon as the sample is complete, so
# no item beyond the sample is parsed (the previous enumerate/break loop
# parsed one extra item before breaking).
# NOTE: islice raises ValueError if RAW_SAMPLE_SIZE is negative.
with open(RAW_DATA_PATH, "rb") as f:  # Note: ijson needs binary mode
    # The "item" prefix assumes the JSON document is a top-level array of objects.
    data = list(islice(ijson.items(f, "item"), RAW_SAMPLE_SIZE))

# print the first 10 items to verify
print(data[:10])

#### Convert Decimal to float
The parsed raw data contains `Decimal` values, which the `json` module cannot serialize by default.

In [None]:
from decimal import Decimal

class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that writes ``Decimal`` values as JSON numbers.

    Pass it to ``json.dump`` / ``json.dumps`` via ``cls=DecimalEncoder``.
    """

    def default(self, obj):
        """Return a JSON-serializable replacement for ``obj``.

        Args:
            obj: A value the base encoder could not serialize on its own.

        Returns:
            ``float(obj)`` when ``obj`` is a ``Decimal``.

        Raises:
            TypeError: For any other unsupported type (raised by the base
                class implementation).
        """
        if isinstance(obj, Decimal):
            # Convert to float so the field stays numeric in the output;
            # str(obj) would emit a quoted string such as "1.5" instead of 1.5.
            # Precision beyond what a float can hold is lost here.
            return float(obj)
        return super().default(obj)

In [None]:
def get_unique_filename(file_path):
    """
    Return a path that does not exist yet, derived from ``file_path``.

    If ``file_path`` itself is free it is returned unchanged. Otherwise a
    counter in parentheses is inserted before the extension -- ``name(1).ext``,
    ``name(2).ext``, ... -- and the first candidate that does not exist is
    returned. Guards against accidentally overwriting an existing file.
    """
    stem, suffix = os.path.splitext(file_path)
    if not os.path.exists(file_path):
        return file_path

    attempt = 1
    while True:
        candidate = f"{stem}({attempt}){suffix}"
        if not os.path.exists(candidate):
            return candidate
        attempt += 1

#### Write the samples to a file for easier development

In [None]:
# Pick a target path that will not overwrite an existing development file.
unique_output_path = get_unique_filename(RAW_DATA_OUTPUT_PATH)

# Make sure the output directory exists before opening the file for writing.
output_dir = os.path.dirname(RAW_DATA_OUTPUT_PATH)
os.makedirs(output_dir, exist_ok=True)

# DecimalEncoder takes care of the Decimal values the json module rejects.
with open(unique_output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(data, out_file, cls=DecimalEncoder, indent=2)

print(f"Saved {len(data)} posts to {unique_output_path}")

# Done, move on to clean-data.ipynb