In [None]:
import json
import pandas as pd
import boto3
from io import StringIO

# S3 locations: JSON objects are read from the input prefix and the
# converted CSV objects are written under the output prefix.
input_bucket, input_prefix = "s3_bucket", "input_folder/"
output_bucket, output_prefix = "s3_bucket", "output_folder/"

# Single S3 client shared by the functions defined below
s3_client = boto3.client("s3")

### Flatten JSON without parent keys in the header

In [None]:
def flatten_json(nested_json, separator='_'):
    """Recursively flatten nested JSON, keeping only the innermost keys.

    Parent keys are discarded, so the result maps each leaf key directly to
    its value. When the same leaf key occurs more than once (under different
    parents, or in several items of a list of dictionaries), the value seen
    last overwrites the earlier ones.

    Args:
        nested_json: Dictionary decoded from JSON; may contain nested
            dictionaries and lists.
        separator: Unused. Kept so existing calls that pass it keep working;
            there is nothing to join because parent keys are dropped.

    Returns:
        A flat dictionary. Non-empty lists made up solely of dictionaries are
        flattened item by item into the same dictionary; every other list
        (including an empty one) is stored under its own key as a JSON string.
    """
    flattened = {}
    for key, value in nested_json.items():
        if isinstance(value, dict):
            # Merge the child's leaves directly; the parent key is dropped.
            flattened.update(flatten_json(value, separator))
        elif isinstance(value, list):
            # The emptiness check matters: all() is True for an empty list,
            # which would otherwise take this branch, loop zero times and
            # silently drop the key from the output.
            if value and all(isinstance(item, dict) for item in value):
                for item in value:
                    flattened.update(flatten_json(item, separator))
            else:
                # Empty, primitive or mixed lists are kept intact as JSON text.
                flattened[key] = json.dumps(value)
        else:
            # Leaf value: store it under its own (innermost) key.
            flattened[key] = value
    return flattened

#### Code explanation

* The function `flatten_json` takes a nested JSON object and flattens it, keeping only the innermost keys and discarding the hierarchy.

- **Initial Setup:**
* 1. It initializes an empty dictionary flattened to store the flattened key-value pairs.
* 2. It iterates through the key-value pairs of the given JSON.

- **Handling Nested Dictionaries:**
* 1. If a value is a dictionary (dict), the function calls itself recursively to flatten it further.
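* 2. It does not include the parent key in recursion, meaning that hierarchical keys (e.g., "parent.child") will not be preserved. As a consequence, if the same inner key appears under different parents, the value encountered last overwrites the earlier one.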

- **Handling Lists (list):**
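* 1. If the list contains only dictionaries, each dictionary in the list is flattened separately and merged into the same output, so a key repeated across items keeps only the last item's value.
* 2. Otherwise (the list contains at least one non-dictionary item, such as a string or number), the whole list is stored under its own key as a JSON string to keep the structure.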

- **Handling Leaf Nodes:**
* 1. If a value is neither a dictionary nor a list (i.e., a primitive value), it is directly added to the flattened dictionary.

### Convert JSON to CSV

In [2]:
def json_to_csv(file_key):
    """Download one JSON object from S3, flatten it, and upload it as CSV.

    The object is read from ``input_bucket``; the CSV is written to
    ``output_bucket`` under ``output_prefix`` using the source file name with
    its trailing ``.json`` extension swapped for ``.csv``.

    A top-level JSON object produces a single-row CSV. A top-level JSON array
    produces one row per element, each element being flattened on its own.

    Args:
        file_key: S3 key of the JSON object to convert.
    """
    # Get the JSON content from S3
    response = s3_client.get_object(Bucket=input_bucket, Key=file_key)
    json_data = json.loads(response["Body"].read().decode("utf-8"))

    # A top-level array is treated as a list of records; a single object
    # becomes a one-element list so both cases share the same path.
    records = json_data if isinstance(json_data, list) else [json_data]
    df = pd.DataFrame([flatten_json(record) for record in records])

    # Save the CSV content to an in-memory text buffer
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)

    # Swap only the trailing ".json" for ".csv"; str.replace would also
    # rewrite any ".json" that appears earlier in the file name.
    file_name = file_key.rsplit("/", 1)[-1]
    if file_name.endswith(".json"):
        file_name = file_name[: -len(".json")]
    output_file_key = f"{output_prefix}{file_name}.csv"

    # Upload the CSV to S3
    s3_client.put_object(Bucket=output_bucket, Key=output_file_key, Body=csv_buffer.getvalue())

    print(f"Converted and uploaded: {output_file_key}")

### List JSON files from S3

In [3]:
def list_json_files():
    """Return the keys of all ``.json`` objects under ``input_prefix``.

    Pages through ``list_objects_v2`` results for ``input_bucket``. Listing
    stops at the first page that has no ``Contents`` entry or is not marked
    as truncated.
    """
    json_keys = []
    page = s3_client.list_objects_v2(Bucket=input_bucket, Prefix=input_prefix)
    while True:
        # A page without "Contents" ends the listing.
        if "Contents" not in page:
            break
        json_keys.extend(
            entry["Key"] for entry in page["Contents"] if entry["Key"].endswith(".json")
        )
        # No further pages to fetch.
        if not page.get("IsTruncated"):
            break
        page = s3_client.list_objects_v2(
            Bucket=input_bucket,
            Prefix=input_prefix,
            ContinuationToken=page["NextContinuationToken"],
        )
    return json_keys

In [None]:
# Entry point: convert every JSON file found under the input prefix
def main():
    """Run json_to_csv() on each key returned by list_json_files()."""
    for json_key in list_json_files():
        json_to_csv(json_key)


if __name__ == "__main__":
    main()