# JSON to CSV Converter

This notebook converts JSON files from web scraping data to CSV format. Features include:

- Automatic processing of all JSON files in the data directory
- Handles nested JSON structures
- Data validation and transformation
- Progress tracking for large files
- Error handling and logging
- UTF-8 encoding support

The script will create CSV files with the same names as the input JSON files.

In [1]:
import json
import csv
from pathlib import Path
import logging
from typing import Dict, List, Any, Optional
from tqdm import tqdm
import pandas as pd

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_form_fields(data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract form fields with specific columns from scraped data"""
    rows = []
    
    # Get input fields
    input_fields = data.get('input_fields', [])
    if isinstance(input_fields, list):
        for field in input_fields:
            row = {
                'url': data.get('url', ''),
                'name': field.get('name', ''),
                'input_type': field.get('type', ''),
                'required': field.get('required', False),
                'selector_hint': field.get('selector', ''),  # Extract selector hints if present
                'notes': field.get('notes', ''),  # Extract any notes
                'label': field.get('label', ''),  # Keep label for context
                'options': ','.join(str(opt) for opt in field.get('options', [])) if field.get('options') else ''
            }
            rows.append(row)
    
    return rows

def convert_json_to_csv(json_file: Path) -> None:
    """Convert scraped JSON data to CSV format focusing on form field properties"""
    try:
        # Read JSON file
        with json_file.open('r', encoding='utf-8') as jf:
            data = json.load(jf)
        
        if not data:
            logging.warning(f"No data in {json_file}")
            return
            
        # Handle single object or list of objects
        if not isinstance(data, list):
            data = [data]
            
        # Extract form fields from each entry
        all_rows = []
        for entry in data:
            rows = extract_form_fields(entry)
            all_rows.extend(rows)
            
        if not all_rows:
            logging.warning(f"No form fields found in {json_file}")
            return
            
        # Create CSV file with same name but .csv extension
        csv_file = json_file.with_suffix('.csv')
        
        # Define column order
        core_columns = [
            'url',
            'name',
            'input_type',
            'required',
            'selector_hint',
            'notes',
            'label',
            'options'
        ]
        
        # Write to CSV
        with csv_file.open('w', newline='', encoding='utf-8') as cf:
            writer = csv.DictWriter(cf, fieldnames=core_columns)
            writer.writeheader()
            writer.writerows(all_rows)
            
        # Validate and summarize
        df = pd.read_csv(csv_file)
        logging.info(f"Successfully converted {json_file} to {csv_file}")
        logging.info(f"Extracted {len(df)} form fields")
        
        # Print summary of input types found
        type_counts = df['input_type'].value_counts()
        logging.info("\nInput type summary:")
        for input_type, count in type_counts.items():
            logging.info(f"  {input_type}: {count} fields")
        
        # Print required fields summary
        required_count = df['required'].sum()
        logging.info(f"\nRequired fields: {required_count} out of {len(df)}")
        
    except json.JSONDecodeError:
        logging.error(f"Invalid JSON format in {json_file}")
    except Exception as e:
        logging.error(f"Error processing {json_file}: {str(e)}")

# Process all JSON files in the data directory
data_dir = Path('data')
json_files = list(data_dir.glob('*.json'))

if not json_files:
    logging.warning("No JSON files found in the data directory")
else:
    for json_file in tqdm(json_files, desc="Processing files"):
        convert_json_to_csv(json_file)
        
logging.info("Processing complete")

Processing files: 100%|██████████| 6/6 [00:00<00:00, 246.95it/s]
2025-08-12 23:41:42,085 - INFO - Processing complete

2025-08-12 23:41:42,085 - INFO - Processing complete


In [2]:
# Process just the avila json file
json_file = Path('data/https_www.avilacapllc.com_.json')
print(f"Processing {json_file.name}...")

# Read and show the JSON content first
with json_file.open('r', encoding='utf-8') as f:
    data = json.load(f)
    print("\nJSON content:")
    print(json.dumps(data, indent=2))

# Now convert to CSV
convert_json_to_csv(json_file)



Processing https_www.avilacapllc.com_.json...

JSON content:
{
  "url": "https://www.avilacapllc.com/contact-us",
  "html": "<!DOCTYPE html><html data-wf-domain=\"www.avilacapllc.com\" data-wf-page=\"654e78c50508a57000502b7f\" data-wf-site=\"654e78c50508a57000502b7c\" lang=\"en\" class=\"w-mod-js w-mod-ix wf-sweetsanspro-n6-active wf-sweetsanspro-n7-active wf-active\"><head><style>.wf-force-outline-none[tabindex=\"-1\"]:focus{outline:none;}</style><meta charset=\"utf-8\"><title>Avila Real Estate Capital</title><meta content=\"Avila Real Estate Capital is a full-service lender for developers and builders. We originate, underwrite and service loans for acquisition, development and construction projects. Our vertically integrated team is made up of developer and builder experts which enables us to provide responsive and competitive solutions for our clients.\" name=\"description\"><meta content=\"Avila Real Estate Capital\" property=\"og:title\"><meta content=\"Avila Real Estate Capital i

In [7]:
!pip install aiofiles

Defaulting to user installation because normal site-packages is not writeable
Collecting aiofiles
  Using cached aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Using cached aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-24.1.0




In [11]:
import json, csv, aiofiles

async def json_to_csv(json_file, csv_file):
    # Async read JSON
    async with aiofiles.open(json_file, 'r') as jf:
        data = json.loads(await jf.read())

    # Sync write CSV
    with open(csv_file, 'w', newline='') as cf:
        writer = csv.DictWriter(cf, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

# In Jupyter
await json_to_csv(
    r'C:\Users\bhava\OneDrive\Desktop\DWS\data\https_www.avilacapllc.com_.json',
    r'C:\Users\bhava\OneDrive\Desktop\DWS\data\https_www.avilacapllc.com_.csv'
)


  for group in groupby(strings, lambda s: s[0] == first[0])) \


KeyError: 0