# JSON File Operations

## Learning Objectives
By the end of this notebook, you will be able to:
- Read and write JSON files using Python's json module
- Parse nested JSON structures effectively
- Handle JSON parsing exceptions and errors
- Work with JSON Lines (JSONL) format
- Transform and manipulate JSON data for data engineering tasks

## 1. Basic JSON Reading and Writing

In [None]:
import json
from typing import Dict, List, Any, Union, Optional
import os

# Build a representative user record: scalars, one nested object, two arrays
sample_user_data = {
    "user_id": "USR001",
    "name": "Alice Johnson",
    "email": "alice.johnson@example.com",
    "age": 28,
    "is_active": True,
    # Nested object
    "preferences": dict(theme="dark", notifications=True, language="en"),
    # Array of scalars
    "skills": ["Python", "SQL", "Data Analysis"],
    # Array of objects, one per (name, status, start_date) row
    "projects": [
        {"name": project_name, "status": project_status, "start_date": started_on}
        for project_name, project_status, started_on in (
            ("Data Pipeline", "completed", "2024-01-15"),
            ("Analytics Dashboard", "in_progress", "2024-02-01"),
        )
    ],
}

# Pretty-print the record as indented JSON text under a heading line
print("Sample user data:", json.dumps(sample_user_data, indent=2), sep="\n")

In [None]:
# Writing JSON to file
def write_json_file(data: Union[Dict[str, Any], List[Any]], filename: str, indent: int = 2) -> bool:
    """
    Write data to JSON file.

    The data is serialized to a string before the file is opened, so a
    serialization failure (for example a value that json cannot encode)
    does not truncate or partially overwrite an existing file.

    Args:
        data: Data to write (dict or list)
        filename: Output filename
        indent: JSON indentation for readability

    Returns:
        True if successful, False otherwise
    """
    try:
        # Serialize first: open(..., 'w') truncates the target immediately,
        # so encoding errors must surface before the file is touched.
        payload = json.dumps(data, indent=indent, ensure_ascii=False)
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(payload)
        return True
    except Exception as e:
        # Best-effort contract: report the problem and signal failure
        # to the caller instead of raising.
        print(f"Error writing JSON file: {e}")
        return False

# Persist the sample record, then preview what landed on disk
success = write_json_file(sample_user_data, 'user_data.json')
if success:
    print("Successfully wrote user_data.json")

    # Read the file back as raw text to confirm the write
    with open('user_data.json', 'r', encoding='utf-8') as file:
        content = file.read()
        print("\nFile contents:")
        # Anything longer than 200 characters is cut to a preview
        if len(content) > 200:
            print(content[:200] + "...")
        else:
            print(content)

In [None]:
# Reading JSON from file
def read_json_file(filename: str) -> Optional[Union[Dict[str, Any], List[Any]]]:
    """
    Load and parse a JSON file.

    Every failure is reported with a printed message rather than raised.

    Args:
        filename: JSON file to read

    Returns:
        Parsed JSON data or None if error
    """
    parsed = None  # stays None unless the load succeeds
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON in {filename}: {exc}")
    except Exception as exc:
        # Any other failure, e.g. a path that cannot be opened or decoded
        print(f"Error reading {filename}: {exc}")
    return parsed

# Round-trip check: load the file written earlier and inspect a few fields
loaded_data = read_json_file('user_data.json')

if loaded_data:
    print("Successfully loaded JSON data:")
    print("User:", loaded_data['name'])
    print("Email:", loaded_data['email'])
    print("Skills:", loaded_data['skills'])
    print("Number of projects:", len(loaded_data['projects']))

    # Nested objects are reached by chaining keys
    theme = loaded_data['preferences']['theme']
    print("Preferred theme:", theme)

## 2. Working with JSON Strings

In [None]:
# Converting between JSON strings and Python objects

# Python object destined for serialization
product_data = {
    "product_id": "PRD001",
    "name": "Wireless Headphones",
    "price": 99.99,
    "in_stock": True,
    "categories": ["Electronics", "Audio"],
    "specifications": dict(
        battery_life="20 hours",
        connectivity="Bluetooth 5.0",
        weight="250g",
    ),
}

# Serialize to an indented JSON string and report its type and size
json_string = json.dumps(product_data, indent=2)
print("Product data as JSON string:", json_string, sep="\n")
print("\nJSON string type:", type(json_string))
print("JSON string length:", len(json_string), "characters")

In [None]:
# JSON string to Python object
json_response = '''
{
    "status": "success",
    "data": {
        "users": [
            {
                "id": 1,
                "name": "John Doe",
                "active": true
            },
            {
                "id": 2,
                "name": "Jane Smith",
                "active": false
            }
        ],
        "total_count": 2
    },
    "timestamp": "2024-01-15T10:30:00Z"
}
'''

# Decode first; json.loads is the only step that can raise JSONDecodeError,
# so the reporting below runs in the else branch once decoding succeeded.
try:
    parsed_response = json.loads(json_response)
except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")
else:
    print("Parsed JSON response:")
    print("Status:", parsed_response['status'])
    print("Total users:", parsed_response['data']['total_count'])
    print("Timestamp:", parsed_response['timestamp'])

    # Walk the user list and label each entry by its "active" flag
    users = parsed_response['data']['users']
    print("\nUsers:")
    for user in users:
        if user['active']:
            status = "Active"
        else:
            status = "Inactive"
        print(f"  {user['name']} (ID: {user['id']}) - {status}")

## 3. Handling Nested JSON Structures

In [None]:
# Complex nested JSON structure: a company object plus a list of departments,
# each holding employees that in turn hold skill and project lists
company_data = {
    "company": dict(
        name="TechCorp Inc.",
        founded=2010,
        headquarters=dict(
            address="123 Tech Street",
            city="San Francisco",
            state="CA",
            country="USA",
            coordinates=dict(latitude=37.7749, longitude=-122.4194),
        ),
    ),
    "departments": [
        dict(
            name="Engineering",
            head="Alice Johnson",
            employees=[
                dict(
                    id="ENG001",
                    name="Bob Wilson",
                    role="Senior Developer",
                    skills=["Python", "JavaScript", "SQL"],
                    projects=[
                        dict(name="API Gateway", status="completed"),
                        dict(name="Data Pipeline", status="in_progress"),
                    ],
                ),
                dict(
                    id="ENG002",
                    name="Carol Davis",
                    role="Data Engineer",
                    skills=["Python", "SQL", "Apache Spark"],
                    projects=[
                        dict(name="ETL Pipeline", status="completed"),
                    ],
                ),
            ],
        ),
        dict(
            name="Marketing",
            head="David Brown",
            employees=[
                dict(
                    id="MKT001",
                    name="Eve Miller",
                    role="Marketing Specialist",
                    skills=["Digital Marketing", "Analytics"],
                    projects=[
                        dict(name="Campaign Analysis", status="in_progress"),
                    ],
                ),
            ],
        ),
    ],
}

# Save the structure to disk for the cells that follow
write_json_file(company_data, 'company_data.json')
print("Created complex company data JSON file")

In [None]:
# Function to safely access nested JSON data
def get_nested_value(data: Union[Dict[str, Any], List[Any]], keys: List[Union[str, int]], default: Any = None) -> Any:
    """
    Safely access a value nested inside dictionaries and lists.

    Dictionaries are traversed by key, exactly as before. Lists (JSON
    arrays) can additionally be traversed with an integer index, so a path
    such as ['departments', 0, 'name'] works. Negative indices count from
    the end of the list, as in normal Python indexing.

    Args:
        data: Dictionary (or list) to search
        keys: List of keys / list indices to traverse, outermost first
        default: Default value if path doesn't exist

    Returns:
        Value at nested path or default. An empty path returns data itself.
    """
    current: Any = data
    for key in keys:
        if isinstance(current, dict):
            if key not in current:
                return default
            current = current[key]
        elif (
            isinstance(current, list)
            and isinstance(key, int)
            # bool is a subclass of int; True/False are not meant as indices
            and not isinstance(key, bool)
            and -len(current) <= key < len(current)
        ):
            current = current[key]
        else:
            # Scalar reached too early, wrong key type, or index out of range
            return default
    return current

# Load the saved company file and report on its contents
company = read_json_file('company_data.json')

if company:
    # Pull single values out of the nested "company" object by key path
    company_name = get_nested_value(company, ['company', 'name'])
    city = get_nested_value(company, ['company', 'headquarters', 'city'])
    latitude = get_nested_value(company, ['company', 'headquarters', 'coordinates', 'latitude'])

    print("Company:", company_name)
    print("Location:", city)
    print("Latitude:", latitude)

    # Walk departments, then the employees inside each one
    departments = company.get('departments', [])
    print("\nDepartments:", len(departments))

    for dept in departments:
        dept_name = dept.get('name', 'Unknown')
        head = dept.get('head', 'Unknown')
        employees = dept.get('employees', [])

        print(f"\n{dept_name} Department:")
        print("  Head:", head)
        print("  Employees:", len(employees))

        # One summary per employee; missing fields fall back to defaults
        for emp in employees:
            name = emp.get('name', 'Unknown')
            role = emp.get('role', 'Unknown')
            skills = emp.get('skills', [])
            projects = emp.get('projects', [])

            print(f"    {name} ({role})")
            print("      Skills:", ', '.join(skills))
            print("      Projects:", len(projects))

## 4. JSON Lines (JSONL) Format

In [None]:
# JSON Lines format - one JSON object per line
# This is common for streaming data and log files

# Sample log records, expanded from (timestamp, level, message, user_id) rows
log_entries = [
    {"timestamp": stamp, "level": severity, "message": text, "user_id": who}
    for stamp, severity, text, who in (
        ("2024-01-15T10:00:00Z", "INFO", "Application started", None),
        ("2024-01-15T10:01:00Z", "INFO", "User login", "USR001"),
        ("2024-01-15T10:02:00Z", "WARNING", "High memory usage", None),
        ("2024-01-15T10:03:00Z", "ERROR", "Database connection failed", "USR002"),
        ("2024-01-15T10:04:00Z", "INFO", "User logout", "USR001"),
    )
]

# Write JSONL file
def write_jsonl_file(data: List[Dict[str, Any]], filename: str) -> bool:
    """
    Write data to JSON Lines file (one JSON object per line).

    All records are serialized before the file is opened, so a record that
    cannot be encoded does not leave a truncated or half-written file
    behind. The encoded lines are held in memory alongside the input list
    while the file is written.

    Args:
        data: List of dictionaries to write
        filename: Output filename

    Returns:
        True if successful, False otherwise
    """
    try:
        # Encode everything up front: open(..., 'w') truncates immediately,
        # so a bad record must be detected before the file is touched.
        lines = [json.dumps(record, ensure_ascii=False) + '\n' for record in data]
        with open(filename, 'w', encoding='utf-8') as file:
            file.writelines(lines)
        return True
    except Exception as e:
        # Best-effort contract: report the problem and signal failure
        # to the caller instead of raising.
        print(f"Error writing JSONL file: {e}")
        return False

# Dump the sample log records as JSON Lines, then echo the raw file
success = write_jsonl_file(log_entries, 'application.jsonl')
if success:
    print("Created application.jsonl file")

    # Print the file verbatim so the one-object-per-line layout is visible
    with open('application.jsonl', 'r', encoding='utf-8') as file:
        content = file.read()
        print("\nJSONL file contents:", content, sep="\n")

In [None]:
# Read JSONL file
def read_jsonl_file(filename: str) -> List[Dict[str, Any]]:
    """
    Parse a JSON Lines file into a list of objects.

    Blank lines are ignored. A line that is not valid JSON is reported with
    its line number and skipped. File-level errors are printed, and whatever
    was parsed up to that point is still returned.

    Args:
        filename: JSONL file to read

    Returns:
        List of parsed JSON objects
    """
    records: List[Dict[str, Any]] = []

    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            for line_num, raw_line in enumerate(handle, start=1):
                text = raw_line.strip()
                if text:
                    try:
                        records.append(json.loads(text))
                    except json.JSONDecodeError as exc:
                        # One bad line should not abort the whole file
                        print(f"Error parsing line {line_num}: {exc}")
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except Exception as exc:
        print(f"Error reading file: {exc}")

    return records

# Load the JSONL log and summarize it
logs = read_jsonl_file('application.jsonl')

print("Loaded", len(logs), "log entries")

# Tally entries per level and group entries by user
level_counts = {}
user_activities = {}

for log in logs:
    level = log.get('level', 'UNKNOWN')
    user_id = log.get('user_id')

    # Entries with no "level" key are counted under 'UNKNOWN'
    level_counts[level] = level_counts.get(level, 0) + 1

    # Entries whose user_id is missing or falsy (e.g. null) are not grouped
    if user_id:
        user_activities.setdefault(user_id, []).append(log)

print("\nLog level distribution:", level_counts)
print("Users with activities:", list(user_activities))

# Per-user listing of timestamped messages, in file order
for user_id, activities in user_activities.items():
    print(f"\n{user_id} activities:")
    for activity in activities:
        timestamp = activity['timestamp']
        message = activity['message']
        print(f"  {timestamp}: {message}")