# Dictionary Operations for Data Engineering

## Learning Objectives
By the end of this notebook, you will be able to:
- Create and manipulate Python dictionaries
- Access, add, update, and remove dictionary elements
- Use dictionary methods for data processing
- Work with nested dictionaries for complex data structures
- Apply dictionaries to common data engineering scenarios

## 1. Creating and Basic Dictionary Operations

In [None]:
# Creating dictionaries: each "key": value entry maps a unique key to a value
student_grades = {
    "Alice": 85,
    "Bob": 92,
    "Charlie": 78
}

# Dictionary with mixed value types (int, str, bool, and a list in one dict)
user_profile = {
    "user_id": 12345,
    "name": "John Doe",
    "email": "john.doe@example.com",
    "age": 28,
    "is_active": True,
    "skills": ["Python", "SQL", "Data Analysis"]
}

# Printing a dict shows its key-value pairs in insertion order
print(f"Student grades: {student_grades}")
print(f"User profile: {user_profile}")
# len() counts top-level pairs only; the "skills" list counts as a single value
print(f"Dictionary length: {len(user_profile)}")

## 2. Accessing Dictionary Elements

In [None]:
# Different ways to access dictionary values
product_info = {
    "product_id": "PRD001",
    "name": "Laptop",
    "price": 999.99,
    "category": "Electronics",
    "in_stock": True
}

# Direct access (raises KeyError if key doesn't exist)
# Prefer this form when a missing key should be treated as an error
product_name = product_info["name"]
print(f"Product name: {product_name}")

# Safe access using get() method: a missing key never raises
price = product_info.get("price")
discount = product_info.get("discount")  # Returns None if not found
warranty = product_info.get("warranty", "No warranty")  # Default value

print(f"Price: {price}")
# "discount" is not a key of product_info, so None is printed
print(f"Discount: {discount}")
# "warranty" is not a key of product_info, so the supplied default is printed
print(f"Warranty: {warranty}")

# Check if key exists: the `in` operator tests keys, not values
has_category = "category" in product_info
has_weight = "weight" in product_info

print(f"Has category: {has_category}")
print(f"Has weight: {has_weight}")

## 3. Adding, Updating, and Removing Elements

In [None]:
# Start with a basic employee record
employee = {
    "emp_id": "EMP001",
    "name": "Alice Johnson",
    "department": "Engineering"
}

print(f"Initial employee: {employee}")

# Adding new key-value pairs: assigning to a key that is absent inserts it
employee["salary"] = 75000
employee["hire_date"] = "2024-01-15"
employee["skills"] = ["Python", "SQL"]

print(f"After additions: {employee}")

# Updating existing values: assigning to a key that is present replaces its value
employee["salary"] = 80000  # Promotion!
# The stored list is modified in place, so the key does not need reassigning
employee["skills"].append("Machine Learning")  # New skill

print(f"After updates: {employee}")

# Removing elements
# pop(key) with no default raises KeyError when the key is missing
removed_value = employee.pop("hire_date")  # Remove and return value
print(f"Removed hire_date: {removed_value}")

# Remove key if it exists, otherwise return default
# "bonus" was never added above, so pop() returns 0 and the dict is unchanged
bonus = employee.pop("bonus", 0)
print(f"Bonus (default): {bonus}")

# Delete using del keyword: removes the entry without returning its value
# (like pop() without a default, it raises KeyError when the key is missing)
del employee["skills"]
print(f"After deletion: {employee}")

## 4. Dictionary Methods and Iteration

In [None]:
# Sample sales data for demonstration
monthly_sales = {
    "January": 15000,
    "February": 18000,
    "March": 22000,
    "April": 19000
}

print(f"Monthly sales: {monthly_sales}")
print()

# Get all keys, values, and items
# keys(), values() and items() return view objects; list() copies each into a list
months = list(monthly_sales.keys())
sales_amounts = list(monthly_sales.values())
sales_items = list(monthly_sales.items())  # list of (key, value) tuples

print(f"Months: {months}")
print(f"Sales amounts: {sales_amounts}")
print(f"Sales items: {sales_items}")
print()

# Iterating through dictionaries
# Looping over the dict itself yields its keys; each value is then looked up by key
print("Iterating through keys:")
for month in monthly_sales:
    # The ":," format spec inserts thousands separators (15000 -> 15,000)
    print(f"{month}: ${monthly_sales[month]:,}")

# items() yields (key, value) pairs, which avoids the extra lookup per key
print("\nIterating through key-value pairs:")
for month, sales in monthly_sales.items():
    print(f"{month}: ${sales:,}")

# values() is enough when the keys are not needed
print("\nIterating through values only:")
for sales in monthly_sales.values():
    print(f"${sales:,}")

# Calculate total and average
total_sales = sum(monthly_sales.values())
# len() of a dict is its number of keys, i.e. the number of months recorded
average_sales = total_sales / len(monthly_sales)

print(f"\nTotal sales: ${total_sales:,}")
# ":,.2f" adds thousands separators and rounds to two decimal places
print(f"Average monthly sales: ${average_sales:,.2f}")

## 5. Dictionary Merging and Updating

In [None]:
import sys

# Basic user information
basic_info = {
    "user_id": "USR001",
    "name": "Sarah Wilson",
    "email": "sarah@example.com"
}

# Additional contact information
contact_info = {
    "phone": "+1-555-0123",
    "address": "123 Main St, City, State",
    "email": "sarah.wilson@company.com"  # This will override the email
}

# Work information
work_info = {
    "department": "Data Engineering",
    "position": "Senior Data Engineer",
    "start_date": "2023-06-01"
}

print(f"Basic info: {basic_info}")
print(f"Contact info: {contact_info}")
print(f"Work info: {work_info}")
print()

# In every method below, when the same key appears in more than one source
# the right-most (last merged) dictionary wins.

# Method 1: Using update() method
# update() mutates in place, so merge into a copy to leave basic_info untouched
complete_profile_v1 = basic_info.copy()
complete_profile_v1.update(contact_info)
complete_profile_v1.update(work_info)

print(f"Complete profile (using update): {complete_profile_v1}")
print()

# Method 2: Using dictionary unpacking (Python 3.5+)
# Builds a brand-new dict; none of the source dicts is modified
complete_profile_v2 = {**basic_info, **contact_info, **work_info}
print(f"Complete profile (using unpacking): {complete_profile_v2}")
print()

# Method 3: Using union operator (Python 3.9+)
# Guarded by a version check so the cell still runs on older interpreters
if sys.version_info >= (3, 9):
    complete_profile_v3 = basic_info | contact_info | work_info
    print(f"Complete profile (using union): {complete_profile_v3}")
    print()

# Layering settings over defaults: every default is kept unless the user
# supplied their own value for that key
defaults = {
    "theme": "dark",
    "notifications": True,
    "language": "en"
}

user_settings = {
    "theme": "light",  # User preference overrides default
    "timezone": "UTC-5"
}

# Merge with defaults, keeping user preferences
# (defaults go first so that user_settings, unpacked last, takes precedence)
final_settings = {**defaults, **user_settings}
print(f"Final settings: {final_settings}")

## 6. Nested Dictionaries

In [None]:
# Complex nested data structure for a company
# Three top-level sections, each of which is itself a dictionary
company_data = {
    # Flat record describing the company
    "company_info": {
        "name": "TechCorp",
        "founded": 2010,
        "headquarters": "San Francisco, CA"
    },
    # Department name -> record with the same three fields per department
    "departments": {
        "engineering": {
            "head": "Alice Johnson",
            "employees": 25,
            "budget": 2500000
        },
        "marketing": {
            "head": "Bob Smith",
            "employees": 12,
            "budget": 800000
        },
        "sales": {
            "head": "Charlie Brown",
            "employees": 18,
            "budget": 1200000
        }
    },
    # Quarter label -> revenue figure
    "quarterly_revenue": {
        "Q1": 1500000,
        "Q2": 1800000,
        "Q3": 2100000,
        "Q4": 2300000
    }
}

# Accessing nested data
# Chain one [key] lookup per nesting level; any missing key raises KeyError
company_name = company_data["company_info"]["name"]
engineering_head = company_data["departments"]["engineering"]["head"]
q3_revenue = company_data["quarterly_revenue"]["Q3"]

print(f"Company: {company_name}")
print(f"Engineering head: {engineering_head}")
print(f"Q3 Revenue: ${q3_revenue:,}")
print()

# Safe access to nested data
def safe_get_nested(data, keys, default=None):
    """Follow ``keys`` down through nested dictionaries and return the result.

    Args:
        data: The outermost dictionary to search.
        keys: Iterable of keys, ordered from the outermost level inward.
        default: Value returned when the path cannot be followed.

    Returns:
        The value at the end of the path, or ``default`` as soon as the
        current level is not a dict or does not contain the next key.
        An empty ``keys`` returns ``data`` itself.
    """
    node = data
    for step in keys:
        # Stop at the first level that cannot be descended into
        if not isinstance(node, dict) or step not in node:
            return default
        node = node[step]
    return node

# Try the helper on one path that is missing ("hr") and one that exists
hr_budget = safe_get_nested(
    company_data,
    ["departments", "hr", "budget"],
    default=0,
)
marketing_employees = safe_get_nested(
    company_data,
    ["departments", "marketing", "employees"],
)

print(f"HR budget (safe access): ${hr_budget:,}")
print(f"Marketing employees: {marketing_employees}")

# Accumulate headcount and budget over every department in a single pass
departments = company_data["departments"]
total_employees = 0
total_budget = 0
for dept in departments.values():
    total_employees += dept["employees"]
    total_budget += dept["budget"]

print(f"\nTotal employees: {total_employees}")
print(f"Total department budgets: ${total_budget:,}")

# One summary line per department, with the department name title-cased
print("\nDepartment Summary:")
for dept_name, dept_info in departments.items():
    print(f"{dept_name.title()}: {dept_info['employees']} employees, ${dept_info['budget']:,} budget")

## 7. Dictionary Comprehensions and Transformations

In [None]:
# Sample data for transformations
# Scores are stored as strings here, as they would be after reading raw text
raw_scores = {
    "Alice": "85",
    "Bob": "92",
    "Charlie": "78",
    "Diana": "96",
    "Eve": "88"
}

# Convert string scores to integers
# A dict comprehension builds a new dict: {key_expr: value_expr for ... in ...}
# (int() raises ValueError for any string that is not a valid integer)
numeric_scores = {name: int(score) for name, score in raw_scores.items()}
print(f"Numeric scores: {numeric_scores}")

# Filter high performers (score >= 90)
# The trailing `if` keeps only the pairs for which the condition is true
high_performers = {name: score for name, score in numeric_scores.items() if score >= 90}
print(f"High performers: {high_performers}")

# Create grade categories
def get_grade(score):
    """Map a numeric score to a letter grade.

    Cutoffs are checked from highest to lowest: 90 and above is "A",
    80 and above is "B", 70 and above is "C"; anything lower is "F".
    """
    cutoffs = ((90, "A"), (80, "B"), (70, "C"))
    for minimum, letter in cutoffs:
        if score >= minimum:
            return letter
    # No cutoff was reached
    return "F"

# Build a name -> letter-grade mapping by applying get_grade to every score
grades = {name: get_grade(score) for name, score in numeric_scores.items()}
print(f"Grades: {grades}")

# Invert dictionary (swap keys and values)
# NOTE: dictionary keys are unique, so this is only lossless while every
# score is distinct. If two students shared a score, the later name would
# silently overwrite the earlier one. Group into lists (as done below for
# grades) when values can repeat.
score_to_name = {score: name for name, score in numeric_scores.items()}
print(f"Score to name mapping: {score_to_name}")

# Group by grade
# setdefault() returns the list already stored for a grade, or inserts and
# returns a new empty list the first time that grade is seen
students_by_grade = {}
for name, grade in grades.items():
    students_by_grade.setdefault(grade, []).append(name)

print(f"Students by grade: {students_by_grade}")

## 8. Practical Data Engineering Examples

In [None]:
# Example 1: Processing log data
# Each entry is one log record with the same four string fields
log_entries = [
    {"timestamp": "2024-01-15 10:30:00", "level": "INFO", "message": "User login", "user_id": "user1"},
    {"timestamp": "2024-01-15 10:31:00", "level": "ERROR", "message": "Database connection failed", "user_id": "user2"},
    {"timestamp": "2024-01-15 10:32:00", "level": "INFO", "message": "User logout", "user_id": "user1"},
    {"timestamp": "2024-01-15 10:33:00", "level": "WARNING", "message": "High memory usage", "user_id": "user3"},
    {"timestamp": "2024-01-15 10:34:00", "level": "ERROR", "message": "API timeout", "user_id": "user2"}
]

# Count log levels
# get(level, 0) supplies 0 the first time a level is seen, so no
# separate "is this key present?" check is needed
level_counts = {}
for entry in log_entries:
    level = entry["level"]
    level_counts[level] = level_counts.get(level, 0) + 1

print(f"Log level counts: {level_counts}")

# Group by user
# setdefault() returns the list already stored for a user_id, or inserts
# and returns a new empty list the first time that user_id is seen
user_activities = {}
for entry in log_entries:
    user_activities.setdefault(entry["user_id"], []).append(entry)

print("\nUser activities:")
for user_id, activities in user_activities.items():
    print(f"{user_id}: {len(activities)} activities")

# Example 2: Product inventory management
# Product ID -> record holding name, quantity on hand, unit price and category
inventory = {
    "PRD001": {"name": "Laptop", "quantity": 50, "price": 999.99, "category": "Electronics"},
    "PRD002": {"name": "Mouse", "quantity": 200, "price": 29.99, "category": "Electronics"},
    "PRD003": {"name": "Desk", "quantity": 25, "price": 299.99, "category": "Furniture"},
    "PRD004": {"name": "Chair", "quantity": 30, "price": 199.99, "category": "Furniture"}
}

# Calculate total inventory value (quantity * unit price, summed over products)
total_value = sum(item["quantity"] * item["price"] for item in inventory.values())
print(f"\nTotal inventory value: ${total_value:,.2f}")

# Find low stock items: quantity strictly below the threshold, so a product
# with exactly LOW_STOCK_THRESHOLD units is not reported
LOW_STOCK_THRESHOLD = 30
low_stock = {
    product_id: details
    for product_id, details in inventory.items()
    if details["quantity"] < LOW_STOCK_THRESHOLD
}
print(f"Low stock items: {list(low_stock.keys())}")

# Group by category
# setdefault() returns the list already stored for a category, or inserts
# and returns a new empty list the first time that category is seen
by_category = {}
for product_id, details in inventory.items():
    by_category.setdefault(details["category"], []).append(product_id)

print(f"Products by category: {by_category}")

## Practice Exercises

Complete the following exercises to reinforce your understanding:

In [None]:
# Exercise 1: Create a function to merge customer data
def merge_customer_data(basic_data: dict, additional_data: dict) -> dict:
    """Merge customer data dictionaries, with additional_data taking precedence.

    This is an unimplemented exercise stub: until the TODO is completed the
    function returns None.

    Args:
        basic_data: The customer's existing fields.
        additional_data: Extra fields; where a key appears in both
            dictionaries, the value from this one should win.

    Returns:
        Intended: a dictionary containing the keys of both inputs.
    """
    # TODO: Implement this function
    # NOTE(review): the exercise does not say whether basic_data may be
    # modified in place; returning a new dict is the safer choice
    pass

# Test data
basic_customer = {
    "customer_id": "CUST001",
    "name": "John Doe",
    "email": "john@old-email.com"
}

# Shares the "email" key with basic_customer and adds two new keys
additional_customer = {
    "email": "john@new-email.com",
    "phone": "+1-555-0123",
    "address": "123 Main St"
}

# Expected: email should be updated, other fields added
# Uncomment the two lines below once the function is implemented
# merged = merge_customer_data(basic_customer, additional_customer)
# print(f"Merged customer: {merged}")

In [None]:
# Exercise 2: Count occurrences in a list of dictionaries
def count_by_field(records: list, field_name: str) -> dict:
    """Count occurrences of values for a specific field across records.

    This is an unimplemented exercise stub: until the TODO is completed the
    function returns None.

    Args:
        records: List of dictionaries to scan.
        field_name: The key whose values should be tallied.

    Returns:
        Intended: a dictionary mapping each distinct value of field_name to
        the number of records in which it appears.
    """
    # TODO: Implement this function
    # NOTE(review): the exercise does not say how to treat records that
    # lack field_name; every record in the test data below has it
    pass

# Test data
sales_records = [
    {"product": "Laptop", "category": "Electronics", "region": "North"},
    {"product": "Mouse", "category": "Electronics", "region": "South"},
    {"product": "Desk", "category": "Furniture", "region": "North"},
    {"product": "Chair", "category": "Furniture", "region": "North"},
    {"product": "Monitor", "category": "Electronics", "region": "South"}
]

# Uncomment the calls below once the function is implemented

# Expected for "category": {"Electronics": 3, "Furniture": 2}
# category_counts = count_by_field(sales_records, "category")
# print(f"Category counts: {category_counts}")

# Expected for "region": {"North": 3, "South": 2}
# region_counts = count_by_field(sales_records, "region")
# print(f"Region counts: {region_counts}")

In [None]:
# Exercise 3: Transform nested data structure
def flatten_nested_dict(nested_dict: dict, separator: str = "_") -> dict:
    """Flatten a nested dictionary by combining keys with a separator.

    This is an unimplemented exercise stub: until the TODO is completed the
    function returns None.

    Example:
        {"user": {"name": "John", "age": 30}}
        -> {"user_name": "John", "user_age": 30}

    Args:
        nested_dict: Dictionary whose values may themselves be dictionaries.
        separator: String placed between an outer key and an inner key.

    Returns:
        Intended: a single-level dictionary with the combined keys.
    """
    # TODO: Implement this function
    # NOTE(review): the example and test data only nest one level deep;
    # whether deeper nesting must be handled is not specified
    pass

# Test data
nested_data = {
    "user": {
        "name": "Alice",
        "age": 30
    },
    "account": {
        "balance": 1500.50,
        "type": "checking"
    }
}

# Expected: {"user_name": "Alice", "user_age": 30, "account_balance": 1500.50, "account_type": "checking"}
# Uncomment the two lines below once the function is implemented
# flattened = flatten_nested_dict(nested_data)
# print(f"Flattened: {flattened}")