# Zip and Unpacking

## Learning Objectives
By the end of this notebook, you will be able to:
- Use the zip() function to combine multiple iterables
- Unpack sequences using the * operator
- Apply zip and unpacking for data alignment and transformation
- Work with parallel data structures effectively
- Use these techniques in real-world data engineering scenarios

## 1. Basic Zip Operations

In [None]:
# Two parallel lists: names[i] and ages[i] describe the same person
names = ["Alice", "Bob", "Charlie", "Diana"]
ages = [25, 30, 35, 28]

# zip() is lazy, so expand it into a list of (name, age) tuples
name_age_pairs = [*zip(names, ages)]
print(f"Names: {names}")
print(f"Ages: {ages}")
print(f"Name-Age pairs: {name_age_pairs}")

# Walk the pairs, unpacking each tuple in the loop header
print("\nIterating through zipped data:")
for name, age in name_age_pairs:
    print(f"{name} is {age} years old")

# zip() takes any number of iterables and yields one tuple per position
cities = ["New York", "London", "Tokyo", "Paris"]
salaries = [75000, 65000, 80000, 70000]

employee_data = [*zip(names, ages, cities, salaries)]
print(f"\nComplete employee data: {employee_data}")

# A sequence of 2-tuples is exactly what a dict can be built from
name_to_age = dict(name_age_pairs)
name_to_city = {person: city for person, city in zip(names, cities)}

print(f"\nName to age mapping: {name_to_age}")
print(f"Name to city mapping: {name_to_city}")

## 2. Zip with Different Length Iterables

In [None]:
# zip_longest is the padding counterpart of zip(); imported up front for this cell
from itertools import zip_longest

# zip() ends as soon as its shortest input runs out
short_list = ["A", "B", "C"]
long_list = [1, 2, 3, 4, 5, 6]

zipped_short = [*zip(short_list, long_list)]
print(f"Short list: {short_list}")
print(f"Long list: {long_list}")
print(f"Zipped result: {zipped_short}")
print(f"Note: Zip stopped at length {len(zipped_short)} (shortest list)")

# zip_longest() runs to the longest input and pads the gaps with fillvalue
zipped_longest = [*zip_longest(short_list, long_list, fillvalue="N/A")]
print(f"\nUsing zip_longest with fillvalue: {zipped_longest}")

# Practical example: columns of unequal length
product_names = ["Laptop", "Mouse", "Keyboard"]
product_prices = [999.99, 29.99]  # no price recorded for the keyboard
product_categories = ["Electronics", "Electronics", "Electronics", "Accessories"]

# Truncating to the shortest column keeps only rows that have every field
complete_products = [
    (product, price, category)
    for product, price, category in zip(product_names, product_prices, product_categories)
]
print(f"\nComplete product records: {complete_products}")
print(f"Records processed: {len(complete_products)} out of {len(product_names)} products")

## 3. Unpacking Sequences

In [None]:
# Basic unpacking: one target name per element of the tuple
coordinates = (10.5, 20.3)
x, y = coordinates
print(f"Coordinates: {coordinates}")
print(f"X: {x}, Y: {y}")

# Unpacking lists works the same way as unpacking tuples
rgb_color = [255, 128, 64]
red, green, blue = rgb_color
print(f"\nRGB color: {rgb_color}")
print(f"Red: {red}, Green: {green}, Blue: {blue}")

# Unpacking with * operator (rest of elements)
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Get first, last, and middle elements; the starred name always receives a list
first, *middle, last = numbers
print(f"\nNumbers: {numbers}")
print(f"First: {first}")
print(f"Middle: {middle}")
print(f"Last: {last}")

# Get first two and rest by slicing.
# Plain targets are required here: a starred target (`*rest`) would collect the
# second slice into another list, giving [[3, ..., 10]] instead of [3, ..., 10].
first_two, rest = numbers[:2], numbers[2:]
# Alternative syntax: starred unpacking applied directly to the sequence
a, b, *remaining = numbers
print(f"\nFirst two: {a}, {b}")
print(f"Remaining: {remaining}")

# Unpacking in function calls
def calculate_total(item1, item2, item3):
    """Return the three items added together, left to right."""
    subtotal = item1 + item2
    return subtotal + item3

# calculate_total takes three positional parameters, so the list holds exactly three values
prices = [29.99, 15.50, 8.75]
total = calculate_total(*prices)  # *prices spreads the list into item1, item2, item3
print(f"\nPrices: {prices}")
print(f"Total: ${total:.2f}")  # :.2f shows the total with two decimal places

# Unpacking dictionaries in function calls
def create_user_profile(name, age, city):
    """Build a profile dict from the given fields plus a fixed creation date."""
    profile = {"name": name, "age": age, "city": city}
    # The creation date is a constant in this example, not the current date
    profile["created_at"] = "2024-01-15"
    return profile

# The dict keys match create_user_profile's parameter names (name, age, city)
user_data = {"name": "Alice", "age": 25, "city": "New York"}
profile = create_user_profile(**user_data)  # **user_data passes each key/value pair as a keyword argument
print(f"\nUser data: {user_data}")
print(f"Profile: {profile}")

## 4. Zip and Unpack Together

In [None]:
# Row-oriented records: one (name, score, grade) tuple per student
student_data = [
    ("Alice", 85, "A"),
    ("Bob", 92, "A"),
    ("Charlie", 78, "B"),
    ("Diana", 96, "A"),
]

print(f"Student data (rows): {student_data}")

# zip(*rows) hands every row to zip as its own argument, so zip yields columns
names, scores, grades = zip(*student_data)
print(f"\nTransposed data (columns):")
print(f"Names: {names}")
print(f"Scores: {scores}")
print(f"Grades: {grades}")

# The columns arrive as tuples; copy them into lists so they can be modified
names_list, scores_list, grades_list = [*names], [*scores], [*grades]

# Column-wise statistics on the score column
max_score, min_score = max(scores_list), min(scores_list)
average_score = sum(scores_list) / len(scores_list)

print(f"\nScore statistics:")
print(f"Average: {average_score:.2f}")
print(f"Maximum: {max_score}")
print(f"Minimum: {min_score}")

# A 3x3 matrix stored as a list of rows
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

print(f"\nOriginal matrix:")
for row in matrix:
    print(row)

# Transposing swaps rows and columns; each row of the result is a tuple
transposed = [*zip(*matrix)]
print(f"\nTransposed matrix:")
for row in transposed:
    print(list(row))

# The rows of the transpose are the columns of the original matrix
col1, col2, col3 = transposed
print(f"\nColumn 1: {col1}")
print(f"Column 2: {col2}")
print(f"Column 3: {col3}")

## 5. Practical Data Engineering Examples

In [None]:
# Example 1: Processing CSV-like data
# A header row plus data rows, as a CSV reader would deliver them
csv_headers = ["product_id", "name", "price", "quantity", "category"]
csv_rows = [
    ["PRD001", "Laptop", 999.99, 10, "Electronics"],
    ["PRD002", "Mouse", 29.99, 50, "Electronics"],
    ["PRD003", "Desk", 299.99, 5, "Furniture"],
    ["PRD004", "Chair", 199.99, 8, "Furniture"],
]

print(f"CSV Headers: {csv_headers}")
print(f"CSV Rows: {csv_rows}")

# Pair each header with the value in the same position: one dict per row
products = [
    {header: value for header, value in zip(csv_headers, row)}
    for row in csv_rows
]

print(f"\nConverted to dictionaries:")
for product in products:
    print(f"  {product}")

# Transpose the rows so that each column can be bound to its own name
product_ids, names, prices, quantities, categories = zip(*csv_rows)

print(f"\nExtracted columns:")
print(f"Product IDs: {product_ids}")
print(f"Names: {names}")
print(f"Prices: {prices}")

# Inventory value per product is unit price times units in stock
total_values = [
    unit_price * units
    for unit_price, units in zip(prices, quantities)
]
total_inventory_value = sum(total_values)

print(f"\nTotal values per product: {total_values}")
print(f"Total inventory value: ${total_inventory_value:,.2f}")

# Example 2: Aligning time series data
# Four parallel series; position i in each belongs to dates[i]
dates = ["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"]
temperatures = [22.5, 24.1, 23.8, 25.2]
humidity = [65.2, 63.8, 64.5, 62.1]
pressure = [1013.2, 1015.1, 1012.8, 1016.3]

# One (date, temperature, humidity, pressure) tuple per day
weather_data = [*zip(dates, temperatures, humidity, pressure)]

print(f"\nWeather data:")
print(f"Date\t\tTemp\tHumidity\tPressure")
for date, temp, hum, press in weather_data:
    print(f"{date}\t{temp}°C\t{hum}%\t\t{press} hPa")

# Illustrative comfort index: temperature, minus a tenth of the humidity,
# plus a tenth of the pressure offset from 1013 hPa
comfort_indices = [
    t - (h / 10) + (p - 1013) / 10
    for t, h, p in zip(temperatures, humidity, pressure)
]

print(f"\nDaily comfort indices: {[f'{idx:.2f}' for idx in comfort_indices]}")

# Example 3: Data validation across multiple sources
# Five parallel sources describing the same users
user_ids = ["U001", "U002", "U003", "U004"]
user_names = ["Alice", "Bob", "Charlie", "Diana"]
user_emails = ["alice@email.com", "bob@email.com", "charlie@email.com", "diana@email.com"]
user_ages = [25, 30, 35, 28]
user_active = [True, True, False, True]

# zip() would silently drop rows if the sources differed in length,
# so compare the lengths before combining anything
data_sources = [user_ids, user_names, user_emails, user_ages, user_active]
lengths = list(map(len, data_sources))

if len(set(lengths)) != 1:
    print(f"\nData validation: Mismatched lengths {lengths} ✗")
else:
    print(f"\nData validation: All sources have {lengths[0]} records ✓")

    # One dict per user: the field names are paired with each zipped record
    complete_users = [
        dict(zip(("user_id", "name", "email", "age", "is_active"), record))
        for record in zip(user_ids, user_names, user_emails, user_ages, user_active)
    ]

    print(f"\nComplete user records:")
    for user in complete_users:
        status = "Active" if user["is_active"] else "Inactive"
        print(f"  {user['user_id']}: {user['name']} ({user['age']}) - {status}")

    # Keep only the records flagged as active
    active_users = [record for record in complete_users if record["is_active"]]

    print(f"\nActive users: {len(active_users)} out of {len(complete_users)}")

## 6. Advanced Zip and Unpacking Techniques

In [None]:
# enumerate() around zip() gives a running index next to each (product, price) pair
products = ["Laptop", "Mouse", "Keyboard", "Monitor"]
prices = [999.99, 29.99, 79.99, 299.99]

print("Indexed product processing:")
for index, (product, price) in enumerate(zip(products, prices)):
    print(f"{index + 1}. {product}: ${price:.2f}")

# Numbering a pair of parallel lists
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun"]
sales = [15000, 18000, 22000, 19000, 25000, 21000]

# Attach a 1-based month number to every (month, sales) pair
monthly_sales = [
    (number, label, amount)
    for number, (label, amount) in enumerate(zip(months, sales), start=1)
]

print(f"\nMonthly sales data:")
for month_num, month_name, sale_amount in monthly_sales:
    print(f"Month {month_num} ({month_name}): ${sale_amount:,}")

# Grouping data using zip
def group_data_by_pairs(data):
    """Group consecutive elements of ``data`` into 2-tuples.

    Args:
        data: Any iterable (list, tuple, string, generator, ...).

    Returns:
        A list of tuples ``[(e0, e1), (e2, e3), ...]`` in input order.
        When the input has an odd number of elements, the last element has
        no partner and is left out.
    """
    # Handing the *same* iterator to zip() twice makes each output tuple pull
    # two consecutive items, so no slicing support is needed on ``data``.
    items = iter(data)
    return list(zip(items, items))

# Eight values, an even count, so every element lands in a pair
numbers = [1, 2, 3, 4, 5, 6, 7, 8]
pairs = group_data_by_pairs(numbers)

print(f"\nOriginal numbers: {numbers}")
print(f"Grouped pairs: {pairs}")

# Calculate pair sums by unpacking each 2-tuple inside the comprehension
pair_sums = [a + b for a, b in pairs]
print(f"Pair sums: {pair_sums}")

# Rotating/shifting data using zip
def rotate_lists(list1, list2, shift=1):
    """Rotate two sequences to the left by ``shift`` positions.

    Args:
        list1: First sequence (must support ``len``, slicing and ``+``).
        list2: Second sequence, rotated by the same amount.
        shift: Number of positions to move elements towards the front.
            Negative values rotate to the right. Values beyond the length
            of a sequence wrap around (a shift of ``len + 1`` equals ``1``).

    Returns:
        A ``(rotated1, rotated2)`` tuple of new sequences; the inputs are
        not modified.
    """
    def _rotate(items):
        # Rotate a single sequence; the offset is reduced modulo its length.
        size = len(items)
        if size == 0:
            # Nothing to rotate, and the modulo below would divide by zero
            return items[:]
        offset = shift % size
        return items[offset:] + items[:offset]

    return _rotate(list1), _rotate(list2)

# Parallel lists: tasks[i] is scheduled on days[i]
days = ["Mon", "Tue", "Wed", "Thu", "Fri"]
tasks = ["Meeting", "Code", "Review", "Test", "Deploy"]

print(f"\nOriginal schedule:")
for day, task in zip(days, tasks):
    print(f"{day}: {task}")

# Rotate by 2 positions; both lists get the same shift, so every day keeps its task
rotated_days, rotated_tasks = rotate_lists(days, tasks, 2)

print(f"\nRotated schedule (shift by 2):")
for day, task in zip(rotated_days, rotated_tasks):
    print(f"{day}: {task}")

# Merging sorted sequences
def merge_sorted_sequences(*sequences):
    """Combine any number of sequences into a single ascending list.

    All elements are pooled and then sorted, so the result is ordered
    whether or not the individual inputs were.
    """
    return sorted(element for sequence in sequences for element in sequence)

# Three ascending inputs whose values interleave (1, 2, 3, 4, ...)
seq1 = [1, 4, 7, 10]
seq2 = [2, 5, 8, 11]
seq3 = [3, 6, 9, 12]

# Each list is passed as a separate positional argument and collected by *sequences
merged = merge_sorted_sequences(seq1, seq2, seq3)

print(f"\nSequence 1: {seq1}")
print(f"Sequence 2: {seq2}")
print(f"Sequence 3: {seq3}")
print(f"Merged: {merged}")

## Practice Exercises

Complete the following exercises to reinforce your understanding:

In [None]:
# Exercise 1: Data alignment and processing
# Four parallel lists with five entries each: index i describes one product.
# NOTE: product_names is also assigned in an earlier cell; running this cell rebinds it.
product_codes = ["A001", "B002", "C003", "D004", "E005"]
product_names = ["Widget A", "Gadget B", "Tool C", "Device D", "Item E"]
unit_costs = [10.50, 25.75, 15.25, 30.00, 8.95]
quantities = [100, 50, 75, 25, 200]

# TODO: Use zip to create a list of dictionaries with complete product information
# Each dictionary should have keys: 'code', 'name', 'unit_cost', 'quantity', 'total_value'
# Calculate total_value as unit_cost * quantity

# Replace the placeholder, then uncomment both lines:
# complete_products = [your code here]
# print(f"Complete products: {complete_products}")

# TODO: Use zip and unpacking to extract all unit costs and calculate average
# Replace the placeholder, then uncomment both lines:
# avg_unit_cost = [your code here]
# print(f"Average unit cost: ${avg_unit_cost:.2f}")

In [None]:
# Exercise 2: Matrix operations with zip and unpacking
# Rows are products, columns are quarters (3 rows x 4 columns)
sales_matrix = [
    [1000, 1200, 1100, 1300],  # Q1, Q2, Q3, Q4 for Product A
    [800, 900, 950, 1000],     # Q1, Q2, Q3, Q4 for Product B
    [1500, 1600, 1400, 1700]   # Q1, Q2, Q3, Q4 for Product C
]

# Labels for the rows and the columns of sales_matrix, in the same order
product_names = ["Product A", "Product B", "Product C"]
quarters = ["Q1", "Q2", "Q3", "Q4"]

print(f"Sales matrix: {sales_matrix}")

# TODO: Use zip and unpacking to transpose the matrix and calculate quarterly totals
# quarterly_totals should be a list of total sales for each quarter across all products
# (four numbers, one per entry in quarters)

# quarterly_totals = [your code here]
# print(f"Quarterly totals: {quarterly_totals}")

# TODO: Create a dictionary mapping quarter names to their totals
# quarter_summary = [your code here]
# print(f"Quarter summary: {quarter_summary}")

# TODO: Find the best performing quarter
# best_quarter = [your code here]
# print(f"Best performing quarter: {best_quarter}")

In [None]:
# Exercise 3: Data synchronization and validation
# The series are deliberately uneven: 5 timestamps, 5 / 4 / 6 sensor readings
timestamps = ["10:00", "10:15", "10:30", "10:45", "11:00"]
sensor_a_readings = [23.5, 24.1, 23.8, 24.5, 24.2]
sensor_b_readings = [65.2, 63.8, 64.5, 62.1]  # Missing one reading
sensor_c_readings = [1013.2, 1015.1, 1012.8, 1016.3, 1014.5, 1017.2]  # Extra readings

# TODO: Create a function that synchronizes all sensor data
# It should only include timestamps where all sensors have readings
# Return a list of tuples: (timestamp, sensor_a, sensor_b, sensor_c)

def synchronize_sensor_data(timestamps, sensor_a, sensor_b, sensor_c):
    """Exercise stub: align sensor readings with their timestamps.

    Intended contract (not implemented yet): return a list of
    ``(timestamp, sensor_a, sensor_b, sensor_c)`` tuples that contains only
    the positions for which every input has a value.

    As written the body is ``pass``, so the function returns ``None``.
    """
    # TODO: Implement this function using zip
    pass

# synchronized_data = synchronize_sensor_data(timestamps, sensor_a_readings, sensor_b_readings, sensor_c_readings)
# print(f"Synchronized data: {synchronized_data}")
# Expected: Should have 4 complete readings (limited by sensor_b which has 4 readings)

In [None]:
# Exercise 4: Advanced data transformation
# One tuple per employee; the field order is given in the format note below.
# NOTE: employee_data is also assigned in an earlier cell; running this cell rebinds it.
employee_data = [
    ("Alice", "Engineering", 5, 75000),
    ("Bob", "Marketing", 3, 65000),
    ("Charlie", "Engineering", 7, 85000),
    ("Diana", "Sales", 4, 70000),
    ("Eve", "Marketing", 6, 72000)
]

# Data format: (name, department, years_experience, salary)

# TODO: Use unpacking to separate the data into individual lists
# (zip(*rows) yields tuples; wrap each in list() if a mutable list is needed)
# names, departments, experience, salaries = [your code here]

# TODO: Calculate department averages using zip and the separated data
# Create a dictionary mapping department names to average salaries
# Hint: You'll need to group by department first

# dept_averages = [your code here]
# print(f"Department salary averages: {dept_averages}")

# TODO: Create a new list of tuples that includes a bonus calculation
# Bonus = 1000 * years_experience
# New format: (name, department, salary, bonus, total_compensation)

# enhanced_data = [your code here]
# print(f"Enhanced employee data: {enhanced_data}")