# Parquet File Operations

## Learning Objectives
By the end of this notebook, you will be able to:
- Understand what Parquet files are and their advantages
- Read and write Parquet files using pandas
- Work with Parquet file metadata and schema
- Handle large datasets efficiently with Parquet
- Convert between different file formats (CSV, JSON, Parquet)

## 1. Introduction to Parquet Format

In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
import os
from datetime import datetime, timedelta

# What is Parquet? Print a short overview of the format and how it compares to CSV.
intro_lines = (
    "Apache Parquet is a columnar storage file format that provides:",
    "✓ Efficient compression and encoding",
    "✓ Fast query performance",
    "✓ Schema evolution support",
    "✓ Cross-language compatibility",
    "✓ Optimized for analytics workloads",
    "",  # blank separator line between the two lists
    "Advantages over CSV:",
    "• Smaller file sizes (better compression)",
    "• Faster read/write operations",
    "• Preserves data types",
    "• Supports nested data structures",
    "• Built-in metadata and schema information",
)
for line in intro_lines:
    print(line)

## 2. Creating Sample Data

In [None]:
# Create sample sales data for demonstration
def generate_sample_sales_data(num_records: int = 10000) -> pd.DataFrame:
    """
    Build a reproducible DataFrame of synthetic sales transactions.

    NumPy's global random state is re-seeded with 42 on every call, so the
    same ``num_records`` always produces the same data.

    Args:
        num_records: Number of records to generate

    Returns:
        DataFrame with one row per transaction, holding the randomly drawn
        columns plus the derived ``subtotal``, ``discount_amount`` and
        ``total_amount`` columns
    """
    np.random.seed(42)

    # Pools of values the random draws pick from
    first_day = datetime(2023, 1, 1)
    calendar = [first_day + timedelta(days=offset) for offset in range(365)]
    product_names = ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam', 'Headphones', 'Tablet', 'Phone']
    category_names = ['Electronics', 'Accessories', 'Computing', 'Mobile']
    region_names = ['North', 'South', 'East', 'West', 'Central']

    # The draws below must stay in this order: each one advances the seeded
    # random stream, and the dict's insertion order fixes the column order.
    columns = {}
    columns['transaction_id'] = [f'TXN{number:06d}' for number in range(1, num_records + 1)]
    columns['date'] = np.random.choice(calendar, num_records)
    columns['product'] = np.random.choice(product_names, num_records)
    columns['category'] = np.random.choice(category_names, num_records)
    columns['region'] = np.random.choice(region_names, num_records)
    columns['quantity'] = np.random.randint(1, 10, num_records)
    columns['unit_price'] = np.round(np.random.uniform(10.0, 500.0, num_records), 2)
    columns['discount_percent'] = np.round(np.random.uniform(0.0, 20.0, num_records), 1)
    columns['customer_id'] = [f'CUST{np.random.randint(1, 1000):04d}' for _ in range(num_records)]
    columns['is_online'] = np.random.choice([True, False], num_records)

    frame = pd.DataFrame(columns)

    # Derived monetary columns
    frame['subtotal'] = frame['quantity'] * frame['unit_price']
    discount_fraction = frame['discount_percent'] / 100
    frame['discount_amount'] = frame['subtotal'] * discount_fraction
    frame['total_amount'] = frame['subtotal'] - frame['discount_amount']

    return frame

# Generate sample data
sales_df = generate_sample_sales_data(10000)

print(f"Generated {len(sales_df):,} sales records")
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line to the output.
sales_df.info()
print("\nFirst few records:")
print(sales_df.head())

## 3. Writing Parquet Files

In [None]:
# Basic Parquet writing
def write_parquet_file(df: pd.DataFrame, filename: str, compression: str = 'snappy') -> bool:
    """
    Persist a DataFrame as a Parquet file, leaving out its index.

    Any exception raised by the write is caught, reported on stdout and
    turned into a ``False`` return value instead of propagating.

    Args:
        df: DataFrame to write
        filename: Output filename
        compression: Compression algorithm ('snappy', 'gzip', 'brotli', 'lz4')

    Returns:
        True if successful, False otherwise
    """
    try:
        df.to_parquet(filename, compression=compression, index=False)
    except Exception as err:
        print(f"Error writing Parquet file: {err}")
        return False
    return True

# Write the same data once per compression codec and report each file size
compression_types = ['snappy', 'gzip', 'brotli']

for compression in compression_types:
    filename = f'sales_data_{compression}.parquet'
    success = write_parquet_file(sales_df, filename, compression)
    if not success:
        # write_parquet_file already printed the error
        continue
    file_size = os.path.getsize(filename)
    print(f"{compression.upper()} compression: {filename} ({file_size:,} bytes)")

# CSV baseline for the size comparison
csv_filename = 'sales_data.csv'
sales_df.to_csv(csv_filename, index=False)
csv_size = os.path.getsize(csv_filename)
print(f"\nCSV file: {csv_filename} ({csv_size:,} bytes)")

# Percentage saved by each Parquet file relative to the CSV baseline
print("\nCompression ratios compared to CSV:")
for compression in compression_types:
    filename = f'sales_data_{compression}.parquet'
    if not os.path.exists(filename):
        # Skip files that are not on disk
        continue
    parquet_size = os.path.getsize(filename)
    ratio = (csv_size - parquet_size) / csv_size * 100
    print(f"{compression.upper()}: {ratio:.1f}% smaller than CSV")

## 4. Reading Parquet Files

In [None]:
# Basic Parquet reading
def read_parquet_file(filename: str, columns: Optional[List[str]] = None) -> Optional[pd.DataFrame]:
    """
    Load a Parquet file into a DataFrame, optionally restricted to some columns.

    Any exception raised by the read is caught and reported on stdout;
    the function then returns ``None`` instead of propagating it.

    Args:
        filename: Parquet file to read
        columns: Specific columns to read (None for all)

    Returns:
        DataFrame or None if error
    """
    try:
        return pd.read_parquet(filename, columns=columns)
    except Exception as err:
        print(f"Error reading Parquet file: {err}")
    return None

# Read the entire file
loaded_df = read_parquet_file('sales_data_snappy.parquet')

if loaded_df is not None:
    print(f"Loaded {len(loaded_df):,} records from Parquet file")
    print("\nDataFrame info:")
    # info() writes its report to stdout itself and returns None; printing
    # the return value would add a stray "None" line to the output.
    loaded_df.info()

    # Verify data integrity: compare the round-tripped frame with the original
    print("\nData integrity check:")
    print(f"Original records: {len(sales_df):,}")
    print(f"Loaded records: {len(loaded_df):,}")
    print(f"Data matches: {sales_df.equals(loaded_df)}")

# Read only specific columns (column pruning)
selected_columns = ['transaction_id', 'date', 'product', 'total_amount']
partial_df = read_parquet_file('sales_data_snappy.parquet', columns=selected_columns)

if partial_df is not None:
    print(f"\nLoaded {len(partial_df.columns)} columns: {list(partial_df.columns)}")
    print("Sample data:")
    print(partial_df.head())

## 5. Working with Parquet Metadata

In [None]:
import pyarrow.parquet as pq

# Read Parquet file metadata
def get_parquet_metadata(filename: str) -> Dict[str, Any]:
    """
    Get metadata information from Parquet file.

    Args:
        filename: Parquet file to analyze

    Returns:
        Dictionary with metadata information, or an empty dictionary if the
        file could not be read (the error is printed). Keys:

        - ``num_rows``, ``num_columns``, ``num_row_groups``, ``created_by``:
          taken from the file's metadata
        - ``file_size``: size of the file on disk in bytes
        - ``schema``: string form of the Parquet schema
        - ``columns``: one dictionary per column with ``name``, ``type``,
          ``total_byte_size`` (compressed bytes, summed over all row
          groups), ``uncompressed_byte_size`` and ``compression``
    """
    try:
        parquet_file = pq.ParquetFile(filename)
        metadata = parquet_file.metadata
        schema = parquet_file.schema
        # The Parquet-level schema exposes column names but not Arrow types;
        # those live on the Arrow schema derived from the file.
        arrow_schema = parquet_file.schema_arrow

        info = {
            'num_rows': metadata.num_rows,
            'num_columns': metadata.num_columns,
            'num_row_groups': metadata.num_row_groups,
            'file_size': os.path.getsize(filename),
            'created_by': metadata.created_by,
            'schema': str(schema),
            'columns': []
        }

        # metadata.num_columns counts Parquet leaf columns. Arrow types can
        # only be matched to them by position when every top-level field has
        # exactly one leaf, i.e. when both counts agree.
        types_align = len(arrow_schema.names) == metadata.num_columns

        # Get column information
        for i in range(metadata.num_columns):
            column_schema = schema.column(i)

            # Per-column sizes are stored on each row group's column chunk,
            # so collect the chunk from every row group (may be none).
            chunks = [
                metadata.row_group(group_index).column(i)
                for group_index in range(metadata.num_row_groups)
            ]

            if types_align:
                column_type = str(arrow_schema.types[i])
            else:
                # Nested data: fall back to the physical storage type
                column_type = str(column_schema.physical_type)

            col_info = {
                'name': schema.names[i],
                'type': column_type,
                'total_byte_size': sum(chunk.total_compressed_size for chunk in chunks),
                'uncompressed_byte_size': sum(chunk.total_uncompressed_size for chunk in chunks),
                # Codec is reported from the first row group's chunk
                'compression': str(chunks[0].compression) if chunks else 'UNKNOWN'
            }
            info['columns'].append(col_info)

        return info

    except Exception as e:
        print(f"Error reading metadata: {e}")
        return {}

# Inspect the snappy-compressed file and print a summary of its metadata
metadata = get_parquet_metadata('sales_data_snappy.parquet')

# An empty dict means the metadata could not be read; print nothing then
if metadata:
    print("Parquet File Metadata:")
    print(f"Rows: {metadata['num_rows']:,}")
    print(f"Columns: {metadata['num_columns']}")
    print(f"Row Groups: {metadata['num_row_groups']}")
    print(f"File Size: {metadata['file_size']:,} bytes")
    print(f"Created By: {metadata['created_by']}")

    print("\nColumn Information:")
    for col in metadata['columns']:
        size_and_codec = f"{col['total_byte_size']:,} bytes, {col['compression']}"
        print(f"  {col['name']}: {col['type']} ({size_and_codec})")

## 6. Performance Comparison

In [None]:
import time

# Performance comparison between CSV and Parquet
def _time_reads(reader: Any, path: str, label: str, iterations: int) -> List[float]:
    """Time ``reader(path)`` ``iterations`` times, printing and returning each duration."""
    durations = []
    for i in range(iterations):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump and is too coarse for short reads on some platforms.
        start_time = time.perf_counter()
        reader(path)
        elapsed = time.perf_counter() - start_time
        durations.append(elapsed)
        print(f"{label} read {i+1}: {elapsed:.3f} seconds")
    return durations


def compare_read_performance(csv_file: str, parquet_file: str, iterations: int = 3) -> Dict[str, Any]:
    """
    Compare read performance between CSV and Parquet files.

    Each file is read ``iterations`` times with pandas; every individual
    timing is printed as it is measured.

    Args:
        csv_file: CSV file path
        parquet_file: Parquet file path
        iterations: Number of iterations for timing (must be at least 1)

    Returns:
        Dictionary with timing results:
        ``csv_times`` and ``parquet_times`` (lists of seconds per read),
        ``avg_csv`` and ``avg_parquet`` (mean seconds), and ``speedup``
        (``avg_csv / avg_parquet``, or ``inf`` if the Parquet average is 0)

    Raises:
        ValueError: If ``iterations`` is less than 1
    """
    if iterations < 1:
        # Without at least one run the averages below would divide by zero
        raise ValueError("iterations must be at least 1")

    results: Dict[str, Any] = {
        'csv_times': _time_reads(pd.read_csv, csv_file, "CSV", iterations),
        'parquet_times': _time_reads(pd.read_parquet, parquet_file, "Parquet", iterations),
    }

    # Calculate averages
    avg_csv = sum(results['csv_times']) / len(results['csv_times'])
    avg_parquet = sum(results['parquet_times']) / len(results['parquet_times'])

    results['avg_csv'] = avg_csv
    results['avg_parquet'] = avg_parquet
    # Guard against a zero average so the ratio never raises
    results['speedup'] = avg_csv / avg_parquet if avg_parquet > 0 else float('inf')

    return results

# Time CSV vs Parquet reads of the same data and report the averages
print("Performance Comparison: CSV vs Parquet")
print("=" * 40)

perf_results = compare_read_performance('sales_data.csv', 'sales_data_snappy.parquet')

print("\nResults:")
print(f"Average CSV read time: {perf_results['avg_csv']:.3f} seconds")
print(f"Average Parquet read time: {perf_results['avg_parquet']:.3f} seconds")
print(f"Parquet is {perf_results['speedup']:.1f}x faster than CSV")

# On-disk footprint of the two files, and the relative saving
csv_size = os.path.getsize('sales_data.csv')
parquet_size = os.path.getsize('sales_data_snappy.parquet')
size_reduction = (csv_size - parquet_size) / csv_size * 100

print("\nFile Size Comparison:")
print(f"CSV file: {csv_size:,} bytes")
print(f"Parquet file: {parquet_size:,} bytes")
print(f"Size reduction: {size_reduction:.1f}%")

## 7. File Format Conversion

In [None]:
# Convert between different file formats
def convert_csv_to_parquet(csv_file: str, parquet_file: str, chunk_size: int = 10000) -> bool:
    """
    Convert CSV file to Parquet format in chunks for memory efficiency.

    The CSV is read ``chunk_size`` rows at a time and each chunk is appended
    to the output through a single Parquet writer, so only one chunk is held
    in memory at a time and the output file is written once.

    The schema is taken from the first chunk and later chunks are converted
    to it. If a later chunk cannot be represented in that schema, the
    conversion fails and False is returned.

    Args:
        csv_file: Input CSV file
        parquet_file: Output Parquet file
        chunk_size: Number of rows to process at once

    Returns:
        True if successful, False otherwise (the error is printed)
    """
    try:
        # Imported here so a missing pyarrow is reported like any other
        # conversion error instead of breaking the caller.
        import pyarrow as pa
        import pyarrow.parquet as pq

        writer = None
        schema = None
        try:
            for chunk_df in pd.read_csv(csv_file, chunksize=chunk_size):
                if writer is None:
                    # First chunk fixes the schema and creates the file
                    table = pa.Table.from_pandas(chunk_df, preserve_index=False)
                    schema = table.schema
                    writer = pq.ParquetWriter(parquet_file, schema)
                else:
                    # Later chunks are converted to the first chunk's schema
                    # so every chunk written to the file is consistent
                    table = pa.Table.from_pandas(chunk_df, schema=schema, preserve_index=False)
                writer.write_table(table)
        finally:
            # Closing finalizes the Parquet footer; do it even on failure so
            # the file handle is released.
            if writer is not None:
                writer.close()

        return True

    except Exception as e:
        print(f"Error converting CSV to Parquet: {e}")
        return False

def convert_json_to_parquet(json_file: str, parquet_file: str) -> bool:
    """
    Load a JSON file with pandas and save it as Parquet, without the index.

    Any exception raised while reading or writing is caught, reported on
    stdout and turned into a ``False`` return value.

    Args:
        json_file: Input JSON file
        parquet_file: Output Parquet file

    Returns:
        True if successful, False otherwise
    """
    try:
        pd.read_json(json_file).to_parquet(parquet_file, index=False)
    except Exception as err:
        print(f"Error converting JSON to Parquet: {err}")
        return False
    return True

def convert_parquet_to_csv(parquet_file: str, csv_file: str) -> bool:
    """
    Load a Parquet file with pandas and save it as CSV, without the index.

    Any exception raised while reading or writing is caught, reported on
    stdout and turned into a ``False`` return value.

    Args:
        parquet_file: Input Parquet file
        csv_file: Output CSV file

    Returns:
        True if successful, False otherwise
    """
    try:
        pd.read_parquet(parquet_file).to_csv(csv_file, index=False)
    except Exception as err:
        print(f"Error converting Parquet to CSV: {err}")
        return False
    return True

# Small in-memory product list used as the JSON conversion input
sample_json_data = [
    {"id": 1, "name": "Product A", "price": 29.99, "category": "Electronics"},
    {"id": 2, "name": "Product B", "price": 49.99, "category": "Accessories"},
    {"id": 3, "name": "Product C", "price": 19.99, "category": "Electronics"}
]

# Write the sample data to disk so the converters have a file to read
import json
with open('sample_products.json', 'w') as f:
    json.dump(sample_json_data, f, indent=2)

# Exercise the converters: JSON -> Parquet -> CSV
print("File Format Conversions:")
print("=" * 25)

# JSON to Parquet
json_converted = convert_json_to_parquet('sample_products.json', 'products.parquet')
if json_converted:
    print("✓ JSON to Parquet conversion successful")

    # Read the result back to confirm what was written
    products_df = pd.read_parquet('products.parquet')
    print(f"  Converted {len(products_df)} products")
    print(f"  Columns: {list(products_df.columns)}")

# Parquet to CSV
csv_converted = convert_parquet_to_csv('products.parquet', 'products_from_parquet.csv')
if csv_converted:
    print("✓ Parquet to CSV conversion successful")

    # Compare the on-disk size of the three representations
    json_size = os.path.getsize('sample_products.json')
    parquet_size = os.path.getsize('products.parquet')
    csv_size = os.path.getsize('products_from_parquet.csv')

    print("\nFile sizes:")
    for format_label, format_size in (("JSON", json_size), ("Parquet", parquet_size), ("CSV", csv_size)):
        print(f"  {format_label}: {format_size} bytes")