Testing if we can do a full implementation of some functions with only pyarrow.

See https://github.com/apache/arrow/issues/30950 for drop_duplicates.

In [None]:
import os
import time
from datetime import datetime, timedelta
from typing import Callable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

def create_test_parquet(filename: str, num_rows: int) -> None:
    """Write a parquet file of synthetic event rows for benchmarking.

    The file has four columns: ``person_id`` (random integers),
    ``start_date`` (random day offsets from 2020-01-01), ``end_date``
    (``start_date`` plus 0-364 days) and ``type_concept`` (the constant
    string ``'test_type'``). The NumPy global RNG is seeded with 42 so the
    same ``num_rows`` always produces the same data.

    Args:
        filename: Path of the parquet file to write.
        num_rows: Number of rows to generate.
    """
    np.random.seed(42)

    # Generate random dates within a 5 * 365 day range
    base_date = datetime(2020, 1, 1)
    random_days = np.random.randint(0, 365 * 5, size=num_rows)
    dates = [base_date + timedelta(days=int(days)) for days in random_days]

    # randint's upper bound is exclusive and must be greater than the lower
    # bound (1); without the clamp any num_rows below 20 raises ValueError.
    person_id_upper = max(2, num_rows // 10)

    df = pd.DataFrame({
        'person_id': np.random.randint(1, person_id_upper, size=num_rows),
        'start_date': dates,
        'end_date': [d + timedelta(days=np.random.randint(0, 365)) for d in dates],
        'type_concept': ['test_type'] * num_rows
    })

    df.to_parquet(filename)

def measure_performance(func: Callable, filename: str) -> tuple[object, float, float]:
    """Run ``func(filename)`` and report its wall-clock time and memory growth.

    Memory is measured as the difference in the process's resident set size
    (RSS) sampled immediately before and after the call. This is a net
    delta, not a true peak: memory allocated and released during the call
    is not captured, and the value can be negative.

    Args:
        func: Callable invoked with ``filename`` as its only argument.
        filename: Value passed through to ``func``.

    Returns:
        A ``(result, execution_time, memory_used)`` tuple: the return value
        of ``func``, elapsed seconds, and RSS growth in MB.
    """
    # Get the current process
    process = psutil.Process()

    # Record starting memory
    start_memory = process.memory_info().rss / 1024 / 1024  # MB

    # perf_counter is monotonic and high resolution, so the measurement is
    # not affected by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    result = func(filename)
    execution_time = time.perf_counter() - start_time

    # Record memory after the call
    end_memory = process.memory_info().rss / 1024 / 1024  # MB
    memory_used = end_memory - start_memory

    return result, execution_time, memory_used

def run_comparison_tests(sizes: list[int]) -> dict:
    """Benchmark the pandas and pyarrow implementations at several data sizes.

    For each size a temporary parquet file ``test_data_{size}.parquet`` is
    created in the working directory, both implementations are timed
    against it, their outputs are compared, and the file is removed again.

    Args:
        sizes: Row counts to generate test data for.

    Returns:
        A dict with keys ``'sizes'``, ``'pandas'`` and ``'pyarrow'``; the
        latter two map to ``{'time': [...], 'memory': [...]}`` lists with
        one entry per size.
    """
    pandas_times = []
    pandas_memory = []
    pyarrow_times = []
    pyarrow_memory = []

    for size in sizes:
        test_file = f'test_data_{size}.parquet'

        # Create test data
        print(f"\nTesting with {size} rows...")
        create_test_parquet(test_file, size)

        try:
            # Test pandas implementation
            pandas_result, pandas_time, pandas_mem = measure_performance(pandas_implementation, test_file)
            pandas_times.append(pandas_time)
            pandas_memory.append(pandas_mem)
            print(f"Pandas: {pandas_time:.2f}s, {pandas_mem:.2f}MB")

            # Test pyarrow implementation
            pyarrow_result, pyarrow_time, pyarrow_mem = measure_performance(pyarrow_implementation, test_file)
            pyarrow_times.append(pyarrow_time)
            pyarrow_memory.append(pyarrow_mem)
            print(f"PyArrow: {pyarrow_time:.2f}s, {pyarrow_mem:.2f}MB")

            # Make sure they are the same. Only a genuine mismatch is
            # reported; anything else (e.g. KeyboardInterrupt) propagates.
            try:
                pd.testing.assert_frame_equal(pandas_result.to_pandas(),
                                              pyarrow_result.to_pandas())
            except AssertionError:
                print("Not equal!")
        finally:
            # Clean up the test file even if an implementation raised
            os.remove(test_file)

    return {
        'sizes': sizes,
        'pandas': {'time': pandas_times, 'memory': pandas_memory},
        'pyarrow': {'time': pyarrow_times, 'memory': pyarrow_memory}
    }

def plot_results(results: dict) -> None:
    """Show side-by-side line charts of runtime and memory per dataset size.

    Args:
        results: Dict with a ``'sizes'`` list and ``'pandas'`` / ``'pyarrow'``
            entries, each holding ``'time'`` and ``'memory'`` lists.
    """
    fig, (time_ax, memory_ax) = plt.subplots(1, 2, figsize=(15, 6))

    tick_labels = [f"{size/1000:.0f}k" for size in results['sizes']]

    # One row per panel: axis, metric key, title, y-axis label.
    panels = (
        (time_ax, 'time', 'Execution Time Comparison', 'Time (seconds)'),
        (memory_ax, 'memory', 'Memory Usage Comparison', 'Memory (MB)'),
    )
    series = (('pandas', 'Pandas'), ('pyarrow', 'PyArrow'))

    for axis, metric, title, y_label in panels:
        for key, legend_label in series:
            axis.plot(tick_labels, results[key][metric], marker='o', label=legend_label)
        axis.set_title(title)
        axis.set_xlabel('Dataset Size (rows)')
        axis.set_ylabel(y_label)
        axis.legend()
        axis.grid(True)

    plt.tight_layout()
    plt.show()

# Copy both implementations here
def pandas_implementation(filename: str) -> pa.Table:
    """Reference implementation that reshapes and de-duplicates with pandas.

    Every non-null start and end date becomes its own row whose
    ``start_date`` and ``end_date`` both equal that date; duplicate rows
    are then dropped. ``type_concept`` is taken from the first input row.

    Args:
        filename: Path of the parquet file to read.

    Returns:
        The de-duplicated events as a pyarrow Table.
    """
    wanted_columns = ["person_id", "start_date", "end_date", "type_concept"]
    events = pd.read_parquet(filename, columns=wanted_columns)

    concept = events["type_concept"].iloc[0]

    # Stack start_date and end_date into a single "event_date" column.
    date_columns = events[["person_id", "start_date", "end_date"]]
    long_form = date_columns.melt(id_vars=["person_id"], value_name="event_date")
    long_form = long_form.dropna()
    long_form = long_form[["person_id", "event_date"]]

    event_dates = long_form["event_date"]
    output = pd.DataFrame({
        "person_id": long_form["person_id"],
        "start_date": event_dates,
        "end_date": event_dates,
        "type_concept": concept,
    })

    deduplicated = output.drop_duplicates(ignore_index=True)
    return pa.Table.from_pandas(deduplicated, preserve_index=False)

def pyarrow_implementation(filename: str) -> pa.Table:
    """Pure PyArrow implementation of the reshape-and-de-duplicate pipeline.

    Every non-null start and end date becomes its own row whose
    ``start_date`` and ``end_date`` both equal that date; duplicate rows
    are then removed. ``type_concept`` is taken from the first input row.

    Args:
        filename: Path of the parquet file to read.

    Returns:
        A Table with columns ``person_id``, ``start_date``, ``end_date``
        and ``type_concept`` containing one row per distinct combination.
    """
    # Use the explicitly imported pyarrow.parquet module: `pa.parquet` is only
    # available as an attribute if something else has already imported it.
    table = pq.read_table(
        filename,
        columns=['person_id', 'start_date', 'end_date', 'type_concept']
    )

    type_concept = table['type_concept'][0].as_py()

    # Stack (person_id, start_date) on top of (person_id, end_date) under a
    # shared schema so both kinds of date end up in one column.
    start_dates = table.select(['person_id', 'start_date'])
    end_dates = table.select(['person_id', 'end_date']).rename_columns(
        ['person_id', 'start_date']
    )
    combined = pa.concat_tables([start_dates, end_dates])

    # Drop rows whose date is null
    combined = combined.filter(pc.is_valid(combined['start_date']))

    combined = combined.append_column('end_date', combined['start_date'])
    # pa.repeat builds the constant column directly instead of going through
    # a Python list with one element per row.
    combined = combined.append_column(
        'type_concept', pa.repeat(type_concept, len(combined))
    )

    # Remove duplicates: grouping on every column with no aggregations
    # leaves exactly one row per distinct combination.
    # NOTE(review): the row order of the grouped result may differ from
    # pandas' drop_duplicates order - confirm before comparing row-by-row.
    return combined.group_by(
        ['person_id', 'start_date', 'end_date', 'type_concept']
    ).aggregate([])


In [None]:
# Benchmark both implementations from 1k up to 1M rows, then chart the outcome
sizes = [1_000, 10_000, 100_000, 1_000_000]
results = run_comparison_tests(sizes)
plot_results(results)

In [None]:
# Small 20-row smoke test: run both implementations once and print the
# schema/dtype summary of each result for a manual side-by-side comparison.
test_file = 'test.parquet'
create_test_parquet(test_file, 20)
try:
    pandas_result, pandas_time, pandas_mem = measure_performance(pandas_implementation, test_file)
    pandas_result.to_pandas().info()
    pyarrow_result, pyarrow_time, pyarrow_mem = measure_performance(pyarrow_implementation, test_file)
    pyarrow_result.to_pandas().info()
finally:
    # Remove the scratch file even if one of the implementations raises
    os.remove(test_file)

In [None]:
# Raises AssertionError if the two implementations produced different frames
pd.testing.assert_frame_equal(
    left=pandas_result.to_pandas(),
    right=pyarrow_result.to_pandas(),
)