# Polars Proc Compare - Missing Values Demo

This notebook demonstrates comparing datasets with missing values using polars_proc_compare.

In [1]:
import polars as pl
import numpy as np
from pathlib import Path
from polars_proc_compare import DataCompare
from polars_proc_compare.data_generator import create_delta_dataset

In [2]:
# Set random seed for reproducibility
np.random.seed(42)

# Generate base dataset
n_rows = 5000
n_cols = 500

# Create a key column (no missing values)
base_data = {"id": range(1, n_rows + 1)}

# Generate random data for other columns with missing values
for i in range(1, n_cols):
    # Generate random values
    values = np.random.normal(loc=100, scale=20, size=n_rows)
    
    # Randomly set some values to None (80% chance of missing for each column)
    mask = np.random.random(n_rows) < 0.8
    values[mask] = None
    
    base_data[f"col_{i}"] = values

# Create base DataFrame
base_df = pl.DataFrame(base_data)

print(f"Base DataFrame shape: {base_df.shape}")
print("\nNull counts for first 5 columns:")
print(base_df.select([pl.col(col).null_count() for col in base_df.columns[:5]]))

Base DataFrame shape: (5000, 500)

Null counts for first 5 columns:
shape: (1, 5)
┌─────┬───────┬───────┬───────┬───────┐
│ id  ┆ col_1 ┆ col_2 ┆ col_3 ┆ col_4 │
│ --- ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ u32 ┆ u32   ┆ u32   ┆ u32   ┆ u32   │
╞═════╪═══════╪═══════╪═══════╪═══════╡
│ 0   ┆ 0     ┆ 0     ┆ 0     ┆ 0     │
└─────┴───────┴───────┴───────┴───────┘


In [3]:
# Create comparison dataset with 5% differences
compare_df, modifications = create_delta_dataset(
    base_df,
    delta_percentage=5.0,
    seed=42,
    exclude_columns=["id"]  # Don't modify the key column
)

# Print summary statistics instead of full dictionary
print("Modification Statistics:")
print(f"Total rows: {modifications['total_rows']}")
print(f"Total columns: {modifications['total_columns']}")
print(f"Modified cells: {modifications['modified_cells']}")
print(f"Modified rows: {modifications['modified_rows']}")
print(f"Number of modified columns: {len(modifications['modified_columns'])}")
# Only print first few modified columns as example
sample_cols = list(modifications['modified_columns'].items())[:5]
print(f"Sample modified columns: {dict(sample_cols)} ...")

print("\nNull counts in compare data (first 5 columns):")
print(compare_df.select([pl.col(col).null_count() for col in compare_df.columns[:5]]))

Modification Statistics:
Total rows: 5000
Total columns: 499
Modified cells: 124750
Modified rows: 5000
Number of modified columns: 499
Sample modified columns: {'col_1': 250, 'col_2': 250, 'col_3': 250, 'col_4': 250, 'col_5': 250} ...

Null counts in compare data (first 5 columns):
shape: (1, 5)
┌─────┬───────┬───────┬───────┬───────┐
│ id  ┆ col_1 ┆ col_2 ┆ col_3 ┆ col_4 │
│ --- ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ u32 ┆ u32   ┆ u32   ┆ u32   ┆ u32   │
╞═════╪═══════╪═══════╪═══════╪═══════╡
│ 0   ┆ 0     ┆ 0     ┆ 0     ┆ 0     │
└─────┴───────┴───────┴───────┴───────┘


In [4]:
# Create results directory if it doesn't exist
results_dir = Path("data")
results_dir.mkdir(exist_ok=True)

# Run comparison
dc = DataCompare(base_df, compare_df, key_columns=["id"])
results = dc.compare()

# Generate reports
results.to_html(results_dir / "missing_values_comparison.html")
results.to_csv(results_dir / "missing_values_differences.csv")

print("\nComparison completed. Check the following files for results:")
print("- data/missing_values_comparison.html")
print("- data/missing_values_differences.csv")


Comparison completed. Check the following files for results:
- data/missing_values_comparison.html
- data/missing_values_differences.csv
