# Polars Proc Compare Example

This notebook demonstrates how to use the Polars Proc Compare library to compare datasets.

In [1]:
import polars as pl
from polars_proc_compare import DataCompare
from polars_proc_compare.data_generator import create_delta_dataset

## Create Sample Datasets

First, let's create a sample dataset and a modified version with known differences.

In [2]:
# Reference frame: 1,000 rows keyed by a sequential integer id, with one
# string, one float and one two-level label column derived from that id.
row_ids = range(1, 1001)
base_df = pl.DataFrame(
    {
        "id": row_ids,
        "name": [f"name_{i}" for i in row_ids],
        "value": list(map(float, row_ids)),
        # Odd ids are labelled "B", even ids "A".
        "category": ["B" if i % 2 else "A" for i in row_ids],
    }
)

# Comparison frame: a copy of the reference frame with a requested 5% of
# differences injected. The seed is pinned (presumably so the same cells are
# altered on every run), and "id" is excluded so the key column is left
# untouched -- NOTE(review): confirm both against create_delta_dataset.
compare_df, modifications = create_delta_dataset(
    base_df, delta_percentage=5.0, seed=42, exclude_columns=["id"]
)

# Echo the summary returned alongside the modified frame, one entry per line.
print("Modification Statistics:")
for key, value in modifications.items():
    print(f"{key}: {value}")

Modification Statistics:
total_rows: 1000
total_columns: 3
total_cells: 3000
modified_cells: 150
modified_columns: {'value': 43, 'category': 58, 'name': 49}
modified_rows: 140


## Compare Datasets

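Now let's compare the datasets using DataCompare, matching rows on the `id` key column. Besides printing the results, this cell writes an HTML report (`comparison_report.html`) and a CSV of the differences (`differences.csv`) to the current working directory.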

In [3]:
# Set up the comparison with "id" as the key column, then execute it.
dc = DataCompare(base_df, compare_df, key_columns=["id"])
results = dc.compare()

# Persist the outcome to the working directory: the HTML report first,
# then the differences as CSV.
html_path, csv_path = "comparison_report.html", "differences.csv"
results.to_html(html_path)
results.to_csv(csv_path)

# Structure-level summary, printed as one "key: value" line per entry.
print("\nStructure Comparison:")
for key, value in results.structure_results.items():
    print(f"{key}: {value}")

# Per-column statistics: a blank line and the column name as a header, then
# each statistic indented by two spaces.
# NOTE(review): values such as first_n_differences are printed in full and
# can produce very long output lines -- consider truncating for display.
print("\nColumn Statistics:")
for col, stats in results.statistics.items():
    print(f"\n{col}:")
    for stat_name, stat_value in stats.items():
        print(f"  {stat_name}: {stat_value}")


Structure Comparison:
common_cols: ['id', 'category', 'name', 'value']
base_only: []
compare_only: []
base_schema: OrderedDict({'id': Int64, 'name': Utf8, 'value': Float64, 'category': Utf8})
compare_schema: OrderedDict({'id': Int64, 'name': Utf8, 'value': Float64, 'category': Utf8})
base_nrows: 1000
compare_nrows: 1000
base_ncols: 4
compare_ncols: 4
variable_types: {'id': 'Int64', 'category': 'Utf8', 'name': 'Utf8', 'value': 'Float64'}
matched_rows: 1000
base_only_rows: 0
compare_only_rows: 0

Column Statistics:

category:
  n_differences: 54
  first_n_differences: [{'obs': 1, 'base': 'B', 'compare': 'B_modified'}, {'obs': 2, 'base': 'B', 'compare': 'B_modified'}, {'obs': 3, 'base': 'B', 'compare': 'B_modified'}, {'obs': 4, 'base': 'B', 'compare': 'B_modified'}, {'obs': 5, 'base': 'A', 'compare': 'A_modified'}, {'obs': 6, 'base': 'B', 'compare': 'B_modified'}, {'obs': 7, 'base': 'A', 'compare': 'A_modified'}, {'obs': 8, 'base': 'B', 'compare': 'B_modified'}, {'obs': 9, 'base': 'B', '