# Polars Proc Compare Example

This notebook demonstrates how to use the Polars Proc Compare library to compare datasets.

In [None]:
import polars as pl
from polars_proc_compare import DataCompare
from polars_proc_compare.data_generator import create_delta_dataset

## Create Sample Datasets

First, let's create a sample dataset, then use `create_delta_dataset` to derive a modified copy of it along with a summary of the modifications that were applied.

In [None]:
# Tunable parameters for the synthetic example. They are named once here so
# the columns built below always share the same length.
N_ROWS = 1000            # number of rows in the base dataset; ids run 1..N_ROWS
DELTA_PERCENTAGE = 5.0   # forwarded to create_delta_dataset as delta_percentage
SEED = 42                # forwarded to create_delta_dataset as seed

# Create base dataset: every column is derived from the same id sequence.
row_ids = range(1, N_ROWS + 1)
base_df = pl.DataFrame({
    "id": row_ids,
    "name": [f"name_{i}" for i in row_ids],
    "value": [float(i) for i in row_ids],
    # Even ids get category "A", odd ids get "B".
    "category": ["A" if i % 2 == 0 else "B" for i in row_ids]
})

# Create comparison dataset with DELTA_PERCENTAGE percent differences.
# "id" is excluded from modification — presumably so it can still serve as
# the join key in the keyed comparison; confirm against create_delta_dataset.
compare_df, modifications = create_delta_dataset(
    base_df,
    delta_percentage=DELTA_PERCENTAGE,
    seed=SEED,
    exclude_columns=["id"]
)

# Show whatever summary the generator returned, one entry per line.
print("Modification Statistics:")
for key, value in modifications.items():
    print(f"{key}: {value}")

## Compare Datasets

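Now let's compare the datasets using `DataCompare`, matching rows on the `id` key column. The cell below also writes an HTML report and a CSV of the differences, then prints the structure comparison and the per-column statistics.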

In [None]:
# Build the comparer over both frames, matching rows on the "id" key column.
dc = DataCompare(
    base_df,
    compare_df,
    key_columns=["id"],
)

# Execute the comparison; everything below reads from this result object.
results = dc.compare()

# Persist the outcome to files in the current working directory.
results.to_html("comparison_report.html")
results.to_csv("differences.csv")

# Structure comparison: one "name: value" line per entry.
print("\nStructure Comparison:")
for field, outcome in results.structure_results.items():
    print(f"{field}: {outcome}")

# Column statistics: a blank-line-separated heading per column, followed by
# its statistics indented by two spaces.
print("\nColumn Statistics:")
for column_name, column_stats in results.statistics.items():
    print(f"\n{column_name}:")
    for label, number in column_stats.items():
        print(f"  {label}: {number}")