In [None]:
import pandas as pd
import polars as pl
import logging
from sqlalchemy import create_engine
from universal_blocking import normalize_date, normalize_time, BlockingFactory
import altair as alt

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

conn_str = "trino://root@3.108.199.0:32092/mysql/ap"
engine = create_engine(conn_str, connect_args={'http_scheme': 'http'})

# Load data from the database
df_pandas = pd.read_sql("SELECT * FROM mysql.ap.people_data LIMIT 1000000", engine)

In [3]:
# Add a unique RecordID for tracking
df_pandas['RecordID'] = range(1, len(df_pandas) + 1)

# Clean string columns by removing unwanted characters and stripping whitespace
string_cols = df_pandas.select_dtypes(include=['object']).columns
df_pandas[string_cols] = df_pandas[string_cols].astype(str).replace(['', 'nan', 'None'], None).apply(lambda x: x.str.strip().str.replace(r'[\\n\\r\\t%]', ' ', regex=True))

# Reduce multiple spaces to a single space in all string columns
df_pandas[string_cols] = df_pandas[string_cols].apply(lambda x: x.str.replace(r' +', ' ', regex=True))

# Normalize date and time columns if they exist
if 'dob' in df_pandas:
    df_pandas['dob'] = df_pandas['dob'].apply(normalize_date)
if 'time' in df_pandas:
    df_pandas['time'] = df_pandas['time'].apply(normalize_time)

# Convert to Polars DataFrame for high-performance operations
df = pl.from_pandas(df_pandas)

# Display the first 5 rows of the processed data
print("Sample:\n", df.head(5))

# Dispose of the database engine connection
engine.dispose()

Sample:
 shape: (5, 16)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ full_name ┆ first_and ┆ first_nam ┆ surname   ┆ … ┆ city      ┆ country   ┆ postal_co ┆ RecordID │
│ ---       ┆ _surname  ┆ e         ┆ ---       ┆   ┆ ---       ┆ ---       ┆ de        ┆ ---      │
│ str       ┆ ---       ┆ ---       ┆ str       ┆   ┆ str       ┆ str       ┆ ---       ┆ i64      │
│           ┆ str       ┆ str       ┆           ┆   ┆           ┆           ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ a hu e    ┆ a hu e    ┆ a hu      ┆ e         ┆ … ┆ STOCKTON- ┆ Uzbekis a ┆ s190sh    ┆ 1        │
│ glefield  ┆ glefield  ┆           ┆ glefield  ┆   ┆ ON-TEES   ┆           ┆           ┆          │
│ a         ┆ a         ┆ a         ┆ null      ┆ … ┆ STOCKTON- ┆ Chi a     ┆ s5 7hb    ┆ 2        │
│           ┆           ┆           ┆           ┆   ┆ ON-TEES   ┆  

In [4]:
# Automatically create a blocker with rules based on the data
blocker = BlockingFactory.auto_create(df=df, conn_str=conn_str, view_name="mysql.ap.people_data", record_id_col="RecordID")

# Run all blocking rules in parallel for efficiency
per_rule_dfs = blocker.run_all(parallel=True, max_workers=8)

# Generate and display a report on the performance of each rule
stats_df, chart = blocker.generate_rule_report(per_rule_dfs, save_html_path="rule_report.html")

if stats_df is not None:
    print("Rule Report:\\n", stats_df)
    chart.display()
else:
    print("No pairs found.")

AttributeError: 'TrinoBlocking' object has no attribute 'attr_map'

In [None]:
# Merge pairs from all rules into a single DataFrame
merged_pairs = blocker.merge_all()

if not merged_pairs.is_empty():
    # Save the identified pairs to a CSV file
    merged_pairs.write("pairs.csv")

    # Count the number of pairs generated by each rule
    counts = merged_pairs.group_by("RulesUsed").len().sort("len", descending=True)

    # Create a bar chart to visualize the results
    chart = alt.Chart(counts.to_pandas()).mark_bar().encode(
        x=alt.X("RulesUsed:N", sort="-y", title="Blocking Rule"),
        y=alt.Y("len:Q", title="Unique Pairs"),
        tooltip=["RulesUsed:N", alt.Tooltip("len:Q", format=",.0f", title="Pairs")],
        color=alt.Color("len:Q", scale=alt.Scale(scheme="blues"))
    ).properties(title="Unique Pairs by Rule", width=600, height=400)

    # Save the chart to an HTML file and display it
    chart.save("pairs_chart.html")
    chart.display()
    print(f"Saved {len(merged_pairs)} pairs to pairs.csv")
else:
    print("No pairs to save.")