In [46]:
# Create sanitized view for data analysis
import os, sys
sys.path.append(os.path.abspath('..'))  # allow importing project root modules

# Import the view creation function
import importlib, create_views
importlib.reload(create_views)
from create_views import create_views_auto

# Connection and base table
conn_str = "mysql+pymysql://root:abhay123@localhost/passionbytes"
base_table_name = "entity_records"  # original table

# Create or replace view with COMPREHENSIVE sanitization
view_suffix = "_view"
created_views = create_views_auto(
    conn_str,
    tables=[base_table_name],
    suffix=view_suffix,
    sanitize_empty_as_null=True  # This now includes comprehensive cleaning
)

# Use the created view for all downstream analysis
your_table_name = f"{base_table_name}{view_suffix}"  # people_data_view
print(f"✅ Created sanitized view: {your_table_name}")
print(f"🧹 Data sanitization includes:")
print(f"   • Removing newlines (\\n), tabs (\\t), carriage returns (\\r)")
print(f"   • Removing percentage signs (%) and other symbols")
print(f"   • Normalizing multiple spaces to single space")
print(f"   • Converting empty strings and null-like tokens to NULL")
print(f"📊 This clean view will be used for all data quality and value distribution analysis")

✅ Created sanitized view: entity_records_view
🧹 Data sanitization includes:
   • Removing newlines (\n), tabs (\t), carriage returns (\r)
   • Removing percentage signs (%) and other symbols
   • Normalizing multiple spaces to single space
   • Converting empty strings and null-like tokens to NULL
📊 This clean view will be used for all data quality and value distribution analysis


In [48]:
from universal_blocking import BlockingFactory
import pandas as pd
import altair as alt
import numpy as np
from sqlalchemy import create_engine

conn_str = "mysql+pymysql://root:abhay123@localhost/passionbytes"
view_name = "entity_records_view"
record_id_col = "RecordID"

# SQL-based blocker
blocker = BlockingFactory.auto_create(
    conn_str=conn_str,
    view_name=view_name,
    record_id_col=record_id_col
)
print("✅ SQLBlocking initialized")


✅ SQLBlocking initialized


In [49]:
# Run all rule counts
stats_df = blocker.run_all_counts()
display(stats_df)

# Auto-select rules (keep between 0.1% and 20% coverage)
selected_rules = stats_df[
    (stats_df["PairsPct"] > 0.1) &
    (stats_df["PairsPct"] < 20)
]["Rule"].tolist()

print("\nAuto-selected rules:", selected_rules)

# Visualize rule coverage
chart = (
    alt.Chart(stats_df)
    .mark_bar()
    .encode(
        x=alt.X("Rule:N", sort="-y"),
        y=alt.Y("PairsPct:Q"),
        tooltip=["Rule", "Pairs", "PairsPct"]
    )
    .properties(width=600, height=400, title="Blocking Rule Coverage")
)
chart.display()


Unnamed: 0,Rule,Pairs,TotalRows,PairsPct
1,Phone,107,138,38.489209
0,Email,86,138,30.935252
2,DOB,85,138,30.57554



Auto-selected rules: []


In [50]:
# Merge candidate pairs from auto-selected rules
pairs_df = blocker.merge_all(rules_to_run=selected_rules, parallel=True)
print("Candidate pairs shape:", pairs_df.shape)
display(pairs_df.head())


Candidate pairs shape: (108, 3)


Unnamed: 0,RecordID1,RecordID2,RulesUsed
0,1,2,Email
1,2,4,Phone
2,6,9,Phone
3,7,8,Phone
4,11,45,Phone


In [51]:
# Use Pandas-based blocker only for clustering
cluster_blocker = BlockingFactory.auto_create(
    df=pairs_df,
    record_id_col="RecordID",
    enhanced=True
)
clusters_df = cluster_blocker.create_clusters(pairs_df, min_cluster_size=2)

print("\n=== Cluster Assignments ===")
display(clusters_df.sort_values("ClusterID"))

# Group records by cluster
cluster_groups = clusters_df.groupby("ClusterID")["RecordID"].apply(list).reset_index()
display(cluster_groups)



=== Cluster Assignments ===


Unnamed: 0,RecordID,ClusterID,ClusterSize
0,1,cluster_0,3
1,2,cluster_0,3
2,4,cluster_0,3
3,9,cluster_1,2
4,6,cluster_1,2
...,...,...,...
26,26,cluster_9,5
22,64,cluster_9,5
23,66,cluster_9,5
24,49,cluster_9,5


Unnamed: 0,ClusterID,RecordID
0,cluster_0,"[1, 2, 4]"
1,cluster_1,"[9, 6]"
2,cluster_10,"[34, 35]"
3,cluster_11,"[61, 46]"
4,cluster_12,"[62, 47]"
5,cluster_13,"[48, 63]"
6,cluster_14,"[65, 50]"
7,cluster_15,"[67, 52]"
8,cluster_16,"[68, 53]"
9,cluster_17,"[69, 54]"


In [60]:
import altair as alt

# --- Step 1: Aggregate counts per Rule + Cluster ---
rule_cluster_counts = (
    merged.groupby(["Rule", "ClusterID"])
    .size()
    .reset_index(name="Count")
)

# --- Step 2: Create grouped bar chart ---
chart = (
    alt.Chart(rule_cluster_counts)
    .mark_bar()
    .encode(
        x=alt.X("Rule:N", title="Blocking Rule"),
        y=alt.Y("Count:Q", title="Record Count"),
        color=alt.Color("ClusterID:N", title="Cluster ID"),
        column=alt.Column("ClusterID:N", title="Cluster ID"),  # group bars by cluster
        tooltip=["Rule", "ClusterID", "Count"]
    )
    .properties(
        width=100,   # width per cluster group
        height=400,
        title="Cluster Distribution by Blocking Rules"
    )
)

chart.display()


In [58]:
print("pairs_df columns:", pairs_df.columns.tolist())
pairs_df.head()


pairs_df columns: ['RecordID1', 'RecordID2', 'RulesUsed']


Unnamed: 0,RecordID1,RecordID2,RulesUsed
0,1,2,Email
1,2,4,Phone
2,6,9,Phone
3,7,8,Phone
4,11,45,Phone


In [65]:
# FIXED VERSION: Create cluster visualizations without the None column error
print("🔧 Creating cluster visualizations (Fixed Version)...")
print("=" * 60)

# Use the existing cluster data we already have from previous cells
if 'clusters_df' in locals() and clusters_df is not None and not clusters_df.empty:
    print("✅ Using existing cluster data for visualization")
    
    # Create cluster size distribution
    cluster_sizes = clusters_df.groupby('ClusterID').size().reset_index(name='Size')
    
    # 1. Cluster Size Distribution Chart
    cluster_size_chart = (
        alt.Chart(cluster_sizes)
        .mark_bar(color='steelblue', opacity=0.8)
        .encode(
            x=alt.X('Size:O', title='Cluster Size (Number of Records)', axis=alt.Axis(labelAngle=0)),
            y=alt.Y('count():Q', title='Number of Clusters'),
            color=alt.Color('count():Q', scale=alt.Scale(scheme='blues'), legend=None),
            tooltip=['Size:O', 'count():Q']
        )
        .properties(
            title={
                "text": "Distribution of Cluster Sizes",
                "subtitle": f"Total Clusters: {len(cluster_sizes)}, Total Records: {len(clusters_df)}",
                "fontSize": 16,
                "subtitleFontSize": 12
            },
            width=500,
            height=350
        )
        .configure_axis(
            labelFontSize=12,
            titleFontSize=14
        )
        .configure_title(
            anchor='start'
        )
    )
    
    print("📊 Cluster Size Distribution Chart:")
    cluster_size_chart
    
    # 2. Top 10 Largest Clusters
    top_clusters = cluster_sizes.nlargest(10, 'Size')
    
    top_clusters_chart = (
        alt.Chart(top_clusters)
        .mark_bar(color='darkgreen', opacity=0.8)
        .encode(
            x=alt.X('Size:Q', title='Number of Records'),
            y=alt.Y('ClusterID:N', title='Cluster ID', sort='-x'),
            color=alt.Color('Size:Q', scale=alt.Scale(scheme='greens'), legend=None),
            tooltip=['ClusterID:N', 'Size:Q']
        )
        .properties(
            title={
                "text": "Top 10 Largest Clusters",
                "subtitle": f"Showing clusters with most similar records",
                "fontSize": 16,
                "subtitleFontSize": 12
            },
            width=600,
            height=400
        )
        .configure_axis(
            labelFontSize=12,
            titleFontSize=14
        )
        .configure_title(
            anchor='start'
        )
    )
    
    print("\\n📊 Top 10 Largest Clusters:")
    top_clusters_chart
    
    # 3. Summary Statistics
    print(f"\\n📈 Cluster Analysis Summary:")
    print(f"  • Total Clusters: {len(cluster_sizes)}")
    print(f"  • Total Records in Clusters: {len(clusters_df)}")
    print(f"  • Average Cluster Size: {cluster_sizes['Size'].mean():.1f} records")
    print(f"  • Largest Cluster: {cluster_sizes['Size'].max()} records")
    print(f"  • Smallest Cluster: {cluster_sizes['Size'].min()} records")
    
    # Show top 5 clusters
    print(f"\\n🏆 Top 5 Largest Clusters:")
    for i, (_, row) in enumerate(top_clusters.head(5).iterrows()):
        print(f"  {i+1}. {row['ClusterID']}: {row['Size']} records")
        
else:
    print("❌ No cluster data available. Please run the blocking pipeline first.")


🔧 Creating cluster visualizations (Fixed Version)...
✅ Using existing cluster data for visualization
📊 Cluster Size Distribution Chart:
\n📊 Top 10 Largest Clusters:
\n📈 Cluster Analysis Summary:
  • Total Clusters: 36
  • Total Records in Clusters: 101
  • Average Cluster Size: 2.8 records
  • Largest Cluster: 5 records
  • Smallest Cluster: 2 records
\n🏆 Top 5 Largest Clusters:
  1. cluster_9: 5 records
  2. cluster_20: 4 records
  3. cluster_21: 4 records
  4. cluster_22: 4 records
  5. cluster_23: 4 records


In [69]:
# FIXED: Create beautiful cluster visualizations without Altair errors
print("🎨 Creating Beautiful Cluster Visualizations (Fixed Version)...")
print("=" * 60)

# Check what data we have available
print("🔍 Available data:")
if 'clusters_df' in locals() and clusters_df is not None:
    print(f"  ✅ clusters_df: {len(clusters_df)} records")
if 'pairs_df' in locals() and pairs_df is not None:
    print(f"  ✅ pairs_df: {len(pairs_df)} records")
if 'stats_df' in locals() and stats_df is not None:
    print(f"  ✅ stats_df: {len(stats_df)} records")

# Create comprehensive visualizations
if 'clusters_df' in locals() and clusters_df is not None and not clusters_df.empty:
    
    # 1. Cluster Size Distribution (Fixed)
    print("\\n📊 Creating Cluster Size Distribution...")
    
    cluster_sizes = clusters_df.groupby('ClusterID').size().reset_index(name='Size')
    
    # Simple and effective cluster size chart
    cluster_size_chart = (
        alt.Chart(cluster_sizes)
        .mark_bar(color='steelblue', opacity=0.8)
        .encode(
            x=alt.X('Size:O', title='Cluster Size (Number of Records)', axis=alt.Axis(labelAngle=0)),
            y=alt.Y('count():Q', title='Number of Clusters'),
            color=alt.Color('count():Q', scale=alt.Scale(scheme='blues'), legend=None),
            tooltip=['Size:O', 'count():Q']
        )
        .properties(
            title={
                "text": "Distribution of Cluster Sizes",
                "subtitle": f"Total Clusters: {len(cluster_sizes)}, Total Records: {len(clusters_df)}",
                "fontSize": 16,
                "subtitleFontSize": 12
            },
            width=500,
            height=350
        )
        .configure_axis(
            labelFontSize=12,
            titleFontSize=14
        )
        .configure_title(
            anchor='start'
        )
    )
    
    print("📊 Cluster Size Distribution Chart:")
    cluster_size_chart
    
    # 2. Top 10 Largest Clusters
    print("\\n🏆 Creating Top 10 Largest Clusters Chart...")
    
    top_clusters = cluster_sizes.nlargest(10, 'Size')
    
    top_clusters_chart = (
        alt.Chart(top_clusters)
        .mark_bar(color='darkgreen', opacity=0.8)
        .encode(
            x=alt.X('Size:Q', title='Number of Records'),
            y=alt.Y('ClusterID:N', title='Cluster ID', sort='-x'),
            color=alt.Color('Size:Q', scale=alt.Scale(scheme='greens'), legend=None),
            tooltip=['ClusterID:N', 'Size:Q']
        )
        .properties(
            title={
                "text": "Top 10 Largest Clusters",
                "subtitle": f"Showing clusters with most similar records",
                "fontSize": 16,
                "subtitleFontSize": 12
            },
            width=600,
            height=400
        )
        .configure_axis(
            labelFontSize=12,
            titleFontSize=14
        )
        .configure_title(
            anchor='start'
        )
    )
    
    print("🏆 Top 10 Largest Clusters Chart:")
    top_clusters_chart
    
    # 3. Cluster Network Visualization
    print("\\n🕸️ Creating Cluster Network Visualization...")
    
    # Create network data showing connections within clusters
    network_data = []
    for cluster_id in clusters_df['ClusterID'].unique():
        cluster_records = clusters_df[clusters_df['ClusterID'] == cluster_id]['RecordID'].tolist()
        cluster_size = len(cluster_records)
        
        # Create connections between all records in the cluster
        for i, record1 in enumerate(cluster_records):
            for j, record2 in enumerate(cluster_records[i+1:], i+1):
                network_data.append({
                    'ClusterID': cluster_id,
                    'Record1': record1,
                    'Record2': record2,
                    'ClusterSize': cluster_size
                })
    
    if network_data:
        network_df = pd.DataFrame(network_data)
        
        network_chart = (
            alt.Chart(network_df)
            .mark_circle(size=100, opacity=0.7)
            .encode(
                x=alt.X('Record1:O', title='Record ID 1'),
                y=alt.Y('Record2:O', title='Record ID 2'),
                color=alt.Color('ClusterSize:O', scale=alt.Scale(scheme='category20'), 
                              title='Cluster Size'),
                size=alt.Size('ClusterSize:O', scale=alt.Scale(range=[50, 200]), legend=None),
                tooltip=['ClusterID:N', 'Record1:O', 'Record2:O', 'ClusterSize:O']
            )
            .properties(
                title={
                    "text": "Cluster Network - Record Connections",
                    "subtitle": f"Each point represents a connection between two records in the same cluster",
                    "fontSize": 16,
                    "subtitleFontSize": 12
                },
                width=600,
                height=500
            )
            .configure_axis(
                labelFontSize=10,
                titleFontSize=12
            )
            .configure_title(
                anchor='start'
            )
        )
        
        print("🕸️ Cluster Network Visualization:")
        network_chart
    
    # 4. Summary Statistics
    print("\\n📈 Cluster Analysis Summary:")
    print(f"  • Total Clusters: {len(cluster_sizes)}")
    print(f"  • Total Records in Clusters: {len(clusters_df)}")
    print(f"  • Average Cluster Size: {cluster_sizes['Size'].mean():.1f} records")
    print(f"  • Largest Cluster: {cluster_sizes['Size'].max()} records")
    print(f"  • Smallest Cluster: {cluster_sizes['Size'].min()} records")
    
    # Show top 5 clusters
    print(f"\\n🏆 Top 5 Largest Clusters:")
    for i, (_, row) in enumerate(top_clusters.head(5).iterrows()):
        print(f"  {i+1}. {row['ClusterID']}: {row['Size']} records")
    
    
    
else:
    print("❌ No cluster data available for visualization")
    
import pandas as pd
import altair as alt

print("🎨 Interactive Cluster Visualizations (Fixed Drill-Down)...")
print("=" * 60)

if 'clusters_df' in locals() and clusters_df is not None and not clusters_df.empty and 'pairs_df' in locals() and pairs_df is not None:

    # --- Prepare cluster sizes ---
    cluster_sizes = clusters_df.groupby('ClusterID').size().reset_index(name='Size')
    top_clusters = cluster_sizes.nlargest(10, 'Size')

    # --- Expand RulesUsed ---
    def expand_rules(df, id_col):
        out = df[[id_col, "RulesUsed"]].copy()
        out["RulesUsed"] = out["RulesUsed"].apply(
            lambda x: x if isinstance(x, list) else str(x).split(",")
        )
        return out.explode("RulesUsed").rename(columns={id_col: "RecordID", "RulesUsed": "Rule"})

    id1, id2 = "RecordID1", "RecordID2"
    pairs_long = pd.concat([
        expand_rules(pairs_df, id1),
        expand_rules(pairs_df, id2)
    ]).drop_duplicates()

    merged = pairs_long.merge(clusters_df, on="RecordID", how="left")

    # --- Selection ---
    cluster_select = alt.selection_single(fields=['ClusterID'], empty="none")

    # --- Chart 1: Top 10 Largest Clusters ---
    top_clusters_chart = (
        alt.Chart(top_clusters)
        .mark_bar(opacity=0.8)
        .encode(
            x=alt.X('Size:Q', title='Number of Records'),
            y=alt.Y('ClusterID:N', title='Cluster ID', sort='-x'),
            tooltip=['ClusterID:N', 'Size:Q'],
            color=alt.condition(cluster_select, alt.value("orange"), alt.value("darkgreen"))
        )
        .properties(
            title="Top 10 Largest Clusters (Click to Inspect)",
            width=400,
            height=350
        )
        .add_selection(cluster_select)
    )

    # --- Chart 2: Drill-down Records with RulesUsed ---
    cluster_records_chart = (
        alt.Chart(merged)
        .mark_bar()
        .encode(
            x=alt.X('Rule:N', title='Blocking Rule'),
            y=alt.Y('RecordID:N', title='Record ID'),
            color=alt.Color('ClusterID:N', legend=None),
            tooltip=['ClusterID:N', 'RecordID:N', 'Rule:N']
        )
        .transform_filter(cluster_select)
        .properties(
            title="Records & Rules in Selected Cluster",
            width=500,
            height=350
        )
    )

    # --- Dashboard ---
    interactive_dashboard = top_clusters_chart | cluster_records_chart
    display(interactive_dashboard)

else:
    print("❌ clusters_df or pairs_df not available for visualization")



🎨 Creating Beautiful Cluster Visualizations (Fixed Version)...
🔍 Available data:
  ✅ clusters_df: 101 records
  ✅ pairs_df: 108 records
  ✅ stats_df: 3 records
\n📊 Creating Cluster Size Distribution...
📊 Cluster Size Distribution Chart:
\n🏆 Creating Top 10 Largest Clusters Chart...
🏆 Top 10 Largest Clusters Chart:
\n🕸️ Creating Cluster Network Visualization...
🕸️ Cluster Network Visualization:
\n📈 Cluster Analysis Summary:
  • Total Clusters: 36
  • Total Records in Clusters: 101
  • Average Cluster Size: 2.8 records
  • Largest Cluster: 5 records
  • Smallest Cluster: 2 records
\n🏆 Top 5 Largest Clusters:
  1. cluster_9: 5 records
  2. cluster_20: 4 records
  3. cluster_21: 4 records
  4. cluster_22: 4 records
  5. cluster_23: 4 records
🎨 Interactive Cluster Visualizations (Fixed Drill-Down)...


In [68]:
import altair as alt
import pandas as pd

print("🎨 Interactive Cluster Visualizations (with Drill-Down)...")
print("=" * 60)

if 'clusters_df' in locals() and clusters_df is not None and not clusters_df.empty and 'pairs_df' in locals() and pairs_df is not None:

    # --- Prepare cluster sizes ---
    cluster_sizes = clusters_df.groupby('ClusterID').size().reset_index(name='Size')
    top_clusters = cluster_sizes.nlargest(10, 'Size')

    # --- Merge rules into clusters ---
    id1, id2 = "RecordID1", "RecordID2"
    pairs_long = pd.concat([
        pairs_df[[id1, "RulesUsed"]].rename(columns={id1: "RecordID"}),
        pairs_df[[id2, "RulesUsed"]].rename(columns={id2: "RecordID"})
    ]).drop_duplicates()

    merged = pairs_long.merge(clusters_df, on="RecordID", how="left")

    # --- Selection ---
    cluster_select = alt.selection_single(fields=['ClusterID'], empty="none")

    # --- Chart 1: Top 10 Largest Clusters ---
    top_clusters_chart = (
        alt.Chart(top_clusters)
        .mark_bar(opacity=0.8)
        .encode(
            x=alt.X('Size:Q', title='Number of Records'),
            y=alt.Y('ClusterID:N', title='Cluster ID', sort='-x'),
            tooltip=['ClusterID:N', 'Size:Q'],
            color=alt.condition(cluster_select, alt.value("orange"), alt.value("darkgreen"))
        )
        .properties(
            title="Top 10 Largest Clusters (Click to Inspect)",
            width=400,
            height=350
        )
        .add_selection(cluster_select)
    )

    # --- Chart 2: Drill-down Records with RulesUsed ---
    cluster_records_chart = (
        alt.Chart(merged)
        .mark_circle(size=80, opacity=0.7)
        .encode(
            x=alt.X('RulesUsed:N', title='Blocking Rule'),
            y=alt.Y('RecordID:N', title='Record ID'),
            color=alt.Color('ClusterID:N', legend=None),
            tooltip=['ClusterID:N', 'RecordID:N', 'RulesUsed:N']
        )
        .transform_filter(cluster_select)
        .properties(
            title="Records & Rules in Selected Cluster",
            width=500,
            height=350
        )
    )

    # --- Dashboard ---
    interactive_dashboard = top_clusters_chart | cluster_records_chart
    display(interactive_dashboard)

else:
    print("❌ clusters_df or pairs_df not available for visualization")


🎨 Interactive Cluster Visualizations (with Drill-Down)...


In [None]:
# import pandas as pd
# import numpy as np
# import itertools
# import altair as alt
# from sqlalchemy import text
# from universal_blocking import jaro_winkler_similarity, levenshtein_similarity

# # ----------------------------
# # Parameters
# # ----------------------------
# sample_blocks_per_rule = 40
# max_members_per_block = 6
# similarity_threshold = 0.82
# min_sampled_pairs_for_rule = 30
# rand_fn = "RAND()" if blocker.dialect.startswith("mysql") else "RANDOM()"

# # ----------------------------
# # Helpers
# # ----------------------------
# def compute_pair_similarity(row1, row2, attr_map):
#     scores, weight_sum = [], 0
#     # Name similarity
#     if "first_name" in attr_map and "last_name" in attr_map:
#         fn1, fn2 = str(row1.get(attr_map["first_name"],"")), str(row2.get(attr_map["first_name"],""))
#         ln1, ln2 = str(row1.get(attr_map["last_name"],"")), str(row2.get(attr_map["last_name"],""))
#         name_sim = (jaro_winkler_similarity(fn1, fn2) + jaro_winkler_similarity(ln1, ln2))/2
#         scores.append((name_sim, 0.6)); weight_sum += 0.6
#     # Email
#     if "email" in attr_map:
#         e1, e2 = str(row1.get(attr_map["email"],"")).lower(), str(row2.get(attr_map["email"],"")).lower()
#         if e1 and e2:
#             scores.append(((1.0 if e1==e2 else jaro_winkler_similarity(e1,e2)), 0.3)); weight_sum += 0.3
#     # Phone
#     if "phone" in attr_map:
#         p1 = ''.join(ch for ch in str(row1.get(attr_map["phone"],"")) if ch.isdigit())
#         p2 = ''.join(ch for ch in str(row2.get(attr_map["phone"],"")) if ch.isdigit())
#         if p1 and p2:
#             scores.append(((1.0 if p1==p2 else levenshtein_similarity(p1,p2)), 0.3)); weight_sum += 0.3
#     return sum(s*w for s,w in scores)/weight_sum if weight_sum else 0.0

# def sample_blocks(key_expr, limit=sample_blocks_per_rule):
#     q = f"""
#         SELECT {key_expr} as blk, COUNT(*)
#         FROM {blocker.view}
#         WHERE {key_expr} IS NOT NULL
#         GROUP BY 1
#         ORDER BY {rand_fn}
#         LIMIT :limit
#     """
#     with blocker.engine.connect() as conn:
#         rows = conn.execute(text(q), {"limit": limit}).fetchall()
#     return [r[0] for r in rows]

# def fetch_block(key_expr, blk_value):
#     q = f"SELECT * FROM {blocker.view} WHERE {key_expr} = :blk LIMIT :lim"
#     with blocker.engine.connect() as conn:
#         df = pd.read_sql(text(q), conn, params={"blk": blk_value, "lim": max_members_per_block})
#     return df

# # ----------------------------
# # Rule scoring
# # ----------------------------
# stats_df = blocker.run_all_counts()
# attr_map = getattr(blocker, "attr_map", {})

# # Build rule → key expression mapping (FIXED: Filter out None values)
# rules_with_keys = {}
# for attr in attr_map.keys():
#     try:
#         expr = blocker._rule_key_expr(attr)  # e.g. LOWER(TRIM(email))
#         # Only include rules with valid expressions (not None)
#         if expr is not None and expr != 'None' and expr != 'null':
#             rules_with_keys[attr] = expr
#     except Exception:
#         continue

# print("Available key expressions:", rules_with_keys)

# # Additional safety check - remove any None values
# rules_with_keys = {k: v for k, v in rules_with_keys.items() if v is not None and v != 'None' and v != 'null'}
# print("Filtered key expressions:", rules_with_keys)

# # Score each rule
# rule_scores = []
# for rule, key_expr in rules_with_keys.items():
#     sims = []
#     for blk in sample_blocks(key_expr):
#         members = fetch_block(key_expr, blk)
#         if len(members) < 2: continue
#         rows = members.to_dict(orient="records")
#         for a, b in itertools.combinations(rows, 2):
#             sims.append(compute_pair_similarity(a, b, attr_map))
#         if len(sims) > 500: break
#     if len(sims) < min_sampled_pairs_for_rule: continue
#     prec_proxy = np.mean([s >= similarity_threshold for s in sims])
#     pairs = int(stats_df.loc[stats_df['Rule']==rule,'Pairs'].iloc[0]) if rule in stats_df['Rule'].values else 0
#     rule_scores.append((rule, prec_proxy, pairs))

# scores_df = pd.DataFrame(rule_scores, columns=["Rule","PrecisionProxy","Pairs"]).sort_values("PrecisionProxy", ascending=False)
# print("Rule scoring:\n", scores_df)

# # ----------------------------
# # Select rules (with safeguard: at least 3)
# # ----------------------------
# selected_rules = scores_df[scores_df["PrecisionProxy"] > 0.5]["Rule"].tolist()
# if len(selected_rules) < 3:
#     selected_rules = scores_df.head(3)["Rule"].tolist()

# print("Selected rules:", selected_rules)

# # ----------------------------
# # Materialize candidate pairs & cluster
# # ----------------------------
# pairs_df = blocker.merge_all(rules_to_run=selected_rules, parallel=True)
# print("Pairs shape:", pairs_df.shape)

# cluster_blocker = BlockingFactory.auto_create(df=pairs_df, record_id_col="RecordID1", enhanced=True)
# clusters_df = cluster_blocker.create_clusters(
#     pairs_df.rename(columns={"RecordID1":"RecordID1","RecordID2":"RecordID2","RulesUsed":"RulesUsed"}),
#     min_cluster_size=2
# )
# print("Clusters:", clusters_df["ClusterID"].nunique())

# # ----------------------------
# # Visualization: Grouped Bar Chart
# # ----------------------------
# id1, id2 = "RecordID1","RecordID2"
# pairs_long = pd.concat([
#     pairs_df[[id1,"RulesUsed"]].rename(columns={id1:"RecordID", "RulesUsed":"Rule"}),
#     pairs_df[[id2,"RulesUsed"]].rename(columns={id2:"RecordID", "RulesUsed":"Rule"})
# ]).drop_duplicates()

# merged = pairs_long.merge(clusters_df, on="RecordID", how="left")
# rule_cluster_counts = merged.groupby(["Rule","ClusterID"]).size().reset_index(name="Count")

# chart = (
#     alt.Chart(rule_cluster_counts)
#     .mark_bar()
#     .encode(
#         x=alt.X("Rule:N", title="Blocking Rule"),
#         y=alt.Y("Count:Q", title="Record Count"),
#         color="ClusterID:N",
#         column="ClusterID:N",
#         tooltip=["Rule","ClusterID","Count"]
#     )
#     .properties(width=100, height=400, title="Cluster Distribution by Blocking Rules")
# )
# chart.display()


Available key expressions: {'last_name': None, 'dob': '`date_of_birth`', 'email': 'LOWER(TRIM(`email`))', 'phone': "REPLACE(REPLACE(REPLACE(TRIM(`phone`),' ',''),'-',''),'(', '')", 'city': None, 'street': None}


OperationalError: (pymysql.err.OperationalError) (1054, "Unknown column 'None' in 'field list'")
[SQL: 
        SELECT None as blk, COUNT(*)
        FROM entity_records_view
        WHERE None IS NOT NULL
        GROUP BY 1
        ORDER BY RAND()
        LIMIT %(limit)s
    ]
[parameters: {'limit': 40}]
(Background on this error at: https://sqlalche.me/e/20/e3q8)