In [1]:
import os, sys
sys.path.append(os.path.abspath('..'))  # allow importing project root modules

# Import the view creation function
import importlib, create_views
importlib.reload(create_views)
from create_views import create_views_auto

# Connection and base table
conn_str = "mysql+pymysql://root:panoiq@65.1.6.222:30992/ap"
base_table_name = "people_data"  # original table

# Create or replace view with COMPREHENSIVE sanitization
view_suffix = "_view"
created_views = create_views_auto(
    conn_str,
    tables=[base_table_name],
    suffix=view_suffix,
    sanitize_empty_as_null=True  # comprehensive cleaning
)

# Use the created view for all downstream analysis
view_name = f"{base_table_name}{view_suffix}"  # people_data_view
print(f"✅ Created sanitized view: {view_name}")


✅ Created sanitized view: people_data_view


In [2]:
from universal_blocking import BlockingFactory
import pandas as pd

record_id_col = "RecordID"

# SQL-based blocker (works with MySQL connection)
blocker = BlockingFactory.auto_create(
    conn_str=conn_str,
    view_name=view_name,
    record_id_col=record_id_col
)
print("✅ SQLBlocking initialized")

# Run all rule counts
stats_df = blocker.run_all_counts()
display(stats_df)

# Auto-select rules (example thresholds)
selected_rules = stats_df[
    (stats_df["PairsPct"] > 0.1) &
    (stats_df["PairsPct"] < 20)
]["Rule"].tolist()

print("\nAuto-selected rules:", selected_rules)


✅ SQLBlocking initialized


Unnamed: 0,Rule,Pairs,TotalRows,PairsPct
4,DOB,160595642,50578,80.844244
2,Prefix2Name,37964123,50578,19.111234
3,Last_FirstChar,88068,50578,0.044334
1,Phone,368,50578,0.000185
0,Email,6,50578,3e-06



Auto-selected rules: ['Prefix2Name']


In [3]:
# Generate candidate pairs with fast SQL pushdown
try:
    pairs_df = blocker.merge_all_sql(selected_rules)
    print("Candidate pairs shape:", pairs_df.shape)
    display(pairs_df.head())
except Exception as e:
    print(f"❌ Error generating candidate pairs: {e}")
    pairs_df = pd.DataFrame(columns=["RecordID_l", "RecordID_r"])  # Fallback empty DataFrame

🚀 Running merge_all_sql query...
SELECT r1.`RecordID`, r2.`RecordID` FROM people_data_view r1 JOIN people_data_view r2 ON SUBSTR(LOWER(TRIM(r1.`first_and_surname`)),1,2)=SUBSTR(LOWER(TRIM(r2.`first_and_surname`)),1,2) AND SUBSTR(LOWER(TRIM(r1.`full_name`)),1,2)=SUBSTR(LOWER(TRIM(r2.`full_name`)),1,2) WHERE r1.`RecordID`<r2.`RecordID`
Candidate pairs shape: (37964123, 2)


Unnamed: 0,RecordID,RecordID.1
0,2,3
1,3,4
2,2,4
3,4,5
4,3,5


In [4]:
# Use the existing pairs_df from cell [3] (blocks/candidate pairs already created)
# 🔧 Fix column names for clarity (even without clustering)
pairs_df = pairs_df.rename(
    columns={"RecordID_l": "RecordID1", "RecordID_r": "RecordID2"}
)

print("Candidate pairs (blocks) shape:", pairs_df.shape)
display(pairs_df.head())

# Optional: Sample if too large (e.g., for inspection)
# pairs_sample = pairs_df.sample(min(10000, len(pairs_df)))
# print("\nSample of 10k candidate pairs:")
# display(pairs_sample)

# Stop here: Blocks created (no clustering as requested)
print("\n✅ Blocks (candidate pairs) created successfully. No clustering performed.")

Candidate pairs (blocks) shape: (37964123, 2)


Unnamed: 0,RecordID,RecordID.1
0,2,3
1,3,4
2,2,4
3,4,5
4,3,5



✅ Blocks (candidate pairs) created successfully. No clustering performed.


In [5]:
# Use the existing pairs_df from cell [3] (blocks/candidate pairs already created)
# 🔧 Fix column names for clarity
pairs_df = pairs_df.rename(
    columns={"RecordID_l": "RecordID1", "RecordID_r": "RecordID2"}
)

# Display shape and sample for verification
print("Candidate pairs (blocks) shape:", pairs_df.shape)
display(pairs_df.head())

# Save blocks to CSV for Fellegi-Sunter model
output_path = "./candidate_pairs.csv"
try:
    pairs_df.to_csv(output_path, index=False)
    print(f"\n✅ Blocks saved successfully to {output_path} for Fellegi-Sunter model.")
except Exception as e:
    print(f"❌ Error saving blocks to {output_path}: {e}")

# Optional: Save a sample for quick inspection (e.g., 10k rows)
sample_path = "./candidate_pairs_sample.csv"
try:
    pairs_sample = pairs_df.sample(min(10000, len(pairs_df)), random_state=42)
    pairs_sample.to_csv(sample_path, index=False)
    print(f"✅ Sample of 10k candidate pairs saved to {sample_path} for inspection.")
except Exception as e:
    print(f"❌ Error saving sample to {sample_path}: {e}")

# Stop here: Blocks created and saved (no clustering as requested)
print("\n✅ Blocks (candidate pairs) created and saved successfully. Ready for Fellegi-Sunter model.")

Candidate pairs (blocks) shape: (37964123, 2)


Unnamed: 0,RecordID,RecordID.1
0,2,3
1,3,4
2,2,4
3,4,5
4,3,5



✅ Blocks saved successfully to ./candidate_pairs.csv for Fellegi-Sunter model.
✅ Sample of 10k candidate pairs saved to ./candidate_pairs_sample.csv for inspection.

✅ Blocks (candidate pairs) created and saved successfully. Ready for Fellegi-Sunter model.
