In [1]:
import os
import pandas as pd
import polars as pl
import logging
from sqlalchemy import create_engine, text
import trino
import gc

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Trino Connection Configuration (dynamic for multiple catalogs).
# Each setting can be overridden with an environment variable of the same name,
# so the host/user do not have to be edited into the notebook; the literals
# below are only the defaults used when the variable is not set.
TRINO_HOST = os.environ.get("TRINO_HOST", "3.108.199.0")
TRINO_PORT = int(os.environ.get("TRINO_PORT", "32092"))  # kept as int, as before
TRINO_USER = os.environ.get("TRINO_USER", "root")
TRINO_CATALOG = os.environ.get("TRINO_CATALOG", "hive")  # Change to hive, mongo, pinot, postgresql as needed
TRINO_SCHEMA = os.environ.get("TRINO_SCHEMA", "default")  # Update per catalog (e.g., default, db, public)
TRINO_TABLE = os.environ.get("TRINO_TABLE", "employees")  # Update per catalog (e.g., collection for mongo)

# Construct SQLAlchemy connection string
conn_str = f"trino://{TRINO_USER}@{TRINO_HOST}:{TRINO_PORT}/{TRINO_CATALOG}/{TRINO_SCHEMA}"

def get_trino_connection():
    """Create a SQLAlchemy engine for Trino.

    The engine is built from the module-level ``conn_str`` with plain HTTP and
    no authentication. ``create_engine`` is lazy: it does not open a network
    connection, so a successful return does not prove the server is reachable;
    the first query executed through the engine does that.

    Returns:
        The SQLAlchemy engine created for ``conn_str``.

    Raises:
        Exception: whatever ``create_engine`` raises, re-raised unchanged after
            being logged.
    """
    try:
        engine = create_engine(
            conn_str,
            connect_args={'http_scheme': 'http', 'auth': None}  # Use 'https' if SSL enabled
        )
    except Exception as e:
        logging.error(f"Failed to create Trino engine: {str(e)}")
        raise
    # Say "created", not "connected": nothing has been sent to the server yet.
    logging.info(f"Created Trino engine for catalog {TRINO_CATALOG} (connection opens on first query)")
    return engine

# Simplified query to fetch raw data (no SQL sanitization to avoid Trino issues).
# The catalog/schema/table names are interpolated verbatim, so they must only
# ever come from trusted configuration.
logging.info(f"Fetching raw data from {TRINO_CATALOG}.{TRINO_SCHEMA}.{TRINO_TABLE}")
base_query = f"""
SELECT *
FROM {TRINO_CATALOG}.{TRINO_SCHEMA}.{TRINO_TABLE}
LIMIT 1000000  -- Remove for full dataset; keep for testing
"""


def _sanitize_frame(frame):
    """Add a RecordID and clean the text columns of ``frame`` in place.

    Steps, in order:
      * add a 1-based ``RecordID`` column when the frame has none;
      * for every object-dtype column except ``RecordID``: cast to ``str``,
        replace newline, carriage return, tab and ``%`` with a space, trim the
        result, then map ``''``, ``'nan'`` and ``'None'`` to ``None``;
      * parse ``dob`` (when present) as day-first dates, coercing values that
        cannot be parsed.

    Only the fetched pandas copy is modified; the source table is untouched.

    Args:
        frame: pandas DataFrame as fetched from Trino.

    Returns:
        The same DataFrame object, modified in place.
    """
    if 'RecordID' not in frame.columns:
        frame['RecordID'] = range(1, len(frame) + 1)

    for col in frame.select_dtypes(include=['object']).columns:
        if col == 'RecordID':
            continue
        # Trim AFTER the replacement: the replacement itself can introduce
        # leading/trailing spaces (e.g. "100%" -> "100 "), and a value made only
        # of replaced characters must collapse to '' so it is nulled below.
        cleaned = (
            frame[col]
            .astype(str)
            .str.replace(r'[\n\r\t%]', ' ', regex=True)
            .str.strip()
        )
        frame[col] = cleaned.replace({'': None, 'nan': None, 'None': None})

    # Handle DOB as date if present
    if 'dob' in frame.columns:
        frame['dob'] = pd.to_datetime(frame['dob'], errors='coerce', dayfirst=True)
    return frame


def _to_polars(frame, fallback=False):
    """Sanitize ``frame``, convert it to Polars and print a sample and the schema.

    Args:
        frame: pandas DataFrame as fetched from Trino.
        fallback: True when the data came from the DBAPI fallback path; only
            changes the wording of the log line and the printed headings.

    Returns:
        The Polars DataFrame built from the sanitized pandas frame.
    """
    result = pl.from_pandas(_sanitize_frame(frame))
    if fallback:
        logging.info(f"Fallback created Polars DataFrame with {len(result)} rows and columns: {result.columns}")
        print("Temporary DataFrame sample (fallback):")
        print(result.head(5))
        print("\nDataFrame schema (fallback):")
    else:
        logging.info(f"Created temporary Polars DataFrame with {len(result)} rows and columns: {result.columns}")
        print("Temporary DataFrame sample:")
        print(result.head(5))
        print("\nDataFrame schema:")
    print(result.schema)
    return result


try:
    # Primary attempt: Use Pandas + SQLAlchemy for robust fetching
    engine = get_trino_connection()
    try:
        # Chunked reading for scalability
        df_pandas_chunks = pd.read_sql(text(base_query), engine, chunksize=100000)
        df_pandas = pd.concat(list(df_pandas_chunks), ignore_index=True)
    finally:
        # Release the engine's connections even when the read fails
        engine.dispose()
    logging.info(f"Fetched {len(df_pandas)} rows with Pandas from {TRINO_CATALOG}")

    # Sanitize, convert to Polars for fast processing, display sample and schema
    df = _to_polars(df_pandas)

except Exception as e:
    logging.error(f"Primary fetch failed: {str(e)}")
    # Fallback: Use raw Trino DBAPI
    try:
        conn = trino.dbapi.connect(
            host=TRINO_HOST,
            port=TRINO_PORT,
            user=TRINO_USER,
            catalog=TRINO_CATALOG,
            schema=TRINO_SCHEMA,
            http_scheme='http'  # Use 'https' if SSL enabled
        )
        try:
            cur = conn.cursor()
            cur.execute(base_query)
            rows = cur.fetchall()
            columns = [desc[0] for desc in cur.description]
        finally:
            # Close the connection whether or not the query succeeded
            conn.close()
        df_pandas = pd.DataFrame(rows, columns=columns)
        logging.info(f"Fallback fetched {len(df_pandas)} rows with Trino DBAPI")

        # Same post-processing as the primary path
        df = _to_polars(df_pandas, fallback=True)

    except Exception as fallback_e:
        logging.error(f"Fallback failed: {str(fallback_e)}")
        # Chain explicitly so the traceback shows the underlying failure as the cause
        raise ValueError(f"Cannot fetch data from {TRINO_CATALOG}. Check connection, permissions, or table existence. Full error: {str(fallback_e)}") from fallback_e

# Clean up: only the Polars frame `df` is needed by later cells
del df_pandas
gc.collect()

2025-09-08 17:24:21,006 - INFO - Fetching raw data from hive.default.employees
2025-09-08 17:24:21,297 - INFO - Connected to Trino with catalog hive
2025-09-08 17:24:21,731 - INFO - Fetched 52 rows with Pandas from hive
2025-09-08 17:24:21,796 - INFO - Created temporary Polars DataFrame with 52 rows and columns: ['name', 'city', 'salary', 'department', 'age', 'RecordID']


Temporary DataFrame sample:
shape: (5, 6)
┌─────────┬───────────┬────────┬────────────┬─────┬──────────┐
│ name    ┆ city      ┆ salary ┆ department ┆ age ┆ RecordID │
│ ---     ┆ ---       ┆ ---    ┆ ---        ┆ --- ┆ ---      │
│ str     ┆ str       ┆ i64    ┆ str        ┆ i64 ┆ i64      │
╞═════════╪═══════════╪════════╪════════════╪═════╪══════════╡
│ Alice   ┆ Mumbai    ┆ 55000  ┆ IT         ┆ 29  ┆ 1        │
│ Bob     ┆ Delhi     ┆ 62000  ┆ HR         ┆ 35  ┆ 2        │
│ Charlie ┆ Bangalore ┆ 70000  ┆ Finance    ┆ 40  ┆ 3        │
│ Diana   ┆ Hyderabad ┆ 48000  ┆ IT         ┆ 27  ┆ 4        │
│ Ethan   ┆ Chennai   ┆ 51000  ┆ Marketing  ┆ 30  ┆ 5        │
└─────────┴───────────┴────────┴────────────┴─────┴──────────┘

DataFrame schema:
Schema([('name', String), ('city', String), ('salary', Int64), ('department', String), ('age', Int64), ('RecordID', Int64)])


In [2]:
from universal_blocking import BlockingFactory
try:
    # Build the blocker from the in-memory DataFrame loaded by the first cell;
    # RecordID is the column that identifies each record.
    blocker = BlockingFactory.auto_create(record_id_col="RecordID", df=df)
    print("✅ Blocking initialized")
    print(f"Blocking rules available: {list(blocker.rules.keys())}")
except Exception as init_error:
    # Log for the notebook output, then let the original error propagate
    logging.error(f"Failed to initialize blocking: {init_error!s}")
    raise

2025-09-08 17:24:25,356 - INFO - BlockingFactory.auto_create called with df=True, conn_str=None, view_name=None, collection=None
2025-09-08 17:24:25,372 - INFO - Creating PolarsBlocking
2025-09-08 17:24:25,372 - INFO - Initializing PolarsBlocking with DataFrame columns: ['name', 'city', 'salary', 'department', 'age', 'RecordID']
2025-09-08 17:24:25,372 - INFO - Attribute map: {'customer_name': 'name', 'city': 'city'}
2025-09-08 17:24:25,372 - INFO - Generated rules: []


✅ Blocking initialized
Blocking rules available: []


In [3]:
import logging
import altair as alt
from IPython.display import display

# Ensure logging is configured
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# per_rule_dfs is a notebook-level variable consumed by Cell 4. A `global`
# statement at module level has no effect, so none is used here. Instead, any
# value left over from an earlier execution is discarded first: if this cell
# fails, Cell 4 then reports that per_rule_dfs is missing rather than silently
# clustering stale pairs.
globals().pop("per_rule_dfs", None)

try:
    logging.info(f"DataFrame columns and types: {dict(zip(df.columns, df.dtypes))}")
    logging.info(f"Attribute map: {blocker.attr_map}")

    # Run every blocking rule; the result is kept for Cell 4
    per_rule_dfs = blocker.run_all(parallel=True, max_workers=8)

    if per_rule_dfs:
        # Second element of the report tuple is not used in this notebook
        stats_df, _ = blocker.generate_rule_report(
            per_rule_dfs=per_rule_dfs,
            show_top_n=10,
            save_html_path=None
        )
        # Bar chart of each rule's share of candidate pairs, largest first
        chart = (
            alt.Chart(stats_df)
            .mark_bar()
            .encode(
                x=alt.X("Rule:N", sort="-y", title="Blocking Rule"),
                y=alt.Y("PairsPct:Q", title="Candidate Pairs (%)"),
                tooltip=[
                    alt.Tooltip("Rule:N", title="Rule"),
                    alt.Tooltip("Pairs:Q", format=",.0f", title="Pairs"),
                    alt.Tooltip("PairsPct:Q", format=".2f", title="Pairs (%)"),
                    alt.Tooltip("UniqueRecords:Q", format=",.0f", title="Unique Records"),
                    alt.Tooltip("AvgBlockSize:Q", format=".2f", title="Avg Block Size")
                ],
                color=alt.Color("PairsPct:Q", scale=alt.Scale(scheme="blues"), title="Pairs (%)")
            )
            .properties(
                title="Blocking Rules by Percentage of Candidate Pairs",
                width=600,
                height=400
            )
        )
        print("✅ Blocking rules executed and report generated")
        display(stats_df)
        display(chart)
    else:
        print("No candidate pairs found.")
except Exception as e:
    logging.error(f"Failed to run blocking rules or generate report: {str(e)}")
    raise

2025-09-08 17:24:25,410 - INFO - DataFrame columns and types: {'name': String, 'city': String, 'salary': Int64, 'department': String, 'age': Int64, 'RecordID': Int64}
2025-09-08 17:24:25,411 - INFO - Attribute map: {'customer_name': 'name', 'city': 'city'}
2025-09-08 17:24:25,412 - INFO - Running all rules (parallel=True, max_workers=8)


No candidate pairs found.


In [4]:
import logging
import pandas as pd
import networkx as nx
import altair as alt
from IPython.display import display
import polars as pl
import psutil
import gc

# Set up INFO-level, timestamped logging for this cell
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def merge_pairs(per_rule_dfs: dict) -> pl.DataFrame:
    """Collapse the per-rule candidate pairs into one row per distinct pair.

    The rules are not re-run; only the frames already held in ``per_rule_dfs``
    are combined.

    Args:
        per_rule_dfs: Mapping whose values are Polars DataFrames. The columns
            ``RecordID1``, ``RecordID2`` and ``Rule`` are read from them.
            (Presumably keyed by rule name; the keys are not used here.)

    Returns:
        A DataFrame with columns ``RecordID1``, ``RecordID2`` and
        ``RulesUsed``, where ``RulesUsed`` is the sorted, comma-joined set of
        ``Rule`` values seen for that pair. When ``per_rule_dfs`` is empty or
        otherwise falsy, an empty DataFrame with those three columns.
    """
    if per_rule_dfs:
        logging.info("Merging candidate pairs")
        stacked = pl.concat([*per_rule_dfs.values()])
        # Distinct rule names per pair, sorted, joined into a single string
        rules_used = pl.col("Rule").unique().sort().str.join(",")
        unique_pairs = stacked.group_by(["RecordID1", "RecordID2"]).agg(
            RulesUsed=rules_used
        )
        logging.info(f"Merged {len(unique_pairs)} unique pairs")
        return unique_pairs

    logging.info("No pairs to merge")
    return pl.DataFrame(schema=["RecordID1", "RecordID2", "RulesUsed"])

# Bound before the try block: the finally clause logs memory through `process`,
# and would raise NameError (hiding the real error) if it were still unbound.
process = psutil.Process()

try:
    # Log memory usage
    logging.info(f"Memory before merging: {process.memory_info().rss / 1024**2:.2f} MB")

    # Reuse per_rule_dfs from Cell 3
    if 'per_rule_dfs' not in globals():
        raise ValueError("per_rule_dfs not found. Run Cell 3 first.")
    merged_pairs = merge_pairs(per_rule_dfs)

    # Create clusters in chunks
    if not merged_pairs.is_empty():
        logging.info(f"Processing {len(merged_pairs)} pairs for clustering")
        G = nx.Graph()
        chunk_size = 100000  # Reduced to 100K pairs per chunk
        for i in range(0, len(merged_pairs), chunk_size):
            chunk = merged_pairs[i:i + chunk_size]
            chunk_no = i // chunk_size + 1
            logging.info(f"Adding chunk {chunk_no} with {len(chunk)} pairs")
            # One edge per candidate pair; the matching rules ride along as an edge attribute
            for row in chunk.iter_rows(named=True):
                G.add_edge(row["RecordID1"], row["RecordID2"], rules=row["RulesUsed"])
            # Clear memory
            del chunk
            gc.collect()
            logging.info(f"Memory after chunk {chunk_no}: {process.memory_info().rss / 1024**2:.2f} MB")

        # Generate clusters: each connected component of the pair graph is one cluster
        logging.info("Generating clusters")
        clusters = []
        min_cluster_size = 2
        for i, component in enumerate(nx.connected_components(G)):
            if len(component) >= min_cluster_size:
                for record_id in component:
                    # ClusterID is the 1-based component index, so skipped
                    # (too small) components leave gaps in the numbering.
                    clusters.append({"ClusterID": i + 1, "RecordID": record_id})

        clusters_df = pd.DataFrame(clusters)
        if not clusters_df.empty:
            # Create Altair chart for cluster size distribution
            cluster_sizes = clusters_df.groupby("ClusterID").size().reset_index(name="Size")
            chart = (
                alt.Chart(cluster_sizes)
                .mark_bar()
                .encode(
                    x=alt.X("Size:Q", bin=alt.Bin(maxbins=20), title="Cluster Size (Records)"),
                    y=alt.Y("count():Q", title="Number of Clusters"),
                    tooltip=[
                        alt.Tooltip("Size:Q", title="Cluster Size"),
                        alt.Tooltip("count():Q", title="Number of Clusters")
                    ],
                    color=alt.Color("count():Q", scale=alt.Scale(scheme="greens"), title="Cluster Count")
                )
                .properties(
                    title="Distribution of Cluster Sizes",
                    width=600,
                    height=400
                )
            )
            print(f"✅ Created {clusters_df['ClusterID'].nunique()} clusters with {len(clusters_df)} records")
            display(clusters_df)  # Display cluster DataFrame
            display(chart)        # Display Altair chart
        else:
            print("No clusters found with minimum size of 2")
    else:
        print("No candidate pairs to cluster")
except Exception as e:
    logging.error(f"Failed to merge pairs or create clusters: {str(e)}")
    raise
finally:
    # Clean up memory; G / merged_pairs exist only if the run got that far
    if 'G' in locals():
        del G
    if 'merged_pairs' in locals():
        del merged_pairs
    gc.collect()
    logging.info(f"Memory after cleanup: {process.memory_info().rss / 1024**2:.2f} MB")

2025-09-08 17:24:25,457 - INFO - Memory before merging: 201.14 MB
2025-09-08 17:24:25,460 - INFO - No pairs to merge
2025-09-08 17:24:25,747 - INFO - Memory after cleanup: 202.22 MB


No candidate pairs to cluster
