In [1]:
import os
import re
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
from metaphone import doublemetaphone
from pathlib import Path
import plotly.io as pio
import plotly.express as px

#for Splink
from splink.exploratory import completeness_chart
from splink.blocking_analysis import count_comparisons_from_blocking_rule, n_largest_blocks, cumulative_comparisons_to_be_scored_from_blocking_rules_chart
from splink.exploratory import profile_columns
import splink.comparison_library as cl
from splink import block_on, Linker, SettingsCreator, DuckDBAPI

# Auto-blocking automation (enhanced functions)
from auto_blocking import (infer_roles_generic, ensure_derived_columns, build_settings, auto_generate_blocking_rules)

In [2]:
def clean_phone_number(phone):
    """
    Clean and standardize phone numbers to a consistent format.
    Returns a tuple of (cleaned_number, extension)
    
    Args:
        phone: The phone number string to clean
        
    Returns:
        pd.Series: Series with [cleaned_number, extension]
    """
    if pd.isnull(phone):
        return pd.Series([np.nan, np.nan])

    # Convert to string and remove whitespace
    phone_str = str(phone).lower().strip()

    # Regex to split extension (e.g., x123, ext.456)
    match = re.match(r'([^\\dxext]*\\d[\\d\\D]*?)(?:\\s*(?:ext\\.?|x)\\s*(\\d+))?$', phone_str)

    if match:
        main_part = re.sub(r'\\D', '', match.group(1))  # remove non-digits
        extension = match.group(2) if match.group(2) else np.nan
        return pd.Series([main_part, extension])
    else:
        return pd.Series([re.sub(r'\\D', '', phone_str), np.nan])


def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize and clean input data
    
    Args:
        df: Input DataFrame
        
    Returns:
        pd.DataFrame: Preprocessed DataFrame
    """
    df = df.copy()

    # --- Map user's schema to canonical columns expected downstream ---
    # surname -> last_name
    if 'last_name' not in df.columns and 'surname' in df.columns:
        df['last_name'] = df['surname']
    # address -> street
    if 'street' not in df.columns and 'address' in df.columns:
        df['street'] = df['address']
    # postal_code or postcode_fake -> zip
    if 'zip' not in df.columns:
        if 'postal_code' in df.columns:
            df['zip'] = df['postal_code']
        elif 'postcode_fake' in df.columns:
            df['zip'] = df['postcode_fake']
    # Ensure state column exists for downstream logic
    if 'state' not in df.columns:
        df['state'] = np.nan
    # Ensure first_name exists (user has first_name)
    # Ensure city exists (user has city)
    # Ensure email, phone exist (user has email, phone)

    # Clean text columns
    text_cols = ['first_name', 'last_name', 'street', 'city', 'state', 'email']
    for col in text_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.lower()

    # Format ZIP/Postcodes: normalize case and spacing (supports alphanumeric postcodes)
    if 'zip' in df.columns:
        df['zip'] = df['zip'].astype(str).str.strip()
        df['zip'] = df['zip'].str.replace(r'\\s+', '', regex=True).str.upper()

    # Clean phone numbers (safe apply with error handling)
    if 'phone' in df.columns:
        try:
            phone_split = df['phone'].apply(lambda x: clean_phone_number(x))
            df[['phone', 'phone_ext']] = pd.DataFrame(phone_split.tolist(), columns=['phone', 'phone_ext'])
        except Exception as e:
            print(f"Phone cleaning failed: {e}. Skipping.")
            df['phone'] = df['phone'].astype(str).str.replace(r'\\D', '', regex=True)
            df['phone_ext'] = np.nan

    # Extract email domain
    if 'email' in df.columns:
        df['email'] = df['email'].str.lower()
        df['email_domain'] = df['email'].str.extract(r'@([\\w\\.-]+)', expand=False)

    # To make entire DataFrame consistent
    df = df.where(pd.notnull(df), np.nan)

    # Create full name if missing
    if 'full_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
        df['full_name'] = df['first_name'].fillna('') + ' ' + df['last_name'].fillna('')
    if 'full_name' in df.columns:
        df['full_name'] = df['full_name'].astype(str).str.strip().str.lower()

    # Create phonetic encodings
    if 'first_name' in df.columns:
        df['first_name_metaphone'] = df['first_name'].apply(
            lambda x: doublemetaphone(str(x))[0] if pd.notnull(x) and x else np.nan
        )
    
    if 'last_name' in df.columns:
        df['last_name_metaphone'] = df['last_name'].apply(
            lambda x: doublemetaphone(str(x))[0] if pd.notnull(x) and x else np.nan
        )

    return df

In [3]:
def combine_dataframes(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Combine two preprocessed DataFrames, reset index, and add unique ID."""
    combined_df = pd.concat([df1, df2], ignore_index=True)
    combined_df.reset_index(drop=True)
    combined_df["unique_id"] = combined_df.index.astype(str)
    return combined_df

def identify_and_show_duplicates(df: pd.DataFrame, subset_cols: list) -> pd.DataFrame:
    """
    Create a 'duplicate_of' column showing the duplicate occurrences.
    
    Args:
        df (pd.DataFrame): Input DataFrame.
        subset_cols (list): List of column names to check for duplicates.
        
    Returns:
        pd.DataFrame: Updated DataFrame where 'duplicate_of' is 2 or more.
    """
    duplicates = df[df.duplicated(subset=subset_cols)]
    
    return duplicates


def analyze_completeness(df: pd.DataFrame, db_api: DuckDBAPI):
    """Generate completeness chart."""
    return completeness_chart(df, db_api=db_api)

def profile_data(df: pd.DataFrame, db_api: DuckDBAPI, top_n: int = 10, bottom_n: int = 5):
    """Profile selected columns using DuckDBAPI."""
    return profile_columns(df, db_api=db_api, top_n=top_n, bottom_n=bottom_n)


In [4]:
# Helpers compatible with pandas DataFrames AND DuckDB views
from typing import Union

PandasOrView = Union[pd.DataFrame, str]


def combine_dataframes(df1: PandasOrView, df2: PandasOrView, db_api: DuckDBAPI = None, view_name: str = "combined_view") -> PandasOrView:
    """
    Combine two datasets and add a unique_id.

    - If df1/df2 are pandas DataFrames: return a pandas DataFrame with unique_id.
    - If df1/df2 are DuckDB view/table names (str) and db_api is provided: create/replace
      a DuckDB view that unions them and adds unique_id via row_number(). Returns the view name.
    """
    if isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame):
        combined_df = pd.concat([df1, df2], ignore_index=True)
        combined_df.reset_index(drop=True)
        combined_df["unique_id"] = combined_df.index.astype(str)
        return combined_df

    if isinstance(df1, str) and isinstance(df2, str) and db_api is not None:
        conn = db_api.connection  # type: ignore[attr-defined]
        conn.execute(f"""
            CREATE OR REPLACE VIEW {view_name} AS
            SELECT *, CAST(row_number() OVER () - 1 AS VARCHAR) AS unique_id
            FROM (
                SELECT * FROM {df1}
                UNION ALL
                SELECT * FROM {df2}
            )
        """)
        return view_name

    raise ValueError("combine_dataframes expects two pandas DataFrames OR two DuckDB table/view names with db_api provided.")


def identify_and_show_duplicates(df_or_view: PandasOrView, subset_cols: list, db_api: DuckDBAPI = None, sample_limit: int = 1000) -> pd.DataFrame:
    """
    Identify duplicate rows based on subset_cols.

    - If given a pandas DataFrame: returns a pandas DataFrame of duplicates.
    - If given a DuckDB view/table name (str) and db_api: returns a pandas sample from DuckDB (up to sample_limit).
    """
    if not subset_cols:
        raise ValueError("subset_cols must be a non-empty list of column names")

    if isinstance(df_or_view, pd.DataFrame):
        return df_or_view[df_or_view.duplicated(subset=subset_cols, keep=False)]

    if isinstance(df_or_view, str) and db_api is not None:
        cols = ", ".join(subset_cols)
        conn = db_api.connection  # type: ignore[attr-defined]
        query = f"""
            WITH dup_keys AS (
                SELECT {cols}, COUNT(*) AS cnt
                FROM {df_or_view}
                GROUP BY {cols}
                HAVING COUNT(*) > 1
            )
            SELECT s.*
            FROM {df_or_view} s
            INNER JOIN dup_keys d
            USING ({cols})
            LIMIT {int(sample_limit)}
        """
        return conn.execute(query).fetch_df()

    raise ValueError("identify_and_show_duplicates expects a pandas DataFrame OR a DuckDB table/view name with db_api provided.")


def analyze_completeness(df_or_view: PandasOrView, db_api: DuckDBAPI):
    return completeness_chart(df_or_view, db_api=db_api)


def profile_data(df_or_view: PandasOrView, db_api: DuckDBAPI, top_n: int = 10, bottom_n: int = 5):
    return profile_columns(df_or_view, db_api=db_api, top_n=top_n, bottom_n=bottom_n)


In [5]:
try:
    import trino
    from trino.dbapi import connect as trino_connect
    from trino.auth import BasicAuthentication
except Exception as e:
    raise ImportError("trino Python package is required. Add 'trino' to requirements and pip install it.")

# --- Connection config (edit defaults as needed) ---
TRINO_HOST = os.getenv("TRINO_HOST", "3.108.199.0")
TRINO_PORT = int(os.getenv("TRINO_PORT", "32092"))
TRINO_USER = os.getenv("TRINO_USER", "root")
TRINO_CATALOG = os.getenv("TRINO_CATALOG", "hive")  # Hive connector pointing to MinIO-backed data
TRINO_HTTP_SCHEME = os.getenv("TRINO_HTTP_SCHEME", "http")  # or "https"
# No auth required per your setup
auth = None

conn = trino_connect(
    host=TRINO_HOST,
    port=TRINO_PORT,
    user=TRINO_USER,
    catalog=TRINO_CATALOG,
    http_scheme=TRINO_HTTP_SCHEME,
    auth=auth,
)

cur = conn.cursor()

# --- List schemas and prompt user ---
cur.execute("SHOW SCHEMAS")
schemas = sorted([r[0] for r in cur.fetchall()])
print(f"Available schemas:")
for i, s in enumerate(schemas):
    print(f"  {i+1}. {s}")

schema_input = input("Enter schema name (exact) from the list above: ").strip()
if schema_input not in schemas:
    raise ValueError(f"Schema '{schema_input}' not found in catalog {TRINO_CATALOG}")

# --- List tables in chosen schema ---
cur.execute(f"SHOW TABLES FROM {TRINO_CATALOG}.{schema_input}")
tables = sorted([r[0] for r in cur.fetchall()])
print(f"Tables in {TRINO_CATALOG}.{schema_input}:")
for i, t in enumerate(tables):
    print(f"  {i+1}. {t}")

raw_tables = input("Enter one or more table names (comma-separated) to dedupe: ").strip()
selected_tables = [t.strip() for t in raw_tables.split(",") if t.strip()]
if len(selected_tables) < 1:
    raise ValueError("Please provide at least 1 table name.")

# --- Load tables to pandas (optionally limit rows for speed) ---
row_limit = input("Optional row limit per table (blank for no limit): ").strip()
limit_sql = ""
if row_limit:
    try:
        limit_n = int(row_limit)
        limit_sql = f" LIMIT {limit_n}"
    except Exception:
        print("Invalid limit provided; loading full tables.")
        limit_sql = ""

loaded_dfs = []
for tbl in selected_tables:
    q = f"SELECT * FROM {TRINO_CATALOG}.{schema_input}.{tbl}{limit_sql}"
    print(f"Loading: {q}")
    cur.execute(q)
    rows = cur.fetchall()
    cols = [d[0] for d in cur.description]
    df_tbl = pd.DataFrame(rows, columns=cols)
    loaded_dfs.append((tbl, df_tbl))

# --- Map/clean using existing preprocess and combine logic ---
preprocessed = []
for tbl, df in loaded_dfs:
    print(f"Preprocessing table: {tbl} (rows={len(df):,})")
    preprocessed.append(preprocess_data(df))

# Concatenate any number of tables into one combined_df
combined_df = pd.concat(preprocessed, ignore_index=True)
combined_df["unique_id"] = combined_df.index.astype(str)

print(f"combined_df ready: {len(combined_df):,} rows; columns: {len(combined_df.columns)}")

# Close connection
cur.close()
conn.close()

Available schemas:
  1. archive
  2. default
  3. information_schema
  4. ml
  5. new_schema
  6. sample
  7. test
  8. test11
  9. test_schema
  10. testing
  11. tpch_1
Tables in hive.ml:
  1. _schema
  2. _schema_23062023_091242
  3. abc
  4. abc_20220131
  5. abc_20230605
  6. availableequipment3g
  7. availableequipment3g_03012024_072141
  8. call_center
  9. call_center_01162024_100823
  10. call_center_02062023_092201
  11. call_center_02062023_092201_20230602
  12. call_center_02062023_092201_20230603
  13. call_center_02062023_092201_20230604
  14. call_center_02062023_092201_20230606
  15. call_center_06302023_103513
  16. call_center_08112023_104615
  17. call_center_08302023_085409
  18. call_center_09192024_113326
  19. call_center_11052024_045634
  20. call_center_12052023_062939
  21. call_center_12052023_120858
  22. call_center_20230602
  23. call_center_20230603
  24. call_center_20230604
  25. call_center_20230605
  26. call_center_20230609
  27. call_center_20230610

In [6]:
# Quick data inspection (like in notebook.ipynb)
print(f"Combined data shape: {len(combined_df):,} rows x {len(combined_df.columns)} columns")
print("Columns:", list(combined_df.columns))
print("\nSample data:")
print(combined_df.head(3).to_string())

Combined data shape: 50,578 rows x 24 columns
Columns: ['full_name', 'first_and_surname', 'first_name', 'surname', 'dob', 'birth_place', 'postcode_fake', 'gender', 'occupation', 'email', 'phone', 'address', 'city', 'country', 'postal_code', 'last_name', 'street', 'zip', 'state', 'phone_ext', 'email_domain', 'first_name_metaphone', 'last_name_metaphone', 'unique_id']

Sample data:
           full_name  first_and_surname first_name     surname         dob       birth_place postcode_fake gender occupation                                email         phone                      address              city     country postal_code   last_name                       street      zip state  phone_ext email_domain first_name_metaphone last_name_metaphone unique_id
0  arthur englefield  arthur englefield     arthur  englefield  1938-01-01  stockton-on-tees      ts19 0sh   male    painter  arthurenglefield7029_at_hotmail.com  +0) 199-7096     stockton-on-tees Streeet  stockton-on-tees  Uzbekistan     

In [7]:
# Debug: Verify auto_blocking import and input data
try:
    from auto_blocking import (
        infer_roles_generic, ensure_derived_columns, build_settings,
        auto_generate_blocking_rules
    )
    print("Successfully imported auto_generate_blocking_rules from auto_blocking")
except ImportError as e:
    print(f"ImportError: {e}")
    raise

# Debug: Inspect input data
print(f"combined_df shape: {combined_df.shape}")
print(f"Columns: {list(combined_df.columns)}")
print(f"Sample data:\n{combined_df.head(3).to_string()}")

# Automated blocking rules and settings generation (data-driven, like notebook.ipynb but fully auto)
# Sample if too large (prevent crashes/memory issues)
if len(combined_df) > 50000:
    print("Data large (>50k rows); sampling to 50k for rule generation/analysis.")
    combined_df_sample = combined_df.sample(n=50000, random_state=42).reset_index(drop=True)
else:
    combined_df_sample = combined_df.copy()

_db = DuckDBAPI()
try:
    _db.connection.execute("PRAGMA threads=auto")
    _db.connection.execute("PRAGMA max_temp_directory_size='10GiB'")
except Exception as e:
    print(f"Warning: Failed to set DuckDB PRAGMAs: {e}")

# Auto-generate rules and infer roles (enhanced automation)
try:
    print(f"combined_df_sample shape: {combined_df_sample.shape}")
    print(f"Sample columns: {list(combined_df_sample.columns)}")
    blocking_rules, roles = auto_generate_blocking_rules(combined_df_sample, _db, max_rules=3)
    print(f"\nAuto-generated {len(blocking_rules)} blocking rules based on data characteristics.")
    print("Inferred roles used for rules/settings:", roles)
    
    # Build settings dynamically from roles (data-driven parameters)
    settings = build_settings(roles)
    print("Settings built with comparisons for detected roles (e.g., names, email, geo).")
    
except Exception as e:
    print(f"Error in auto-generation: {e}")
    raise

# Cleanup temp DB
del _db

Successfully imported auto_generate_blocking_rules from auto_blocking
combined_df shape: (50578, 24)
Columns: ['full_name', 'first_and_surname', 'first_name', 'surname', 'dob', 'birth_place', 'postcode_fake', 'gender', 'occupation', 'email', 'phone', 'address', 'city', 'country', 'postal_code', 'last_name', 'street', 'zip', 'state', 'phone_ext', 'email_domain', 'first_name_metaphone', 'last_name_metaphone', 'unique_id']
Sample data:
           full_name  first_and_surname first_name     surname         dob       birth_place postcode_fake gender occupation                                email         phone                      address              city     country postal_code   last_name                       street      zip state  phone_ext email_domain first_name_metaphone last_name_metaphone unique_id
0  arthur englefield  arthur englefield     arthur  englefield  1938-01-01  stockton-on-tees      ts19 0sh   male    painter  arthurenglefield7029_at_hotmail.com  +0) 199-7096     stock

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Rule 'exact_zip': comparisons=inf, type=<class 'float'>
Rule 'exact_city': comparisons=inf, type=<class 'float'>


Computation of number of comparisons post-filter conditions was skipped because the number of comparisons generated by your blocking rule exceeded max_rows_limit=1.00e+09.
It would be likely to be slow to compute.
If you still want to go ahead increase the value of max_rows_limit argument to above 2.500e+09.
Read more about the definitions here:
https://moj-analytical-services.github.io/splink/topic_guides/blocking/performance.html?h=filter+cond#filter-conditions


Rule 'exact_state': comparisons=inf, type=<class 'float'>


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Rule 'first_name_initial': comparisons=inf, type=<class 'float'>
Rule 'last_name_metaphone': comparisons=inf, type=<class 'float'>

Blocking rule analysis:
✗ exact_email: inf comparisons (invalid_count)
✗ exact_email: inf comparisons (invalid_count)
✗ exact_phone: inf comparisons (invalid_count)
✗ metaphone_full_name: inf comparisons (invalid_count)
✗ zip_lastname: inf comparisons (invalid_count)
✗ city_firstname_initial: inf comparisons (invalid_count)
✗ exact_zip: inf comparisons (invalid_count)
✗ exact_city: inf comparisons (invalid_count)
✗ exact_state: inf comparisons (invalid_count)
✗ first_name_initial: inf comparisons (invalid_count)
✗ last_name_metaphone: inf comparisons (invalid_count)
✓ exact_email: inf comparisons

Auto-generated 1 blocking rules based on data characteristics.
Inferred roles used for rules/settings: {'first_name': 'first_name', 'last_name': 'last_name', 'full_name': 'full_name', 'email': 'email', 'phone': 'phone', 'zip': 'zip', 'city': 'city', 'state': 'sta

In [8]:
# Analyze completeness of data
db_api = DuckDBAPI()
analyze_completeness(combined_df, db_api)

In [9]:
# Profile the data
profile_data(combined_df, db_api)



In [10]:
# Train using auto-generated settings and blocking rules
from typing import List, Tuple, Optional

def train_and_save_model(path: str, df: pd.DataFrame, settings: Optional[object] = None, blocking_rules: Optional[List[Tuple[str, object]]] = None):
    import json
    # Auto-generate settings and rules if not provided to preserve old call signature
    if settings is None or blocking_rules is None:
        from auto_blocking import infer_roles_generic, ensure_derived_columns, propose_blocking_rules, build_settings
        roles = infer_roles_generic(df)
        df = ensure_derived_columns(df, roles)
        if settings is None:
            settings = build_settings(roles)
        if blocking_rules is None:
            blocking_rules = propose_blocking_rules(df, roles)

    # Sample for faster EM training
    sample_size = min(len(df), 20000)
    training_df = df.sample(n=sample_size, random_state=42) if len(df) > sample_size else df

    # Initialise linker with auto settings
    train_linker = Linker(training_df, settings, db_api=DuckDBAPI())

    # Convert (label, block_on(...)) -> block_on objects
    br_objects = [rule for _, rule in blocking_rules]

    # Compute comparisons per rule, keep non-zero, prefer smaller blocks
    rule_counts = []
    for br in br_objects:
        try:
            c = train_linker.count_num_comparisons_from_blocking_rule(br)
        except Exception:
            c = 0
        rule_counts.append((br, c))
    rule_counts.sort(key=lambda x: x[1])
    valid_rules = [br for br, c in rule_counts if c > 0]
    if not valid_rules:
        # Emergency fallback: create hash buckets on the sampled training_df to guarantee pairs
        # This does not change your original df; only used for training the parameters
        training_df = training_df.copy()
        training_df["__bucket__"] = (training_df.reset_index().index % 100).astype(int)
        # Recreate linker on the augmented training data
        train_linker = Linker(training_df, settings, db_api=DuckDBAPI())
        bucket_rule = block_on("__bucket__")
        subset_for_prior = [bucket_rule]
    else:
        # Estimate m prior (probability two random records match) using top few rules
        subset_for_prior = valid_rules[:3] if len(valid_rules) > 3 else valid_rules
    train_linker.training.estimate_probability_two_random_records_match(subset_for_prior, recall=0.6)

    # Estimate u using random sampling
    train_linker.training.estimate_u_using_random_sampling(max_pairs=2_000_000)

    # EM parameter estimation using same subset
    subset_for_em = subset_for_prior
    for br in subset_for_em:
        train_linker.training.estimate_parameters_using_expectation_maximisation(br)

    # Save trained settings
    train_linker.misc.save_model_to_json(path, overwrite=True)

    # Load trained settings and build full-data linker
    with open(path, 'r', encoding='utf-8') as f:
        trained_settings = json.load(f)
    full_linker = Linker(df, trained_settings, db_api=DuckDBAPI())
    return full_linker

In [11]:
def visualize_model(linker):
    # Set renderer for VS Code/Notebook environments
    try:
        pio.renderers.default = "vscode"
    except Exception:
        pio.renderers.default = "notebook_connected"

    # Generate figures
    fig1 = linker.visualisations.match_weights_chart()
    fig2 = linker.visualisations.m_u_parameters_chart()
    fig3 = linker.visualisations.parameter_estimate_comparisons_chart()

    # Resize to avoid clipping
    for f in (fig1, fig2, fig3):
        try:
            f.update_layout(width=1400, height=800)
        except Exception:
            pass

    # Display
    display(fig1)
    display(fig2)
    display(fig3)

In [12]:
linker = train_and_save_model("./final_model3.json",combined_df)
display_of_model_visualizations = visualize_model(linker)

ImportError: cannot import name 'propose_blocking_rules' from 'auto_blocking' (c:\Users\AbhayPandey\Desktop\pASSIONBYTES\auto_blocking.py)

In [13]:
def generate_predictions(linker, prediction_path: str, cluster_path: str,threshold: float):
    df_predictions = linker.inference.predict()
    df_predictions_pd = df_predictions.as_pandas_dataframe() 
    df_predictions_pd = df_predictions_pd[df_predictions_pd["match_probability"] > threshold]
    df_predictions_pd.to_csv(prediction_path, index=False)

    clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
        df_predictions, threshold_match_probability=threshold)
    clusters_pd = clusters.as_pandas_dataframe()
    clusters_pd.to_csv(cluster_path, index=False)
    return df_predictions_pd, clusters_pd


In [14]:
df_preds, clusters = generate_predictions(linker, "splink_predictions.csv", "splink_clusters.csv",0.99)

NameError: name 'linker' is not defined

In [None]:
def get_deduped_id_mapping(df: pd.DataFrame) -> Dict[int, int]:
    """Create a mapping of deduplicated IDs, mapping unique_id_r to the smallest unique_id_l."""
    deduped_ids = df.groupby('unique_id_r')['unique_id_l'].first().reset_index()
    id_mapping = pd.Series(deduped_ids['unique_id_l'].values, index=deduped_ids['unique_id_r']).to_dict()
    int_id_mapping = {int(k): int(v) for k, v in id_mapping.items()}
    return int_id_mapping


def deduplicate_by_mapped_ids(df: pd.DataFrame,column_name:str,id_mapping,output_path:str) -> None:
    """Convert the 'unique_id' column to integer type if necessary."""
    if df[column_name].dtype != 'int64':
        df[column_name] = df[column_name].astype(int)
    df.loc[:, column_name] = df[column_name].replace(id_mapping)
    df = df.drop_duplicates(subset=column_name, keep='last')
    df.to_csv(output_path,index=False)


In [None]:
int_mapping = get_deduped_id_mapping(df_preds)
deduplicate_by_mapped_ids(combined_df,"unique_id",int_mapping,"merged.csv")