## **Feature:** Duplicate Detection

**Names:** Gia Bao Ngo

### **What it does**
Detects and handles duplicate records in datasets using exact matching and fuzzy string matching. Duplicates can be removed by keeping the first occurrence, the last occurrence, or the most complete row (fewest missing values), and a pattern analysis summarises where duplicates occur to help users understand data quality issues.

In [1]:
# Load dotenv
import os
from dotenv import load_dotenv
load_dotenv()

# Get API Key
# Read the key from the process environment (after load_dotenv() has run).
# A missing key is only reported here; execution continues regardless.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np
# Additional imports for duplicate detection
import math
import re
import datetime
from sklearn import preprocessing, impute
from difflib import SequenceMatcher

# Langchain imports
from langchain_openai import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

### **Helper Functions**
- `find_exact_duplicates(df, subset=None)` - Find rows with identical values
- `find_near_duplicates(df, threshold=0.95, subset=None)` - Find similar rows using fuzzy matching
- `remove_duplicates(df, strategy='first', subset=None)` - Remove duplicates with different strategies
- `flag_duplicates(df, subset=None)` - Add boolean column marking duplicates
- `analyze_duplicate_patterns(df, subset=None)` - Show duplicate statistics and patterns

In [2]:
def find_exact_duplicates(df, subset=None):
    """
    Find rows with identical values across specified columns.

    Every member of a duplicate group is reported (not only the repeats), and
    the result is sorted by the compared columns so that members of the same
    group end up next to each other.

    Parameters:
    - df: pandas DataFrame
    - subset: list of column names to check for duplicates (None = all columns)

    Returns:
    - DataFrame with duplicate rows. When nothing is duplicated the result is
      empty but still carries the columns of df, so callers can select
      columns on it without a KeyError.
    """
    if subset is None:
        subset = df.columns.tolist()

    # keep=False marks every occurrence of a repeated row, including the first
    duplicates = df[df.duplicated(subset=subset, keep=False)]

    if len(duplicates) == 0:
        print("No exact duplicates found.")
        # Return the empty selection (not a bare pd.DataFrame()) to keep the schema
        return duplicates.copy()

    print(f"Found {len(duplicates)} duplicate rows across columns: {subset}")
    return duplicates.sort_values(by=subset)

In [3]:
def _truncate_preview(text, limit=100):
    """Return text unchanged, or its first `limit` characters plus '...' if longer."""
    return text[:limit] + '...' if len(text) > limit else text


def find_near_duplicates(df, threshold=0.95, subset=None):
    """
    Find similar rows using fuzzy string matching with difflib.

    The selected columns of each row are converted to strings and joined with
    a space; every pair of rows is then compared with SequenceMatcher. The
    comparison is pairwise, so the cost grows quadratically with the number
    of rows.

    Parameters:
    - df: pandas DataFrame
    - threshold: similarity threshold (0-1, default 0.95)
    - subset: column name or list of column names to check
      (None = object-dtype columns only)

    Returns:
    - DataFrame with one row per similar pair and the columns
      'index_1', 'index_2', 'similarity', 'text_1', 'text_2'.
      The same columns are present when no pair is found (empty frame).
    """
    pair_columns = ['index_1', 'index_2', 'similarity', 'text_1', 'text_2']

    if subset is None:
        # Focus on text columns for fuzzy matching
        subset = df.select_dtypes(include=['object']).columns.tolist()
    elif isinstance(subset, str):
        # A single column name would otherwise select a Series and break the row-wise join
        subset = [subset]

    if not subset:
        print("No text columns found for fuzzy matching.")
        return pd.DataFrame(columns=pair_columns)

    # Build the comparison string of every row once, outside the pair loop.
    # str() is a no-op on strings; it guards against values that astype(str)
    # may leave as missing in some pandas versions (TODO confirm).
    text_frame = df[subset].astype(str)
    row_texts = [
        ' '.join(map(str, values))
        for values in text_frame.itertuples(index=False, name=None)
    ]
    labels = df.index.tolist()

    near_duplicates = []
    for i, text_i in enumerate(row_texts):
        for j in range(i + 1, len(row_texts)):
            text_j = row_texts[j]
            matcher = SequenceMatcher(None, text_i, text_j)
            # real_quick_ratio() and quick_ratio() are upper bounds on ratio(),
            # so a pair they rule out can never reach the threshold.
            if matcher.real_quick_ratio() < threshold or matcher.quick_ratio() < threshold:
                continue
            similarity = matcher.ratio()
            if similarity >= threshold:
                near_duplicates.append({
                    'index_1': labels[i],
                    'index_2': labels[j],
                    'similarity': round(similarity, 3),
                    'text_1': _truncate_preview(text_i),
                    'text_2': _truncate_preview(text_j),
                })

    if not near_duplicates:
        print(f"No near-duplicates found with similarity >= {threshold}")
        return pd.DataFrame(columns=pair_columns)

    print(f"Found {len(near_duplicates)} pairs of near-duplicate rows (similarity >= {threshold})")
    return pd.DataFrame(near_duplicates, columns=pair_columns)

In [4]:
def _keep_most_complete(df, subset):
    """Return df with, per duplicate group, only the row that has the fewest missing values."""
    missing_counts = df.isnull().sum(axis=1).to_numpy()
    # Stable sort: more complete rows first, ties stay in original order,
    # so keep='first' below picks the earliest of the most complete rows.
    by_completeness = np.argsort(missing_counts, kind='stable')
    is_repeat = df.iloc[by_completeness].duplicated(subset=subset, keep='first').to_numpy()
    # Work with positions rather than index labels so repeated labels cannot
    # pull in extra rows; sorting the positions restores the original row order.
    keep_positions = np.sort(by_completeness[~is_repeat])
    return df.iloc[keep_positions]


def remove_duplicates(df, strategy='first', subset=None):
    """
    Remove duplicates with different strategies.

    Parameters:
    - df: pandas DataFrame
    - strategy: 'first' (keep first occurrence), 'last' (keep last occurrence)
      or 'most_complete' (keep the row with the fewest missing values across
      all columns; ties go to the earliest row). Any other value falls back
      to 'first' with a printed notice.
    - subset: list of column names to check for duplicates (None = all columns)

    Returns:
    - DataFrame with duplicates removed, rows in their original order.
      The input DataFrame is not modified.
    """
    if subset is None:
        subset = df.columns.tolist()

    original_count = len(df)

    if strategy not in ('first', 'last', 'most_complete'):
        print(f"Unknown strategy '{strategy}'. Using 'first' instead.")
        # Report the strategy that is actually applied in the summary below
        strategy = 'first'

    if strategy == 'most_complete':
        result_df = _keep_most_complete(df, subset)
    else:
        result_df = df.drop_duplicates(subset=subset, keep=strategy)

    removed_count = original_count - len(result_df)
    print(f"Removed {removed_count} duplicate rows using '{strategy}' strategy")
    print(f"Dataset reduced from {original_count} to {len(result_df)} rows")

    return result_df

In [5]:
def flag_duplicates(df, subset=None):
    """
    Add boolean column marking duplicates for inspection before removal.

    Every member of a duplicate group is flagged, including the first
    occurrence. The input DataFrame is left untouched; the flag is added to
    a copy.

    Parameters:
    - df: pandas DataFrame
    - subset: list of column names to check for duplicates (None = all columns)

    Returns:
    - DataFrame with added 'is_duplicate' column
    """
    if subset is None:
        subset = df.columns.tolist()

    result_df = df.copy()
    result_df['is_duplicate'] = df.duplicated(subset=subset, keep=False)

    duplicate_count = int(result_df['is_duplicate'].sum())
    total_count = len(result_df)
    # An empty frame has no duplicates; report 0% instead of dividing by zero
    duplicate_pct = (duplicate_count / total_count) * 100 if total_count else 0.0

    print(f"Flagged {duplicate_count} rows as duplicates out of {total_count} total rows")
    print(f"Duplicate percentage: {duplicate_pct:.2f}%")

    if duplicate_count > 0:
        print("\nDuplicate rows marked with 'is_duplicate=True' column")
        print("Review flagged rows before deciding on removal strategy")

    return result_df

In [6]:
def analyze_duplicate_patterns(df, subset=None):
    """
    Show duplicate statistics and patterns to understand which columns contribute most to duplicates.

    Three sections are printed: overall counts (duplicate rows, groups,
    average group size), the number of rows whose value repeats in each
    individual column, and the five most frequent duplicated value
    combinations. Missing values are treated as equal to each other in all
    three sections.

    Parameters:
    - df: pandas DataFrame
    - subset: column name or list of column names to analyze (None = all columns)

    Returns:
    - DataFrame (unchanged, prints analysis)
    """
    if subset is None:
        subset = df.columns.tolist()
    elif isinstance(subset, str):
        # A bare column name would otherwise be iterated character by character
        subset = [subset]

    print("=== DUPLICATE PATTERN ANALYSIS ===\n")

    # Overall duplicate statistics (mask computed once and reused below)
    duplicated_mask = df.duplicated(subset=subset, keep=False)
    duplicate_rows = df[duplicated_mask]
    total_duplicates = int(duplicated_mask.sum())
    unique_duplicate_groups = duplicate_rows.drop_duplicates(subset=subset).shape[0]

    print(f"Total duplicate rows: {total_duplicates}")
    print(f"Unique duplicate groups: {unique_duplicate_groups}")
    print(f"Average duplicates per group: {total_duplicates/unique_duplicate_groups if unique_duplicate_groups > 0 else 0:.2f}")

    # Column-wise duplicate analysis
    print("\n--- Column Contribution to Duplicates ---")
    duplicate_contributions = {
        col: df.duplicated(subset=[col], keep=False).sum() for col in subset
    }

    contribution_df = pd.DataFrame(list(duplicate_contributions.items()),
                                   columns=['Column', 'Duplicate_Count'])
    contribution_df['Duplicate_Percentage'] = (contribution_df['Duplicate_Count'] / len(df) * 100).round(2)
    contribution_df = contribution_df.sort_values('Duplicate_Count', ascending=False)

    print(contribution_df.to_string(index=False))

    # Most duplicated values
    print("\n--- Most Frequent Duplicate Patterns ---")
    if total_duplicates > 0:
        # dropna=False keeps groups whose key contains a missing value; the
        # default would silently drop them although they were counted above.
        value_counts = (
            duplicate_rows.groupby(subset, dropna=False)
            .size()
            .sort_values(ascending=False)
            .head(5)
        )

        for i, (values, count) in enumerate(value_counts.items(), 1):
            if isinstance(values, tuple):
                pattern = dict(zip(subset, values))
            else:
                # Grouping by a single column yields scalar keys
                pattern = {subset[0]: values}
            print(f"{i}. Count: {count}, Pattern: {pattern}")

    print("\n=== END ANALYSIS ===")
    return df

In [7]:
# Plain-text description of the helper functions and example query mappings.
# It is sent verbatim to the LLM as the first system message in duplicates(),
# so any edit to this text changes the prompt.
helper_docs = """ Helper functions available:
- find_exact_duplicates(df, subset=None): Find rows with identical values across specified columns. Returns DataFrame with duplicate rows.
- find_near_duplicates(df, threshold=0.95, subset=None): Find similar rows using fuzzy string matching with difflib. Returns DataFrame with pairs of similar rows and similarity scores.
- remove_duplicates(df, strategy='first', subset=None): Remove duplicates with different strategies ('first', 'last', 'most_complete'). Returns cleaned DataFrame.
- flag_duplicates(df, subset=None): Add boolean 'is_duplicate' column marking duplicates for inspection. Returns DataFrame with flag column.
- analyze_duplicate_patterns(df, subset=None): Print comprehensive analysis of duplicate patterns and statistics. Returns unchanged DataFrame.

Examples:
- "Find duplicate rows" -> find_exact_duplicates(df)
- "Remove duplicates keeping first" -> df = remove_duplicates(df, strategy='first')
- "Show duplicate patterns" -> analyze_duplicate_patterns(df)
- "Flag duplicates for review" -> df = flag_duplicates(df)
"""

# **MAIN FEATURE FUNCTION**

In [8]:
def _strip_code_fences(text):
    """Return text without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    fenced = re.fullmatch(r"```[\w+-]*[ \t]*\n(.*?)\n?```", text, flags=re.DOTALL)
    return fenced.group(1).strip() if fenced else text


def duplicates(df, user_query):
    """
    Main function that gets called by the main router.
    MUST take (df, user_query) and return df

    The user's request is sent to the LLM together with the helper
    documentation and a small sample of the data; the returned Python code is
    executed against a copy of df, and whatever the code leaves in its `df`
    variable is returned.

    Parameters:
    - df: pandas DataFrame to work on (never modified in place)
    - user_query: natural-language request describing the duplicate task

    Returns:
    - DataFrame produced by the generated code, or an unmodified copy of the
      input if the code fails or does not leave a DataFrame in `df`
    """

    # Create message chain
    messages = []
    messages.append(SystemMessage(content=helper_docs))
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent focused on duplicate detection and handling.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - difflib (for fuzzy matching with SequenceMatcher)
    - All helper functions listed above
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions when appropriate for duplicate detection tasks
    - ASSUME "df" IS ALREADY DEFINED
    - For analysis queries, use helper functions that print results (analyze_duplicate_patterns, find_exact_duplicates)
    - For data cleaning, use helper functions that modify DataFrame (remove_duplicates, flag_duplicates)
    - ALWAYS assign the result back to df when modifying: df = remove_duplicates(df, strategy='first')
    - In order to generate a response/message to the user use print statements
    print("message")
    - Write a detailed print message to summarise actions taken and reasons
    
    Common query patterns:
    - "Find duplicates" or "Show duplicate rows" -> use find_exact_duplicates(df)
    - "Remove duplicates" -> df = remove_duplicates(df, strategy='first')  
    - "Analyze duplicate patterns" -> analyze_duplicate_patterns(df)
    - "Flag duplicates" or "Mark duplicates" -> df = flag_duplicates(df)
    - "Find similar records" -> find_near_duplicates(df, threshold=0.90)
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap the code in ``` fences despite the instructions;
    # exec() would reject those as a syntax error.
    generated_code = _strip_code_fences(response.content)

    # Untouched copy handed back whenever the generated code cannot be used
    original_df = df.copy()

    # Execute code
    try:
        # One namespace serves as both globals and locals. With separate
        # dicts, lambdas and nested functions in the generated code cannot
        # see 'df' or the helpers. Starting from a copy of the module globals
        # keeps the module-level imports reachable without letting the
        # generated code rebind module names.
        namespace = dict(globals())
        namespace.update({
            'df': df.copy(),
            'original_df': original_df,
            'pd': pd,
            'np': np,
            'find_exact_duplicates': find_exact_duplicates,
            'find_near_duplicates': find_near_duplicates,
            'remove_duplicates': remove_duplicates,
            'flag_duplicates': flag_duplicates,
            'analyze_duplicate_patterns': analyze_duplicate_patterns,
            'print': print
        })

        # NOTE(security): this runs model-generated code with the full
        # privileges of the current process. Only use it with trusted input
        # or inside a sandbox.
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

    result = namespace.get('df')
    if not isinstance(result, pd.DataFrame):
        # The router expects a DataFrame back; do not propagate anything else
        print("Generated code did not leave a DataFrame in 'df'; returning the original data.")
        return original_df
    return result

# **Testing**

In [9]:
# # Create sample data with known duplicates
# test_data = {
#     'Name': ['John Doe', 'Jane Smith', 'John Doe', 'Bob Wilson', 'Jane Smith', 'Alice Brown'],
#     'Age': [25, 30, 25, 35, 30, 28],
#     'City': ['New York', 'Los Angeles', 'New York', 'Chicago', 'Los Angeles', 'Boston'],
#     'Email': ['john@email.com', 'jane@email.com', 'john@email.com', 'bob@email.com', 'jane@email.com', 'alice@email.com']
# }

# test_df = pd.DataFrame(test_data)
# print("Test DataFrame:")
# print(test_df)

Test DataFrame:
          Name  Age         City            Email
0     John Doe   25     New York   john@email.com
1   Jane Smith   30  Los Angeles   jane@email.com
2     John Doe   25     New York   john@email.com
3   Bob Wilson   35      Chicago    bob@email.com
4   Jane Smith   30  Los Angeles   jane@email.com
5  Alice Brown   28       Boston  alice@email.com


In [13]:
# query = "drop duplicated rows"
# result = duplicates(test_df, query)

Removed 2 duplicate rows using 'first' strategy
Dataset reduced from 6 to 4 rows
Duplicated rows have been removed, keeping the first occurrence of each unique row.
