# In [1]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import recordlinkage
from recordlinkage.index import Block
import warnings
warnings.filterwarnings('ignore')

class MLDeduplicator:
    """
    Advanced deduplication using machine learning to identify similar records.

    Features:
    - Handles both text and numeric data
    - Configurable similarity thresholds
    - Nearest-neighbour candidate generation on combined features
    - Interactive duplicate review

    Record ids reported by this class (``id1`` / ``id2``) are 0-based row
    positions in the input dataframe, not index labels, so dataframes with
    any kind of index are supported.
    """

    # Column name -> dtype of the similarity table. Kept in one place so an
    # empty result exposes the same, correctly typed columns as a full one.
    _RESULT_DTYPES = {
        'id1': 'int64',
        'id2': 'int64',
        'feature_similarity': 'float64',
        'text_similarity': 'float64',
        'numeric_similarity': 'float64',
        'is_duplicate': 'bool',
    }

    def __init__(self, n_neighbors=5, text_similarity_threshold=0.85, numeric_similarity_threshold=0.9):
        """
        Parameters:
            n_neighbors (int): Nearest neighbours considered per record
            text_similarity_threshold (float): Minimum mean text similarity
                for a pair to be auto-flagged
            numeric_similarity_threshold (float): Minimum mean numeric
                similarity for a pair to be auto-flagged
        """
        self.n_neighbors = n_neighbors
        self.text_sim_thresh = text_similarity_threshold
        self.num_sim_thresh = numeric_similarity_threshold

    @classmethod
    def _empty_results(cls):
        """Return a similarity table with zero rows and typed columns"""
        return pd.DataFrame({
            name: pd.Series(dtype=dtype) for name, dtype in cls._RESULT_DTYPES.items()
        })

    @staticmethod
    def _make_text_vectorizer():
        """Build the character n-gram TF-IDF vectorizer used for every text column"""
        # Character n-grams tolerate typos and abbreviations. `stop_words` is
        # deliberately not set: it only applies to the word analyzer.
        return TfidfVectorizer(analyzer='char', ngram_range=(2, 4))

    @staticmethod
    def _clean_text(series):
        """Return the column as strings with missing values replaced by ''"""
        # TfidfVectorizer rejects NaN documents, so normalise them up front.
        return series.fillna('').astype(str)

    @staticmethod
    def _numeric_similarity(value_a, value_b, value_range):
        """
        Range-normalised closeness of two numbers.

        Returns None when either value is missing, 1.0 when the column has no
        spread (value_range == 0), otherwise 1 - |a - b| / value_range.
        """
        if pd.isna(value_a) or pd.isna(value_b):
            return None
        if value_range == 0:
            # Constant column: every present value is identical.
            return 1.0
        return float(1 - abs(value_a - value_b) / value_range)

    def _preprocess_data(self, df, text_cols, numeric_cols):
        """Create feature vectors for similarity comparison"""
        text_cols = list(text_cols)
        numeric_cols = list(numeric_cols)

        # Work on a copy so the imputation below never touches the caller's data.
        features = df[text_cols + numeric_cols].copy()
        for col in text_cols:
            features[col] = self._clean_text(features[col])
        for col in numeric_cols:
            # The neighbour search cannot handle NaN: impute with the column
            # mean, falling back to 0 when the whole column is missing.
            features[col] = features[col].fillna(features[col].mean()).fillna(0.0)

        # A text vectorizer expects a 1-D sequence of documents, so each text
        # column gets its own transformer selected by a scalar column name.
        transformers = [
            (f'text_{position}', self._make_text_vectorizer(), col)
            for position, col in enumerate(text_cols)
        ]
        if numeric_cols:
            transformers.append(('numeric', StandardScaler(), numeric_cols))

        return ColumnTransformer(transformers).fit_transform(features)

    def _find_candidate_pairs(self, df, text_cols, numeric_cols):
        """
        Find potential duplicate pairs using nearest neighbors.

        Returns:
            list: Sorted (i, j) row-position pairs with i < j
            matrix: Feature matrix the pairs were derived from
        """
        X = self._preprocess_data(df, text_cols, numeric_cols)

        # A record can have at most n_samples - 1 neighbours other than itself.
        k = min(self.n_neighbors, X.shape[0] - 1)
        if k < 1:
            return [], X

        nn = NearestNeighbors(n_neighbors=k, metric='cosine')
        nn.fit(X)
        # Querying without arguments returns neighbours of the fitted points
        # and does not count a point as its own neighbour.
        _, indices = nn.kneighbors()

        # Store every pair as (lower, higher) so that (a, b) and (b, a)
        # collapse and the earlier record is always `id1`.
        pairs = {
            (min(i, int(j)), max(i, int(j)))
            for i, neighbours in enumerate(indices)
            for j in neighbours
        }

        return sorted(pairs), X

    def _calculate_similarities(self, pairs, X, df, text_cols, numeric_cols):
        """
        Calculate detailed similarity metrics for candidate pairs.

        Parameters:
            pairs (list): (i, j) row-position pairs to score
            X: Feature matrix whose rows align with the rows of df
            df (pd.DataFrame): Input dataframe
            text_cols (list): Text columns to compare
            numeric_cols (list): Numeric columns to compare

        Returns:
            pd.DataFrame: One row per pair with the similarity scores and the
            `is_duplicate` verdict. A column kind that was not supplied scores
            0 and is ignored by the verdict.
        """
        if not pairs:
            return self._empty_results()

        # Fit one TF-IDF model per text column over the whole column so both
        # records of a pair are embedded in the same vocabulary.
        text_matrices = [
            self._make_text_vectorizer().fit_transform(self._clean_text(df[col]))
            for col in text_cols
        ]
        # Column values and value range are loop-invariant: compute them once.
        numeric_data = [
            (df[col].to_numpy(), df[col].max() - df[col].min())
            for col in numeric_cols
        ]

        results = []
        for i, j in pairs:
            # Overall feature similarity (X[[i]] keeps the row 2-D for both
            # dense and sparse matrices)
            feature_sim = float(cosine_similarity(X[[i]], X[[j]])[0][0])

            # Text similarity (average of text columns)
            text_sim = 0
            if text_matrices:
                text_sim = float(np.mean([
                    cosine_similarity(matrix[[i]], matrix[[j]])[0][0]
                    for matrix in text_matrices
                ]))

            # Numeric similarity (average of numeric columns with both values present)
            num_sim = 0
            if numeric_data:
                scores = [
                    self._numeric_similarity(values[i], values[j], value_range)
                    for values, value_range in numeric_data
                ]
                scores = [score for score in scores if score is not None]
                num_sim = float(np.mean(scores)) if scores else 0

            # Only the column kinds that were actually supplied can veto a match.
            text_ok = not text_matrices or text_sim >= self.text_sim_thresh
            numeric_ok = not numeric_data or num_sim >= self.num_sim_thresh

            results.append({
                'id1': i,
                'id2': j,
                'feature_similarity': feature_sim,
                'text_similarity': text_sim,
                'numeric_similarity': num_sim,
                'is_duplicate': bool(text_ok and numeric_ok)
            })

        return pd.DataFrame(results, columns=list(self._RESULT_DTYPES))

    def deduplicate(self, df, text_cols=None, numeric_cols=None):
        """
        Identify and flag duplicate records based on feature similarity.

        The input dataframe is modified in place: a boolean `is_duplicate`
        column is added (or overwritten). For each auto-flagged pair the
        record at the higher row position is the one marked as duplicate.

        Parameters:
            df (pd.DataFrame): Input dataframe
            text_cols (list): List of text columns to compare
            numeric_cols (list): List of numeric columns to compare

        Returns:
            pd.DataFrame: Original dataframe with duplicate flags
            pd.DataFrame: Similarity matrix of potential duplicates

        Raises:
            ValueError: If neither text_cols nor numeric_cols is provided
        """
        text_cols = list(text_cols) if text_cols is not None else []
        numeric_cols = list(numeric_cols) if numeric_cols is not None else []
        if not text_cols and not numeric_cols:
            raise ValueError("Provide at least one column in text_cols or numeric_cols")

        flags = np.zeros(len(df), dtype=bool)

        # Fewer than two records cannot contain a duplicate pair.
        if len(df) < 2:
            df['is_duplicate'] = flags
            return df, self._empty_results()

        # Find candidate pairs and preprocess data
        pairs, X = self._find_candidate_pairs(df, text_cols, numeric_cols)

        # Calculate detailed similarities
        similarity_df = self._calculate_similarities(pairs, X, df, text_cols, numeric_cols)

        # Mark duplicates in original dataframe. Ids are row positions, so the
        # flags are written positionally instead of through the index labels.
        duplicate_positions = similarity_df.loc[
            similarity_df['is_duplicate'], 'id2'
        ].to_numpy(dtype=int)
        flags[duplicate_positions] = True
        df['is_duplicate'] = flags

        return df, similarity_df

    def interactive_review(self, df, similarity_df, review_threshold=0.7):
        """
        Interactive tool to review and confirm duplicates.

        Prompts on stdin for every pair whose feature similarity exceeds
        `review_threshold` and that was not auto-flagged. Answering 'y' marks
        the second record as duplicate in df (in place); 'skip' ends the
        review, leaving the remaining pairs untouched; any other answer moves
        on to the next pair.

        Parameters:
            df (pd.DataFrame): Dataframe the similarity ids refer to
            similarity_df (pd.DataFrame): Pair table as returned by deduplicate
            review_threshold (float): Minimum feature similarity worth reviewing

        Returns:
            pd.DataFrame: df with the confirmed duplicates flagged
        """
        print(f"Found {len(similarity_df)} potential duplicate pairs")
        print(f"Auto-flagged {similarity_df['is_duplicate'].sum()} as duplicates")

        if 'is_duplicate' not in df.columns:
            df['is_duplicate'] = False
        flag_position = df.columns.get_loc('is_duplicate')

        # Filter for high similarity but not auto-flagged
        auto_flagged = similarity_df['is_duplicate'].astype(bool)
        needs_review = (similarity_df['feature_similarity'] > review_threshold) & ~auto_flagged
        review_df = similarity_df[needs_review].sort_values('feature_similarity', ascending=False)

        for row in review_df.itertuples(index=False):
            first, second = int(row.id1), int(row.id2)

            print("\n" + "="*50)
            print(f"Pair {first} and {second}")
            print("Similarity Scores:")
            print(f"  Feature: {row.feature_similarity:.2f}")
            print(f"  Text: {row.text_similarity:.2f}")
            print(f"  Numeric: {row.numeric_similarity:.2f}")

            print("\nRecord 1:")
            print(df.iloc[first])
            print("\nRecord 2:")
            print(df.iloc[second])

            response = input("\nAre these duplicates? (y/n/skip): ").strip().lower()
            if response == 'y':
                # Positional write, matching the positional reads above.
                df.iloc[second, flag_position] = True
            elif response == 'skip':
                break

        return df

# Example Usage
if __name__ == "__main__":
    # Toy customer table, written one record per row. Neighbouring rows are
    # intentionally near-identical so there is something to detect.
    columns = ['customer_id', 'name', 'address', 'age', 'join_date']
    rows = [
        (1, 'John Smith', '123 Main St', 35, '2020-01-15'),
        (2, 'Jon Smith', '123 Main Street', 35, '2020-01-15'),
        (3, 'John Smyth', '123 Main St', 36, '2020-01-16'),
        (4, 'Alice Johnson', '456 Oak Ave', 28, '2019-05-20'),
        (5, 'A. Johnson', '456 Oak Avenue', 28, '2019-05-20'),
        (6, 'Robert Brown', '789 Pine Rd', 42, '2021-02-10'),
        (7, 'Bob Brown', '789 Pine Road', 41, '2021-02-11'),
        (8, 'Robert Browning', '789 Pines Rd', 42, '2021-02-10'),
        (9, 'Mary Wilson', '321 Elm Dr', 55, '2018-11-05'),
        (10, 'Marie Wilson', '321 Elm Drive', 54, '2018-11-05'),
    ]
    # Transpose the rows back into the column-oriented mapping.
    data = {column: list(values) for column, values in zip(columns, zip(*rows))}
    df = pd.DataFrame(data)

    # Configure the deduplicator with thresholds looser than its defaults.
    deduper = MLDeduplicator(
        text_similarity_threshold=0.8,
        numeric_similarity_threshold=0.85,
        n_neighbors=3,
    )

    # Compare names and addresses as text, ages as numbers.
    text_cols, numeric_cols = ['name', 'address'], ['age']
    deduped_df, similarity_df = deduper.deduplicate(
        df, text_cols=text_cols, numeric_cols=numeric_cols
    )

    # Let the user confirm borderline pairs on the console.
    final_df = deduper.interactive_review(deduped_df, similarity_df)

    # Summarise the outcome.
    print("\nFINAL DUPLICATE COUNTS:")
    print(final_df['is_duplicate'].value_counts())

    print("\nDUPLICATE RECORDS:")
    print(final_df.loc[final_df['is_duplicate']])




# Cell output: ModuleNotFoundError: No module named 'recordlinkage'
# (the `recordlinkage` package must be installed for the imports above to succeed)