# Data Quality: Dealing with Duplicates & Redundancy

In [3]:
# Activity 2: Dealing with Duplicates & Redundancy

# Task A: Identifying Duplicate Records

# 7. Identify Complete Duplicates:
# - Load a dataset and identify duplicated rows.
# - Use Pandas to detect duplicates.






# 8. Identify Duplicates based on Specific Columns:
# - Check for duplicates in specified columns.







# 9. Count Duplicate Rows:
# - Calculate and print the number of duplicate rows.
import pandas as pd

def load_and_analyze_duplicates(file_path, duplicate_columns=None):
    """
    Load a CSV file and report its duplicate rows.

    Duplicates are detected either across entire rows or across a chosen
    subset of columns. The function prints a preview of the data, every row
    that belongs to a duplicate group, the number of redundant rows and the
    share of the dataset they represent.

    Parameters:
        file_path (str): Path to the dataset file; passed straight to
            ``pd.read_csv``.
        duplicate_columns (list | str | None): Column label(s) to compare.
            ``None`` or an empty list means "compare complete rows".

    Returns:
        pandas.DataFrame: All rows that are part of a duplicate group
        (first occurrences included), in their original row order.
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    print(f"\nLoaded dataset with {len(df)} rows and {len(df.columns)} columns")
    print("First 5 rows:")
    print(df.head())

    # 7. Decide what "duplicate" means. None and an empty selection are both
    # treated as a complete-row check so that detection, sorting and counting
    # below always agree with each other.
    if duplicate_columns is None or len(duplicate_columns) == 0:
        subset = None
        print("\n=== Checking for Complete Duplicate Rows ===")
    else:
        subset = duplicate_columns
        print(f"\n=== Checking for Duplicates in Columns: {duplicate_columns} ===")

    # keep=False flags every member of a duplicate group, not just the repeats
    duplicates = df[df.duplicated(subset=subset, keep=False)]

    # 8. Display duplicate records, sorted so that group members sit together
    if len(duplicates) > 0:
        print(f"\nFound {len(duplicates)} duplicate records (all instances shown):")
        sort_keys = subset if subset is not None else df.columns.tolist()
        print(duplicates.sort_values(sort_keys))
    else:
        print("\nNo duplicates found based on the specified criteria")

    # 9. Count duplicate rows; the default keep='first' leaves the first
    # occurrence of each group out of the count
    duplicate_count = int(df.duplicated(subset=subset).sum())
    total_rows = len(df)
    # A header-only file has zero rows; report 0% instead of dividing by zero
    duplicate_share = duplicate_count / total_rows if total_rows else 0.0

    print(f"\nTotal duplicate rows (excluding first occurrence): {duplicate_count}")
    print(f"Percentage of duplicates: {duplicate_share:.2%}")

    return duplicates

# Example Usage
if __name__ == "__main__":
    # Demo fixture written row by row: the Alice record appears three times
    # and the Bob record twice, identical in every field.
    column_names = ['CustomerID', 'Name', 'Email', 'JoinDate', 'PurchaseAmount']
    alice = (101, 'Alice', 'a@test.com', '2023-01-01', 100)
    bob = (102, 'Bob', 'b@test.com', '2023-02-01', 150)
    rows = [
        alice,
        bob,
        (103, 'Charlie', 'c@test.com', '2023-03-01', 200),
        alice,
        (104, 'David', 'd@test.com', '2023-04-01', 175),
        bob,
        (105, 'Eve', 'e@test.com', '2023-05-01', 225),
        alice,
    ]
    # Transpose the row tuples into a column -> values mapping
    data = {name: list(values) for name, values in zip(column_names, zip(*rows))}
    sample_df = pd.DataFrame(data)

    # In practice, point csv_path at your actual file instead of writing one
    csv_path = 'customer_data.csv'
    sample_df.to_csv(csv_path, index=False)

    # Whole-row comparison
    print("=== Analyzing Complete Duplicates ===")
    complete_dupes = load_and_analyze_duplicates(csv_path)

    # Comparison restricted to the Email column
    print("\n\n=== Analyzing Email Duplicates ===")
    email_dupes = load_and_analyze_duplicates(csv_path, duplicate_columns=['Email'])

    # Comparison restricted to the CustomerID column
    print("\n\n=== Analyzing CustomerID Duplicates ===")
    id_dupes = load_and_analyze_duplicates(csv_path, duplicate_columns=['CustomerID'])







=== Analyzing Complete Duplicates ===

Loaded dataset with 8 rows and 5 columns
First 5 rows:
   CustomerID     Name       Email    JoinDate  PurchaseAmount
0         101    Alice  a@test.com  2023-01-01             100
1         102      Bob  b@test.com  2023-02-01             150
2         103  Charlie  c@test.com  2023-03-01             200
3         101    Alice  a@test.com  2023-01-01             100
4         104    David  d@test.com  2023-04-01             175

=== Checking for Complete Duplicate Rows ===

Found 5 duplicate records (all instances shown):
   CustomerID   Name       Email    JoinDate  PurchaseAmount
0         101  Alice  a@test.com  2023-01-01             100
3         101  Alice  a@test.com  2023-01-01             100
7         101  Alice  a@test.com  2023-01-01             100
1         102    Bob  b@test.com  2023-02-01             150
5         102    Bob  b@test.com  2023-02-01             150

Total duplicate rows (excluding first occurrence): 3
Percentage o

In [4]:
# Task B: Deduplication Techniques

# 10. Remove Complete Duplicates:
# - Drop duplicate rows and keep only the first occurrence.






# 11. Subset Deduplication:
# - Remove duplicates based on a subset of columns.






# 12. Keep Last Occurrence:
# - Drop duplicates but keep the last occurrence in the dataset.
import pandas as pd

def demonstrate_deduplication(file_path, subset_cols=None):
    """
    Demonstrate various deduplication techniques on a dataset.

    Three strategies are applied to the loaded data, each result is written
    to a CSV file in the current working directory and a short report is
    printed:

    * complete-row deduplication keeping the first occurrence
      -> 'deduplicated_first.csv'
    * deduplication on a subset of columns
      -> 'deduplicated_subset.csv'
    * complete-row deduplication keeping the last occurrence
      -> 'deduplicated_last.csv'

    Parameters:
        file_path (str): Path to the dataset file; passed straight to
            ``pd.read_csv``.
        subset_cols (list | str | None): Column label(s) used for the subset
            deduplication. Defaults to ``['Email']`` when omitted.

    Returns:
        tuple: ``(df_first, df_subset, df_last)`` - the three deduplicated
        DataFrames, in the order listed above.

    Raises:
        KeyError: If any of ``subset_cols`` is not a column of the dataset.
            Raised before any output file is written.
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Normalise the subset selection to a list of column labels
    if subset_cols is None:
        subset_cols = ['Email']
    elif isinstance(subset_cols, str):
        subset_cols = [subset_cols]
    else:
        subset_cols = list(subset_cols)

    # Fail early so a bad column name does not leave partial output on disk
    missing_cols = [col for col in subset_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns not found in dataset: {missing_cols}")

    print(f"\nOriginal dataset shape: {df.shape}")
    print("Sample data:")
    print(df.head())

    # 10. Remove Complete Duplicates (keep first)
    df_first = df.drop_duplicates()
    print("\n=== 10. Complete Deduplication (Keep First) ===")
    print(f"Removed {len(df) - len(df_first)} duplicate rows")
    print(f"Resulting dataset shape: {df_first.shape}")
    df_first.to_csv('deduplicated_first.csv', index=False)
    print("Saved as 'deduplicated_first.csv'")

    # 11. Subset Deduplication
    df_subset = df.drop_duplicates(subset=subset_cols)
    print("\n=== 11. Subset Deduplication (Columns: {}) ===".format(', '.join(map(str, subset_cols))))
    print(f"Removed {len(df) - len(df_subset)} duplicate rows based on columns: {subset_cols}")
    print(f"Resulting dataset shape: {df_subset.shape}")
    df_subset.to_csv('deduplicated_subset.csv', index=False)
    print("Saved as 'deduplicated_subset.csv'")

    # 12. Keep Last Occurrence
    df_last = df.drop_duplicates(keep='last')
    print("\n=== 12. Complete Deduplication (Keep Last) ===")
    print(f"Removed {len(df) - len(df_last)} duplicate rows")
    print(f"Resulting dataset shape: {df_last.shape}")

    # Complete duplicates are identical in every column, so the two strategies
    # only differ in WHICH row labels get discarded; report those labels.
    print("\nComparison of first vs last occurrence keeping:")
    dropped_keep_first = df.index[df.duplicated(keep='first')].tolist()
    dropped_keep_last = df.index[df.duplicated(keep='last')].tolist()
    if dropped_keep_first:
        print(f"Rows dropped when keeping first: {dropped_keep_first}")
        print(f"Rows dropped when keeping last: {dropped_keep_last}")
    else:
        print("No complete duplicate rows found; both strategies keep every row")
    df_last.to_csv('deduplicated_last.csv', index=False)
    print("\nSaved as 'deduplicated_last.csv'")

    return df_first, df_subset, df_last

# Example Usage
if __name__ == "__main__":
    # Demo fixture written row by row. CustomerID, Name and (mostly) Email
    # repeat, but JoinDate/PurchaseAmount differ between the repeats.
    column_names = ['CustomerID', 'Name', 'Email', 'JoinDate', 'PurchaseAmount']
    rows = [
        (101, 'Alice', 'a@test.com', '2023-01-01', 100),
        (102, 'Bob', 'b@test.com', '2023-02-01', 150),
        (103, 'Charlie', 'c@test.com', '2023-03-01', 200),
        (101, 'Alice', 'a@test.com', '2023-01-15', 120),
        (104, 'David', 'd@test.com', '2023-04-01', 175),
        (102, 'Bob', 'b@test.com', '2023-02-15', 160),
        (105, 'Eve', 'e@test.com', '2023-05-01', 225),
        (101, 'Alice', 'a_new@test.com', '2023-01-20', 110),
    ]
    # Transpose the row tuples into a column -> values mapping
    data = {name: list(values) for name, values in zip(column_names, zip(*rows))}
    sample_df = pd.DataFrame(data)

    # In practice, point csv_path at your actual file instead of writing one
    csv_path = 'customer_data.csv'
    sample_df.to_csv(csv_path, index=False)

    # Run all three deduplication strategies against the saved file
    df_first, df_subset, df_last = demonstrate_deduplication(csv_path)







Original dataset shape: (8, 5)
Sample data:
   CustomerID     Name       Email    JoinDate  PurchaseAmount
0         101    Alice  a@test.com  2023-01-01             100
1         102      Bob  b@test.com  2023-02-01             150
2         103  Charlie  c@test.com  2023-03-01             200
3         101    Alice  a@test.com  2023-01-15             120
4         104    David  d@test.com  2023-04-01             175

=== 10. Complete Deduplication (Keep First) ===
Removed 0 duplicate rows
Resulting dataset shape: (8, 5)
Saved as 'deduplicated_first.csv'

=== 11. Subset Deduplication (Columns: Email) ===
Removed 2 duplicate rows based on columns: ['Email']
Resulting dataset shape: (6, 5)
Saved as 'deduplicated_subset.csv'

=== 12. Complete Deduplication (Keep Last) ===
Removed 0 duplicate rows
Resulting dataset shape: (8, 5)

Comparison of first vs last occurrence keeping:
Empty DataFrame
Columns: []
Index: []

Saved as 'deduplicated_last.csv'
