In [7]:
# Removing duplicate rows is a very common task because real-world data is often "dirty" and contains the same record multiple times.

import pandas as pd

# Sample employee records that deliberately contain repeated entries.
# Each key becomes a column; position i across the three lists forms row i.
data = dict(
    Name=["Bilal", "Afan", "Bilal", "Bilal", "Saqib"],
    Department=["IT", "HR", "IT", "IT", "Finance"],
    Salary=[50000, 60000, 50000, 55000, 45000],
)

df = pd.DataFrame(data=data)

# Heading and table are emitted by one print call, separated by a newline.
print("--- Original Dirty Data ---", df, sep="\n")

# What to notice in the table:
#   * Exact duplicate: Rows 0 and 2 match in every column.
#   * Partial duplicate: Rows 0 and 3 are the same person ("Bilal") but the
#     Salary differs (possibly an old record vs. a new record).

--- Original Dirty Data ---
    Name Department  Salary
0  Bilal         IT   50000
1   Afan         HR   60000
2  Bilal         IT   50000
3  Bilal         IT   55000
4  Saqib    Finance   45000


In [8]:
# Finding duplicates with .duplicated()
# Inspect what would be removed before deleting anything.
# .duplicated() yields one boolean per row: True marks a row that repeats an
# earlier row, False marks a row that has not been seen before.
# With no `subset` argument, ALL columns take part in the comparison.

# (a) The raw True/False flags, one per row.
#     keep="first" is the default, spelled out: the first occurrence is not flagged.
print(df.duplicated(keep="first"))

# (b) Use the same flags as a filter to see the ACTUAL repeated rows.
#     Row 2 (the second "Bilal") is returned because it is an exact copy of Row 0.
#     Row 3 is NOT returned: its Salary (55000) makes the row as a whole unique.
duplicates = df[df.duplicated(keep="first")]

print("\n--- Exact Duplicates Found ---", duplicates, sep="\n")

0    False
1    False
2     True
3    False
4    False
dtype: bool

--- Exact Duplicates Found ---
    Name Department  Salary
2  Bilal         IT   50000


In [9]:
# Removing exact duplicates with .drop_duplicates()
# The standard cleanup call: a row is dropped only when every one of its
# columns matches another row.

# keep="first" is the default, written out explicitly here: the first
# occurrence survives and the later copies are discarded.
# The result is a new frame assigned to df_clean.
df_clean = df.drop_duplicates(keep="first")

print("--- After Removing Exact Matches ---", df_clean, sep="\n")

--- After Removing Exact Matches ---
    Name Department  Salary
0  Bilal         IT   50000
1   Afan         HR   60000
3  Bilal         IT   55000
4  Saqib    Finance   45000


In [10]:
# Removing partial duplicates with the `subset` parameter
# Scenario: you want one row per employee. A differing salary does not matter;
# two rows with the same Name count as duplicates of each other.

# subset=["Column_Name"] restricts the comparison to the listed columns.

# Only the Name column is compared, so every extra 'Bilal' row is dropped
# regardless of its salary; the first occurrence of each name is kept.
# The surviving rows are then renumbered 0..n-1.
df_unique_names = (
    df
    .drop_duplicates(subset=["Name"], keep="first")
    .reset_index(drop=True)
)

print("--- After Removing Duplicates based on Name ---", df_unique_names, sep="\n")

--- After Removing Duplicates based on Name ---
    Name Department  Salary
0  Bilal         IT   50000
1   Afan         HR   60000
2  Saqib    Finance   45000


In [11]:
# Controlling which row stays: the `keep` parameter
# Among a group of duplicates, `keep` decides which row survives:
#   keep='first' (default): the one on top stays.
#   keep='last': the one at the bottom stays (useful if the bottom row has the
#                most recent/updated data).
#   keep=False: ALL rows of the group are deleted (useful if you want to
#               exclude anyone who has a duplicate).
# Example: keep the LAST entry for Bilal because that row has his updated
# salary (55,000).

# This cell builds its own frame from the same sample records, so it does not
# rely on frames created in other cells.
emp_data = dict(
    Name=["Bilal", "Afan", "Bilal", "Bilal", "Saqib"],
    Department=["IT", "HR", "IT", "IT", "Finance"],
    Salary=[50000, 60000, 50000, 55000, 45000],
)

dff = pd.DataFrame(data=emp_data)

# Compare on "Name" only and keep the last occurrence of each name:
# for "Bilal" that is Row 3, so Rows 0 and 2 are dropped.
# The original row labels are retained in the result.
dff_latest = dff.drop_duplicates(
    subset=["Name"],
    keep="last",
)

print("--- Keeping only the Last Entry ---", dff_latest, sep="\n")

--- Keeping only the Last Entry ---
    Name Department  Salary
1   Afan         HR   60000
3  Bilal         IT   55000
4  Saqib    Finance   45000
