# Find duplicates with `.duplicated()`

### Import Pandas

In [4]:
import pandas as pd

### Read CSV

In [5]:
# Load the employee CSV and parse the "start_date" column as datetimes
df = pd.read_csv(
    "employee.csv",
    parse_dates=["start_date"],
)
df.head()

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
0,Haleigh,Calderhead,334473,2020-05-09,,True,management
1,Coretta,McEvon,637457,2020-03-20,Male,False,engineering
2,Clarette,Tarbett,977749,2020-11-22,Agender,True,engineering
3,Jaime,Gianneschi,253523,2020-09-02,Bigender,False,marketing
4,Ediva,Skelton,325185,2020-02-04,Female,True,marketing


In [6]:
# Sort by first_name so rows that share a name end up next to each other.
# The sorted frame is rebound to `df` rather than sorted with inplace=True:
# inplace gives no performance benefit and prevents method chaining.
df = df.sort_values("first_name")
df.head()

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
872,Aaron,Caddens,583629,2020-10-05,Female,True,marketing
619,Aaron,Pohls,188614,2021-01-29,Male,True,marketing
74,Abbi,Waby,433227,2020-04-13,Polygender,True,marketing
535,Abby,August,280480,2020-08-10,Polygender,True,data analytics
898,Adaline,Cohalan,260075,2020-07-30,Polygender,True,engineering


In [7]:
# The sorted column that will be checked for repeated values
first_names = df["first_name"]
first_names

872      Aaron
619      Aaron
74        Abbi
535       Abby
898    Adaline
        ...   
550    Zachary
540       Zara
675    Zebulon
783       Zita
476       Zita
Name: first_name, Length: 1000, dtype: object

In [8]:
# Boolean Series: True for every repeat of a name that has already appeared.
# keep="first" is the default, so the first occurrence itself stays False.
df["first_name"].duplicated(keep="first")

872    False
619     True
74     False
535    False
898    False
       ...  
550    False
540    False
675    False
783    False
476     True
Name: first_name, Length: 1000, dtype: bool

In [10]:
# With the default keep="first", the first occurrence of each name is NOT
# flagged, so only the later repeats are selected here
is_repeat = df["first_name"].duplicated()
df.loc[is_repeat]

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
619,Aaron,Pohls,188614,2021-01-29,Male,True,marketing
83,Adrianne,Charte,573994,2020-07-13,Male,True,management
850,Aluino,Reyner,419740,2020-06-07,Polygender,False,management
988,Ariana,Culverhouse,901497,2020-12-04,Female,False,engineering
751,Arin,Pealing,190653,2020-12-01,Bigender,True,data analytics
822,Augustine,Yukhnini,374683,2020-05-22,Non-binary,True,marketing
811,Bernie,Asty,864231,2020-04-25,Genderfluid,True,marketing
163,Bethany,Carthew,612868,2020-02-25,Female,True,data analytics
61,Bevon,Chaman,183298,2020-06-29,Genderfluid,False,management
456,Blithe,Gay,338102,2020-02-09,Male,True,marketing


In [12]:
# Options for `keep`:
#   "first" -> flag every occurrence except the first (default)
#   "last"  -> flag every occurrence except the last
#   False   -> flag every occurrence of a repeated value
all_dupes = df["first_name"].duplicated(
    keep=False,
)

In [13]:
# Every row whose first_name occurs more than once (first occurrences included)
df.loc[all_dupes]

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
956,Sutherland,Loadwick,120513,2020-02-12,Polygender,False,engineering
665,Sylvan,Humfrey,947719,2020-08-10,Male,True,engineering
115,Sylvan,Thorneley,697506,2020-07-19,Male,False,engineering
556,Tamqrah,Levick,294374,2020-06-24,Bigender,False,management
491,Tamqrah,MacCard,893986,2020-07-07,Female,False,engineering
406,Towny,Roback,932998,2020-10-05,Female,False,marketing
319,Towny,Tippings,662192,2020-10-07,Female,True,data analytics
229,Tressa,Ixer,625357,2020-10-14,Female,True,engineering
307,Tressa,Pobjoy,611694,2020-08-22,Male,False,engineering
304,Wandie,Elverstone,355554,2021-01-28,Female,False,marketing


In [14]:
# Last 20 rows among the names that occur more than once
dupe_rows = df[all_dupes]
dupe_rows.tail(20)

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
956,Sutherland,Loadwick,120513,2020-02-12,Polygender,False,engineering
665,Sylvan,Humfrey,947719,2020-08-10,Male,True,engineering
115,Sylvan,Thorneley,697506,2020-07-19,Male,False,engineering
556,Tamqrah,Levick,294374,2020-06-24,Bigender,False,management
491,Tamqrah,MacCard,893986,2020-07-07,Female,False,engineering
406,Towny,Roback,932998,2020-10-05,Female,False,marketing
319,Towny,Tippings,662192,2020-10-07,Female,True,data analytics
229,Tressa,Ixer,625357,2020-10-14,Female,True,engineering
307,Tressa,Pobjoy,611694,2020-08-22,Male,False,engineering
304,Wandie,Elverstone,355554,2021-01-28,Female,False,marketing


In [19]:
# ~ (tilde) inverts a boolean mask element-wise.
# Note: despite its name, mask_dupes is True for names that are NOT duplicated,
# i.e. first names that occur exactly once.
occurs_more_than_once = df["first_name"].duplicated(keep=False)
mask_dupes = ~occurs_more_than_once

In [20]:
# Inverted keep=False mask: True marks rows whose first_name occurs exactly once
mask_dupes

872    False
619    False
74      True
535     True
898     True
       ...  
550     True
540     True
675     True
783    False
476    False
Name: first_name, Length: 1000, dtype: bool

In [21]:
# Rows whose first_name occurs exactly once 🎉
# (names that repeat are dropped entirely, not reduced to one row each)
df.loc[mask_dupes]

Unnamed: 0,first_name,last_name,salary,start_date,gender,remote,team
74,Abbi,Waby,433227,2020-04-13,Polygender,True,marketing
535,Abby,August,280480,2020-08-10,Polygender,True,data analytics
898,Adaline,Cohalan,260075,2020-07-30,Polygender,True,engineering
47,Adara,Wakes,727063,2020-05-04,Genderfluid,False,management
851,Adham,Bennough,907358,2020-06-24,Genderfluid,True,data analytics
...,...,...,...,...,...,...,...
995,Ysabel,Stollmeyer,649625,2020-10-18,Polygender,False,management
586,Yvette,Vasilischev,300495,2020-03-24,Genderfluid,True,management
550,Zachary,Herity,925381,2020-10-23,Non-binary,False,data analytics
540,Zara,Carbin,848862,2021-01-26,Non-binary,True,data analytics
