In [1]:
import pandas as pd 

In [2]:
# Pipe-delimited user table; column names are supplied via `names`, so the
# first line of the file is read as data rather than as a header.
url = 'https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/u.user'

cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Read zip_code as text explicitly: values such as "02215" must keep their
# leading zero, and that should not depend on pandas' dtype inference.
users = pd.read_table(url, sep='|', names=cols, dtype={'zip_code': str})

# Display only: set_index returns a new DataFrame and the result is not
# assigned, so `users` keeps its default integer index and user_id column.
users.set_index('user_id')

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [3]:
# (rows, columns) of `users`; user_id is still one of the columns because the
# set_index result in the loading cell was displayed, not assigned.
users.shape

(943, 5)

In [5]:
# Identify rows whose zip_code repeats an earlier value:
# df['col_name'].duplicated() returns a boolean Series that is True when the
# same value has already appeared in ANY earlier row (not only the row
# directly above); the first occurrence of each value is marked False.

users['zip_code'].duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
938    False
939     True
940    False
941    False
942    False
Name: zip_code, Length: 943, dtype: bool

In [7]:
# Confirm the result type: duplicated() on a column returns a pandas Series
# (boolean, per the dtype shown in the previous output).
type(users['zip_code'].duplicated())

pandas.core.series.Series

In [8]:
# Because the result is a boolean Series we can total it with .sum(): each
# True counts as 1, so the sum is the number of flagged rows (148 duplicates).
# (.count() would instead return the number of non-null entries.)

users['zip_code'].duplicated().sum()

148

In [9]:
# DataFrame.duplicated() flags a row as True only when every column matches
# a row that appeared earlier in the frame (any earlier row, not just the one
# directly above).
users.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
938    False
939    False
940    False
941    False
942    False
Length: 943, dtype: bool

In [10]:
# Number of fully duplicated rows (True counts as 1); 0 means no row repeats
# an earlier row in every column.
users.duplicated().sum()

0

In [27]:
# Select the fully duplicated rows with the boolean mask; the result is
# empty because no row was flagged.
users.loc[users.duplicated(), :]

Unnamed: 0,user_id,age,gender,occupation,zip_code


In [13]:
# Show the rows flagged as zip_code duplicates (with the default
# keep='first', each first occurrence is left out of the selection).

users.loc[users['zip_code'].duplicated(), :]

Unnamed: 0,user_id,age,gender,occupation,zip_code
28,29,41,M,programmer,94043
51,52,18,F,student,55105
55,56,25,M,librarian,46260
83,84,32,M,executive,55369
91,92,32,M,entertainment,80525
...,...,...,...,...,...
927,928,21,M,student,55408
928,929,44,M,scientist,53711
933,934,61,M,engineer,22902
936,937,48,M,educator,98072


In [15]:
# keep='first' (the default)
# flag every repeat of a zip_code except its first occurrence

users.loc[users['zip_code'].duplicated(keep='first'), :]

Unnamed: 0,user_id,age,gender,occupation,zip_code
28,29,41,M,programmer,94043
51,52,18,F,student,55105
55,56,25,M,librarian,46260
83,84,32,M,executive,55369
91,92,32,M,entertainment,80525
...,...,...,...,...,...
927,928,21,M,student,55408
928,929,44,M,scientist,53711
933,934,61,M,engineer,22902
936,937,48,M,educator,98072


In [16]:
# keep='last'
# mark duplicates as True except for the last occurrence, i.e. the earlier
# repeats are flagged and the latest one is kept. This also flags 148 rows
# (943 - 795), just a different set of rows than keep='first'.
# this is useful for splitting the data

users.loc[users['zip_code'].duplicated(keep='last'), :]

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
6,7,57,M,administrator,91344
13,14,45,M,scientist,55106
14,15,49,F,educator,97301
...,...,...,...,...,...
799,800,25,M,programmer,55337
803,804,39,M,educator,61820
809,810,55,F,other,80526
853,854,29,F,student,55408


In [17]:
# keep=False
# flag every row whose zip_code occurs more than once, first and last
# occurrences included: the union of the two selections above

users.loc[users['zip_code'].duplicated(keep=False), :]

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
6,7,57,M,administrator,91344
13,14,45,M,scientist,55106
14,15,49,F,educator,97301
...,...,...,...,...,...
927,928,21,M,student,55408
928,929,44,M,scientist,53711
933,934,61,M,engineer,22902
936,937,48,M,educator,98072


In [24]:
# Dropping duplicates
# keep='first' keeps the first occurrence of each zip_code and drops the
# later repeats: 943 - 148 = 795 values remain.

users['zip_code'].drop_duplicates(keep='first').shape

(795,)

In [25]:
# keep='last' keeps the last occurrence of each zip_code and drops the
# earlier repeats; the number of remaining values is the same (795).
users['zip_code'].drop_duplicates(keep='last').shape

(795,)

In [26]:
# keep=False drops every zip_code that occurs more than once, so only the
# zip codes that appear exactly once remain.
users['zip_code'].drop_duplicates(keep=False).shape

(693,)