# How do I find and remove duplicated rows in pandas?

In [1]:
import pandas as pd

In [2]:
# Column labels for the dataset; the file is read with header=None,
# so the names are supplied here instead of being taken from the file.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Read the pipe-delimited file and use 'user_id' as the row index.
users = pd.read_table(
    'http://bit.ly/movieusers',
    sep='|',
    header=None,
    names=user_cols,
    index_col='user_id',
)

In [3]:
# Preview the first five rows of the loaded frame (n=5 is the default).
users.head(n=5)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [9]:
# pandas.Series.duplicated() returns a boolean Series that flags repeated values;
# summing it counts them, because each True counts as 1.
print(users.shape)  # dimensions of users: (rows, columns)
users['zip_code'].duplicated().sum()  # number of zip codes already seen in an earlier row

(943, 4)


148

In [10]:
# pandas.DataFrame.duplicated() flags rows that repeat an earlier row in every column.
print(users.shape)  # dimensions of users: (rows, columns)
users.duplicated(keep='first').sum()  # keep='first' is the default; counts the repeated rows

(943, 4)


7

In [11]:
# Select the duplicate rows by passing the boolean mask to .loc[].
# By default (keep='first') the first occurrence in each set of duplicates is marked False,
# so only the later repeats are returned. keep='last' marks every occurrence except the
# last one instead, and keep=False marks all occurrences as duplicates.
users.loc[users.duplicated(), :]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [13]:
# drop_duplicates() returns a frame without the repeated rows.
print(users.shape)  # dimensions before dropping
users.drop_duplicates(keep='first').shape  # keep='first' is the default; dimensions after dropping

(943, 4)


(936, 4)

In [14]:
# The subset parameter restricts the duplicate check to the listed columns.
print(users.shape)  # dimensions before dropping
users.drop_duplicates(subset=['age', 'zip_code'], keep='first').shape  # dimensions after dropping

(943, 4)


(927, 4)

In [15]:
# By default, the inplace parameter of .drop_duplicates() is False, so it returns a new
# DataFrame and leaves users untouched.
users.shape # same dimensions as before the drop_duplicates() calls

(943, 4)