# Finding and removing duplicate rows

## Import libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Read the user table from `u.user` (age, gender, occupation and zip code,
# indexed by user_id). Column names are supplied explicitly via `names=`,
# so the first line of the file is treated as data, not as a header.
col_name = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
url = 'https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/u.user'

# Zip codes are identifiers, not quantities: read them as text so they are
# never reinterpreted as numbers.
data = pd.read_csv(url, sep='|', index_col='user_id', names=col_name,
                   dtype={'zip_code': str})

In [3]:
# Preview the first five rows to confirm the columns were parsed as expected.
data.head(n=5)

Unnamed: 0_level_0,age,gender,occuation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [4]:
# (number of rows, number of columns) of the loaded frame
data.shape

(943, 4)

### Check for duplicates in a column

In [5]:
# True where this zip code has already appeared in an earlier row;
# the first occurrence of each value stays False (keep='first' is the default).
data['zip_code'].duplicated(keep='first')

user_id
1      False
2      False
3      False
4      False
5      False
       ...  
939    False
940     True
941    False
942    False
943    False
Name: zip_code, Length: 943, dtype: bool

In [6]:
# Count the repeated zip codes: summing the boolean mask adds 1 for every
# True and 0 for every False.
d_zipcode = data.duplicated(subset=['zip_code']).sum()

print('Total no of duplicate:', d_zipcode )

Total no of duplicate: 148


In [7]:
# True where every column of the row matches some earlier row
# (not only the immediately preceding one); show the last five flags.
data.duplicated(keep='first').tail(n=5)

user_id
939    False
940    False
941    False
942    False
943    False
dtype: bool

In [8]:
# Number of rows that repeat an earlier row in full
# (first occurrences are not counted).
d_rows = data.duplicated(keep='first').sum()
print('Total no of duplicate  rows --->', d_rows)

Total no of duplicate  rows ---> 7


In [9]:
# Display the rows flagged as duplicates, using the boolean mask as a row filter.
data[data.duplicated()]

Unnamed: 0_level_0,age,gender,occuation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


#### Logic for duplicated:

- keep='first' (default): Mark duplicates as True except for the first occurrence.
- keep='last': Mark duplicates as True except for the last occurrence.
- keep=False: Mark all duplicates as True.

In [10]:
# Rows flagged with keep='first': every repeat is shown,
# while the first occurrence of each repeated row is left out.
data.loc[data.duplicated(keep='first')]

Unnamed: 0_level_0,age,gender,occuation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [11]:
# Rows flagged with keep='last': every occurrence except the last one
# of each repeated row is shown.
data[data.duplicated(keep='last')]

Unnamed: 0_level_0,age,gender,occuation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630


In [12]:
# Rows flagged with keep=False: all occurrences of any repeated row,
# including the first and the last.
data.loc[data.duplicated(keep=False)]

Unnamed: 0_level_0,age,gender,occuation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402


### Drop duplicates

In [13]:
# Drop repeated rows, keeping the first occurrence of each.
# With inplace=False (the default) a new frame is returned and `data` is untouched.
data.drop_duplicates(keep='first', inplace=False).shape

(936, 4)

In [14]:
# Drop repeated rows, keeping the last occurrence of each.
# With inplace=False (the default) a new frame is returned and `data` is untouched.
data.drop_duplicates(keep='last', inplace=False).shape

(936, 4)

In [15]:
# Drop every occurrence of any repeated row, so no copy of it is kept.
data.drop_duplicates(keep=False, inplace=False).shape

(929, 4)

In [22]:
# Count rows whose (zip_code, age) pair already appeared in an earlier row;
# the remaining columns are ignored for the comparison.
no_of_duplicate = data[['zip_code', 'age']].duplicated().sum()

print('Total no of duplicate rows -->', no_of_duplicate)

Total no of duplicate rows --> 16


In [21]:
# Drop rows (not columns) whose (age, zip_code) pair repeats an earlier row,
# keeping the first occurrence of each pair.
data.drop_duplicates(subset=['age', 'zip_code'], keep='first').shape

(927, 4)