# Identifying Duplicate Rows

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Sample data for the duplicate-detection examples below:
#   * rows 0 and 1 are identical in every column (a full-row duplicate)
#   * row 2 repeats the product "Dairy" but has a different price
#   * rows 3 and 4 are unique
product_name = [
    'Dairy',
    'Dairy',
    'Dairy',
    'Vegetables',
    'Fruits',
]
product_price = [
    2.56,
    2.56,
    4.55,
    2.74,
    5.44,
]

# One column per list; the row index is the default 0..4 RangeIndex.
product_df = pd.DataFrame(
    data={
        'product': product_name,
        'price': product_price,
    }
)
product_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetables,2.74
4,Fruits,5.44


### Quick spot check for duplicate values in rows

In [5]:
product_df.shape # One way to quickly scan for duplicates is to compare the number of rows to the number of unique values in each column.
                 # Here .shape tells us that product_df has 5 rows (and 2 columns).

(5, 2)

In [6]:
product_df.nunique() # .nunique() counts the distinct values per column: the product column has only 3 unique values and price has 4.
                     # Both are fewer than the 5 rows, so some values must be repeated across rows.

product    3
price      4
dtype: int64

### The .duplicated() method identifies duplicate rows of data
* Specify subset=column(s) to look for duplicates across a subset of columns

In [7]:
product_df.duplicated() # The .duplicated() method returns True for the second row (index 1) because it is a duplicate of the first row (index 0).
                        # Notice that it is fully duplicated, including the value in the price column,
                        # while the third row (index 2) returns False because its price differs.

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [8]:
product_df.duplicated(subset='product') # Specifying subset='product' will only look for duplicates in that column.
                                        # In this case the second and third rows (index 1 and 2) are duplicates of the first row (index 0, "Dairy").

0    False
1     True
2     True
3    False
4    False
dtype: bool

### The .drop_duplicates() method drops duplicate rows from a DataFrame
* Specify subset=column(s) to look for duplicates across a subset of columns

In [9]:
# Display the original 5-row DataFrame again as a reference point before dropping duplicates.
product_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetables,2.74
4,Fruits,5.44


In [12]:
product_df.drop_duplicates() # We can see that the second row (index 1) is dropped, while the third row (index 2) remains because its product/price combination is unique.
                             # Also note that the row index now has a gap between 0 & 2

Unnamed: 0,product,price
0,Dairy,2.56
2,Dairy,4.55
3,Vegetables,2.74
4,Fruits,5.44


### We can be more specific with this method

In [26]:
# Because we specified subset="product", a row is now judged to be a duplicate
# whenever its "product" value repeats. The result has gone from 5 rows to 3 rows
# and only one Dairy row remains.
#
# keep= tells the method which occurrence of each duplicate to keep: "first" or
# "last" (the default is "first"). Because "last" was specified, the first two
# Dairy rows were dropped and the last one (price 4.55) was kept.
# Try changing "last" to "first" and compare the results.
#
# ignore_index=True resets the index of the result so there are no gaps.
product_df.drop_duplicates(subset="product", keep="last", ignore_index=True)

Unnamed: 0,product,price
0,Dairy,4.55
1,Vegetables,2.74
2,Fruits,5.44
