# Products Data Cleaning

## Contents

### 01 Importing Libraries

### 02 Importing Data

### 03 Checking Data

### 04 Wrangling Data

#### 01 Data Types

### 05 Data Consistency Checks

#### 01 Missing Values

#### 02 Duplicate Values

### 06 Exporting Dataframe

## 01 Importing Libraries

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import os

## 02 Importing Data

In [3]:
#creating path to the project folder
#raw string: each backslash is written once (doubling them inside r'' keeps both characters)
path = r'C:\Users\samac\Instacart Basket Analysis'

In [5]:
#importing products.csv from the original data folder
#index_col=False stops pandas from using the first column as the index
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

## 03 Checking Data

In [7]:
#checking df_prods shape (rows, columns)
df_prods.shape

(49693, 5)

In [9]:
#previewing the first records of df_prods
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


## 04 Wrangling Data

### 01 Data Types

In [12]:
#checking data types in df_prods
#the three *_id columns load as int64 and are converted to strings in the next cells
df_prods.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [15]:
#casting 'product_id' from int to string
#it is an identifier, so it should not be treated as a numeric quantity
df_prods['product_id'] = df_prods['product_id'].astype(str)

In [17]:
#casting 'aisle_id' from int to string
#it is an identifier, so it should not be treated as a numeric quantity
df_prods['aisle_id'] = df_prods['aisle_id'].astype(str)

In [19]:
#casting 'department_id' from int to string
#it is an identifier, so it should not be treated as a numeric quantity
df_prods['department_id'] = df_prods['department_id'].astype(str)

In [21]:
#rechecking data types in df_prods
#the converted *_id columns should now be reported as object
df_prods.dtypes

product_id        object
product_name      object
aisle_id          object
department_id     object
prices           float64
dtype: object

## 05 Data Consistency Checks

### 01 Missing Values

In [23]:
#counting missing values per column in df_prods
#isna() is the alias of isnull(); sum() adds up the True flags in each column
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [25]:
# finding null product names
# isna() already returns a boolean mask, so no comparison with True is needed
df_nan = df_prods[df_prods['product_name'].isna()]

In [27]:
#displaying the records with missing product names
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [29]:
#creating new df without products with missing names
#notna() selects the rows whose product_name is present, replacing `isnull() == False`
df_prods_clean = df_prods[df_prods['product_name'].notna()]

In [31]:
#checking shape of df_prods_clean
#expected: the 16 rows with a missing product_name are gone (49693 - 16 = 49677)
df_prods_clean.shape

(49677, 5)

### 02 Duplicate Values

In [35]:
#collecting full-row duplicates in df_prods_clean
#duplicated() flags every row that repeats an earlier row across all columns
df_prods_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

df_prods_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [37]:
#dropping duplicates, keeping the first occurrence of each repeated row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [39]:
#checking shape of df_prods_clean_no_dups
#expected: the 5 duplicate rows are gone (49677 - 5 = 49672)
df_prods_clean_no_dups.shape

(49672, 5)

## 06 Exporting Dataframe

In [41]:
#exporting df_prods_clean_no_dups as final_products_wrangled.pkl in the prepared data folder
df_prods_clean_no_dups.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'final_products_wrangled.pkl')
)