# Orders Data Cleaning

## Contents

### 01 Importing Libraries

### 02 Importing Data

### 03 Checking Data

### 04 Wrangling Data

#### 01 Columns

#### 02 Data Types

### 05 Checking Data Consistency

#### 01 Missing Values

#### 02 Duplicate Values

### 06 Exporting Dataframe

## 01 Importing Libraries

In [11]:
#importing libraries
import pandas as pd
import numpy as np
import os

## 02 Importing Data

In [13]:
#creating path to the project root folder
#inside a raw string (r'...') a doubled backslash stays doubled, so single backslashes are used
#NOTE: this is a machine-specific absolute path; adjust it when running on another machine
path = r'C:\Users\samac\Instacart Basket Analysis'

In [15]:
#loading orders.csv from the '02 Data' > 'Original Data' folder under the project path
#index_col = False stops pandas from using the first CSV column as the row index
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

## 03 Checking Data

In [27]:
#checking df_ords shape: displays the (row count, column count) tuple
df_ords.shape

(3421083, 6)

In [18]:
#checking df_ords records: previews the first rows (head() defaults to 5)
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


## 04 Wrangling Data

### 01 Columns

In [20]:
#removing the 'eval_set' column; drop() returns a new frame, so df_ords is rebound to it
df_ords = df_ords.drop('eval_set', axis = 'columns')

In [22]:
#renaming 'order_dow' to 'orders_day_of_week'
#rename() returns a new frame; rebinding df_ords avoids mutating the frame with inplace = True
df_ords = df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'})

In [24]:
#checking column titles: displays the current column index of df_ords
df_ords.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

### 02 Data Types

In [31]:
#checking data types in df_ords: lists the dtype of every column
df_ords.dtypes

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [33]:
#converting the 'order_id' column from int to string; all other columns keep their dtypes
df_ords = df_ords.astype({'order_id' : 'str'})

In [35]:
#converting the 'user_id' column from int to string; all other columns keep their dtypes
df_ords = df_ords.astype({'user_id' : 'str'})

In [37]:
#rechecking data types in df_ords to confirm the 'order_id' and 'user_id' conversions
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

## 05 Checking Data Consistency

### 01 Missing Values

In [42]:
#counting missing values per column in df_ords
df_ords.isna().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [44]:
#adding 'days_since_prior_order_flag': 1 where 'days_since_prior_order' is missing, 0 otherwise
df_ords = df_ords.assign(
    days_since_prior_order_flag = df_ords['days_since_prior_order'].isna().astype(int)
)

In [46]:
#checking df_ords: previews the first rows, including the new 'days_since_prior_order_flag' column
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,days_since_prior_order_flag
0,2539329,1,1,2,8,,1
1,2398795,1,2,3,7,15.0,0
2,473747,1,3,3,12,21.0,0
3,2254736,1,4,4,7,29.0,0
4,431534,1,5,4,15,28.0,0


### 02 Duplicate Values

In [53]:
#collecting fully duplicated rows of df_ords (repeats of an earlier row) into df_ords_dups
df_ords_dups = df_ords.loc[df_ords.duplicated()]

#displaying the duplicates; an empty frame means there are none
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,days_since_prior_order_flag


No duplicates found: `df_ords_dups` is empty, so `df_ords` contains no fully duplicated rows.

## 06 Exporting Dataframe

In [60]:
#saving df_ords as final_orders_wrangled.pkl in the '02 Data' > 'Prepared Data' folder
df_ords.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'final_orders_wrangled.pkl')
)