# Consistency checks - Orders

### List of contents
    1. Importing file
    2. Renaming Columns
    3. Changing column type
    4. Missing values?
    5. Duplicates?
    6. Final changes
    7. Exporting new file

## Importing libraries

In [90]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## Creating path for folder

In [92]:
# Root folder of the project; every input and output file is located relative to it.
# The folder can be overridden with the INSTACART_PROJECT_DIR environment variable,
# so the notebook also runs on machines where the original location does not exist.
# When the variable is not set, the original location is used unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/ceciliamoura/Desktop/Career Foundry/Achievement4/Instacart Basket Analysis',
)

## 1. Importing file

In [94]:
# Importing original file - orders

In [95]:
# Load the original orders file with all of its columns for a first inspection.
# index_col=False stops pandas from using the first column as the row index.
# NOTE(review): the name `ord` shadows Python's built-in ord(); it is kept
# because the following cells refer to it.
orders_csv = os.path.join(path, 'Data', 'Original Data', 'orders.csv')
ord = pd.read_csv(orders_csv, index_col=False)

In [96]:
# Preview the first rows of the freshly imported dataframe.
ord.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [97]:
# Number of rows and columns in the imported dataframe.
ord.shape

(3421083, 7)

In [98]:
# Data type pandas inferred for each column.
ord.dtypes

order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [99]:
# Columns to keep when the orders file is read again below.
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]

In [100]:
# Show the list of selected column names.
vars_list

['order_id',
 'user_id',
 'order_number',
 'order_dow',
 'order_hour_of_day',
 'days_since_prior_order']

In [101]:
# Read the orders file again, loading only the columns named in vars_list,
# which leaves 'eval_set' out of the working dataframe.
ords = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'orders.csv'),
    usecols=vars_list,
)

In [102]:
# Preview the first rows of the reduced dataframe.
ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 2. Renaming columns

In [131]:
# Rename 'order_dow' to the more descriptive 'order_day_of_week'.
# rename() returns a new dataframe, which is bound back to the same name.
ords = ords.rename(columns={'order_dow': 'order_day_of_week'})

In [133]:
# Rename 'order_hour_of_day' to 'order_time'.
new_column_names = {'order_hour_of_day': 'order_time'}
ords = ords.rename(columns=new_column_names)

In [139]:
# Check that both columns now appear under their new names.
ords.head()


Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_time,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 3. Changing column type

In [146]:
# Summary of the column data types and memory usage before any conversion.
ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_day_of_week       int64  
 4   order_time              int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


In [148]:
# 'order_id' is an identifier, not a quantity: convert it from int64 to
# string so that pandas stores it as an object column.
ords['order_id'] = ords['order_id'].astype(str)

In [150]:
# 'user_id' is an identifier as well: convert it from int64 to string
# (object column).
user_id_as_text = ords['user_id'].astype(str)
ords['user_id'] = user_id_as_text

In [152]:
# Check that 'order_id' and 'user_id' are now reported as object columns.
ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 object 
 2   order_number            int64  
 3   order_day_of_week       int64  
 4   order_time              int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 156.6+ MB


## 4. Missing values?

In [155]:
# Count the missing values in each column.
ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_time                     0
days_since_prior_order    206209
dtype: int64

In [159]:
# Subset holding only the rows where 'days_since_prior_order' is missing.
# isnull() already returns a boolean mask, so no comparison with True is needed.
# .copy() makes the subset an independent dataframe: values assigned into it
# later change only the subset and do not raise SettingWithCopyWarning.
ords_missing = ords[ords['days_since_prior_order'].isnull()].copy()

In [161]:
# Display the subset of rows with a missing 'days_since_prior_order'
# (the notebook shows a truncated view of the first and last rows).
ords_missing

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_time,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [163]:
# For a user's first order there is no prior order, so 'days_since_prior_order'
# is expected to be missing (NaN) or 0 on those rows.
# To check this, cross-tabulate 'days_since_prior_order' (rows) against
# 'order_number' (columns) over the whole ords dataframe.
# NOTE(review): confirm that the installed pandas version lists NaN as its own
# row when dropna=False; otherwise the missing values are not visible in the table.
crosstab = pd.crosstab(
    index=ords['days_since_prior_order'],
    columns=ords['order_number'],
    dropna=False,
)

In [167]:
# Copy the crosstab to the system clipboard so it can be pasted into a
# spreadsheet for inspection.
crosstab.to_clipboard()

In [173]:
# Table checked in Excel
# The first order ('order_number' = 1) accounts for all 206209 NaN values of 'days_since_prior_order'
# Conclusion: NaN in this case is not truly a missing value - a user's first order has no prior order

### 4.1 Addressing NaN values

In [182]:
# I will replace the NaN values in 'days_since_prior_order' with '0', since it is a relevant value for descriptive statistics

In [190]:
# Try the replacement on the subset first: set every missing
# 'days_since_prior_order' to 0.
gap_is_missing = ords_missing['days_since_prior_order'].isnull()
ords_missing.loc[gap_is_missing, 'days_since_prior_order'] = 0

In [192]:
# Check that the subset now shows 0.0 instead of NaN.
ords_missing.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_time,days_since_prior_order
0,2539329,1,1,2,8,0.0
11,2168274,2,1,2,11,0.0
26,1374495,3,1,1,14,0.0
39,3343014,4,1,6,11,0.0
45,2717275,5,1,3,12,0.0


In [194]:
# Apply the same replacement to the full ords dataframe: every missing
# 'days_since_prior_order' becomes 0, all other values are left as they are.
ords['days_since_prior_order'] = ords['days_since_prior_order'].fillna(0)

In [196]:
# Check that no missing values remain in any column.

ords.isnull().sum()

order_id                  0
user_id                   0
order_number              0
order_day_of_week         0
order_time                0
days_since_prior_order    0
dtype: int64

## 5. Duplicates?

In [199]:
# Collect every row that is an exact repeat of an earlier row
# (duplicated() flags all occurrences after the first).
is_repeated_row = ords.duplicated()
dups = ords[is_repeated_row]

In [201]:
# Display the duplicated rows; an empty table means there are none.
dups

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_time,days_since_prior_order


In [203]:
# No duplicates found

## 6. Final changes

In [210]:
ords.dtypes

order_id                   object
user_id                    object
order_number                int64
order_day_of_week           int64
order_time                  int64
days_since_prior_order    float64
dtype: object

In [212]:
# 'days_since_prior_order' no longer contains NaN at this point, so the
# float64 column can be converted to a 64-bit integer column.
days_as_int = ords['days_since_prior_order'].astype(np.int64)
ords['days_since_prior_order'] = days_as_int

In [214]:
# Check that 'days_since_prior_order' is now stored as int64.

ords.dtypes

order_id                  object
user_id                   object
order_number               int64
order_day_of_week          int64
order_time                 int64
days_since_prior_order     int64
dtype: object

## 7. Exporting new file

In [217]:
# Write the cleaned orders table to the prepared-data folder.
# index=False keeps the default RangeIndex out of the file; otherwise it is
# saved as an extra unnamed first column that comes back as 'Unnamed: 0'
# when the CSV is read again.
ords.to_csv(
    os.path.join(path, 'Data', 'Final Prepared Data', 'orders_clean.csv'),
    index=False,
)