# Contents

- library import
- data import, products and orders
- descriptive statistics
- review mixed type data
- check for missing values 
- check for duplicates
- exercise task 

# Library and DF Import

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import os

In [3]:
# create a string for the project folder path
# the location can be overridden with the INSTACART_PROJECT_PATH environment variable
# so the notebook runs on other machines; the default is the original local folder

path = os.environ.get('INSTACART_PROJECT_PATH', r'/Users/Ashley/Desktop/Adam/Achievement 4/Instacart Analysis')

In [4]:
# load the raw products table from the Original Data folder

prods_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(prods_csv, index_col = False)

In [6]:
# load the wrangled orders table from the Prepared Data folder
# NOTE(review): the isnull() output further down lists an 'Unnamed: 0' column in this
# frame, presumably a row index saved by an earlier export - confirm and consider dropping it

ords_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(ords_csv, index_col = False)

# What are data consistency checks?

In [8]:
# summary statistics (count, mean, std, min, quartiles, max) for the orders data,
# used as a first sanity check of the value ranges in each numeric column

df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


# Mixed Type Data

In [9]:
# create an empty dataframe to use for the mixed type demonstration

df_test = pd.DataFrame()

In [10]:
# create a mixed type column: two strings, an integer and a boolean

df_test['mix'] =['a', 'b', 1, True]

In [11]:
# preview the test dataframe

df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [12]:
# check for mixed type columns: a column is printed when its values
# hold more than one Python type

for col in df_test.columns.tolist():
    # Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1;
    # counting the distinct types also avoids an IndexError on an empty dataframe
    n_types = df_test[col].map(type).nunique()
    if n_types > 1:
        print (col)

mix


In [13]:
# convert the mixed column to a single data type (string);
# to convert a column to numeric instead, swap 'str' for 'int64'

df_test['mix'] = df_test['mix'].astype('str')

# Missing Values

In [14]:
# count the missing values in each column of the products data

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [17]:
# create a new df containing only the rows where product_name is missing
# (isnull() already returns a boolean mask, so no comparison with True is needed)

df_nan = df_prods[df_prods['product_name'].isnull()]

In [18]:
# view the rows with a missing product name

df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


code to replace missing data with the mean:

df['column with missings'].fillna(mean value, inplace=True)


code to replace missing data with median:

df['column with missings'].fillna(median value, inplace=True)

In [20]:
# establish how many rows and columns are currently in the DF, before removing nulls

df_prods.shape

(49693, 5)

In [21]:
# create a new DF that keeps only the rows with a product name
# (notnull() is the direct form of isnull() == False)

df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [22]:
# check the shape of the cleaned DF to verify the number of rows that were dropped

df_prods_clean.shape

(49677, 5)

In [23]:
# number of rows removed with the missing product names, computed from the
# two frames so the result stays correct if the source data changes

len(df_prods) - len(df_prods_clean)

16

Another way you can drop all missing values is via the following command:

df_prods.dropna(inplace = True)
If you wanted to use this command to drop only the NaNs from a particular column, the code would look like this:

df_prods.dropna(subset = ['product_name'], inplace = True)

# Duplicates

In [24]:
# collect the rows that are full duplicates of an earlier row

df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [25]:
# view the duplicated rows

df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [27]:
# count the duplicated rows

df_dups.shape

(5, 5)

In [28]:
# row count before the duplicates are removed

df_prods_clean.shape

(49677, 5)

In [31]:
# create a new DF with the full-row duplicates removed;
# df_prods_clean itself is left unchanged

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [30]:
# confirm the row count after the duplicates are removed

df_prods_clean_no_dups.shape

(49672, 5)

# Export the cleaned product data frame

In [32]:
# export the cleaned products data; index = False keeps the row index out of the file
# so it does not come back as an extra 'Unnamed: 0' column when the file is re-imported

df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index = False)

# ----- Task 4.5 -----

## Questions 1 and 2

In [33]:
# descriptive statistics for the numeric columns of the products data

df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


All the columns have a minimum of one. For the ID columns this would be expected, for the price column this is reasonable. No negative min values. 
Max values indicate 134 aisles and 21 different departments. This isn't entirely unreasonable but the number of aisles could stand to be double checked. 
The product ID column has a max that is 5 lower than its count. Since we would expect a unique product ID for each row, this would be a clue that there may be 5 missing or duplicate values (as was addressed during the exercise above). 
The max value of 99,999 for price seems like it is either an outlier or something that was entered in place of missing data, either way it is likely not accurate based on the other data points and would need further examining. 

## Questions 3 and 4

In [39]:
# check for mixed type data: a column is printed when its values
# hold more than one Python type

for col in df_ords.columns.tolist():
    # Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1;
    # counting the distinct types also avoids an IndexError on an empty dataframe
    n_types = df_ords[col].map(type).nunique()
    if n_types > 1:
        print(col)

No mixed type data was found in the orders data frame

## Question 5

In [41]:
# count the missing values in each column of the orders data

df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

This shows that the only column with null values is the "days since prior order" column which has 206,209 null values. Given the scale of the missing information it would be best to go back to the client and get further information  

In [42]:
# create a new DF to view just the rows where days_since_prior_order is missing
# (isnull() already returns a boolean mask, so no comparison with True is needed)

df_ords_null = df_ords[df_ords['days_since_prior_order'].isnull()]

In [43]:
# view the orders with a missing days_since_prior_order value

df_ords_null

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,prior,1,4,12,
3420934,3420934,3189322,206206,prior,1,3,18,
3421002,3421002,2166133,206207,prior,1,6,19,
3421019,3421019,2227043,206208,prior,1,1,15,


In [45]:
# check the descriptive statistics of the null-value rows to look for a pattern
# in the other columns (e.g. the range of order_number)

df_ords_null.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,206209.0,206209.0,206209.0,206209.0,206209.0,206209.0,0.0
mean,1712637.0,1708462.0,103105.0,1.0,2.754118,13.626597,
std,987481.6,988129.9,59527.555167,0.0,2.076205,4.223769,
min,0.0,20.0,1.0,1.0,0.0,0.0,
25%,857973.0,850730.0,51553.0,1.0,1.0,11.0,
50%,1717068.0,1706246.0,103105.0,1.0,3.0,14.0,
75%,2570518.0,2564292.0,154657.0,1.0,5.0,17.0,
max,3421069.0,3421081.0,206209.0,1.0,6.0,23.0,


The initial check shows that the only column with null values is days since prior order. I created a new data frame with just these null value rows and noticed that the order number for all of them is "1". I then ran the describe function and confirmed the only order number value for all 206K+ rows is "1". This would indicate that this was the first order placed with Instacart for each customer. So at the time of the order there is no valid entry for days since prior order, since they had never placed a prior order at that point. 

## Question 6

To address the missing data I would propose creating a new column with a boolean for whether or not this is a first time order. If it is a first time order (i.e. if days since prior order is null) then the new column will show true; if days since prior order is not null the new column will show false. When performing future analysis on the days since prior order column we would be able to filter results to only return rows where "first order" is false. Another option would be to create a new data frame without the null value rows, but this would ultimately use up a lot more memory to maintain both of these data frames and switch between them as needed.

In [51]:
# add a boolean column to df_ords called "first_order":
# True where days_since_prior_order is null, otherwise False
# (isnull() already returns booleans, so no comparison with True is needed)

df_ords['first_order'] = df_ords['days_since_prior_order'].isnull()

In [52]:
# view the data frame to ensure the first_order column was added

df_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,0,2539329,1,prior,1,2,8,,True
1,1,2398795,1,prior,2,3,7,15.0,False
2,2,473747,1,prior,3,3,12,21.0,False
3,3,2254736,1,prior,4,4,7,29.0,False
4,4,431534,1,prior,5,4,15,28.0,False
...,...,...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,prior,10,5,18,29.0,False
3421079,3421079,1854736,206209,prior,11,4,10,30.0,False
3421080,3421080,626363,206209,prior,12,1,12,18.0,False
3421081,3421081,2977660,206209,prior,13,1,12,7.0,False


## Questions 7 and 8

In [53]:
# collect any orders rows that are full duplicates of an earlier row

df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [55]:
# view the duplicate rows DF (it is empty when no duplicates exist)

df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order


There are no duplicate values in the df_ords data frame. 

## Question 9

In [56]:
# export the cleaned orders data; index = False keeps the row index out of the file
# so it does not come back as another unnamed column when the file is re-imported
# NOTE(review): df_ords still carries the 'Unnamed: 0' column that came in with
# orders_wrangled.csv - consider dropping it before export

df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'), index = False)