# Exercise 4.5: Data Consistency Checks

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## Loading the Data

In [2]:
# Define the path to the data files.
# The location can be overridden through the INSTACART_DATA_PATH environment variable so the
# notebook can run on another machine; the original local folder remains the default.
path = os.environ.get(
    'INSTACART_DATA_PATH',
    '/Users/aaronkibler/CF Project 4 - Instacart Basket Analysis/02 Data',
)

In [3]:
# Load the raw “products.csv” file from the “Original Data” folder into df_prods
df_prods = pd.read_csv(
    filepath_or_buffer=os.path.join(path, 'Original Data', 'products.csv'),
)

In [4]:
# Load the previously wrangled “orders_wrangled.csv” file from the “Prepared Data” folder into df_ords
df_ords = pd.read_csv(
    filepath_or_buffer=os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'),
)

In [5]:
# Checking "products.csv" data is correctly loaded
print(df_prods.head())
# info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit a stray "None")
df_prods.info()
print(df_prods.shape)

   product_id                                       product_name  aisle_id  \
0           1                         Chocolate Sandwich Cookies        61   
1           2                                   All-Seasons Salt       104   
2           3               Robust Golden Unsweetened Oolong Tea        94   
3           4  Smart Ones Classic Favorites Mini Rigatoni Wit...        38   
4           5                          Green Chile Anytime Sauce         5   

   department_id  prices  
0             19     5.8  
1             13     9.3  
2              7     4.5  
3              1    10.5  
4             13     4.3  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49693 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49693 non-null  int64  
 1   product_name   49677 non-null  object 
 2   aisle_id       49693 non-null  int64  
 3   department_id  49693 non-null  int64

In [6]:
# Checking "orders_wrangled.csv" data is correctly loaded
print(df_ords.head())
# info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit a stray "None")
df_ords.info()
print(df_ords.shape)

   Unnamed: 0  order_id  user_id  order_number  orders_day_of_week  \
0           0   2539329        1             1                   2   
1           1   2398795        1             2                   3   
2           2    473747        1             3                   3   
3           3   2254736        1             4                   4   
4           4    431534        1             5                   4   

   order_hour_of_day  days_since_prior_order  
0                  8                     NaN  
1                  7                    15.0  
2                 12                    21.0  
3                  7                    29.0  
4                 15                    28.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   order_id                int64  
 2   user_id                 int64  


## Mixed-Type Data

In [7]:
# Start from an empty dataframe that will hold the mixed-type demo column
df_test = pd.DataFrame(data=None)

In [8]:
# Create a mixed-type column: the values are two strings, an integer and a boolean
df_test['mix'] = ['a', 'b', 1, True]

In [9]:
# Display the first rows of the new df_test dataframe to confirm the "mix" column was added
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [10]:
# Print the name of every df_test column in which some value's Python type
# differs from the type of the value in the first row
for col in df_test.columns.tolist():
    first_row_type = df_test[[col]].iloc[0].apply(type)
    type_mismatch = (df_test[[col]].map(type) != first_row_type).any(axis = 1)
    if type_mismatch.any():
        print(col)

mix


In [11]:
# Convert every value of the "mix" column to a string so the column holds one type only
df_test['mix'] = df_test['mix'].astype(str)

In [12]:
# Check the results of the change (the output below reports the column dtype as 'O', i.e. object)
df_test['mix'].dtype

dtype('O')

## Missing Values

In [13]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [14]:
# Build "df_nan": the rows of df_prods whose "product_name" is missing
# (the boolean mask from isnull() is used directly as the row selector)
df_nan = df_prods.loc[df_prods['product_name'].isnull()]

In [15]:
# Display the rows of df_prods that have no "product_name"
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### The missing values are product names (strings), so there is no sensible value to impute. We will instead create a new dataframe that excludes the rows with a missing product name.

### First check the "df_prods.shape" in order to later compare the number of rows in the original dataframe with the number in the new subset once the missing rows have been removed

In [16]:
# Check the number of rows and columns in "df_prods" before the missing rows are removed
df_prods.shape

(49693, 5)

In [17]:
# Build df_prods_clean: only the rows of df_prods that do have a "product_name"
df_prods_clean = df_prods.loc[df_prods['product_name'].notnull()]

In [18]:
# Check the rows in the new subset (it should have 16 fewer rows than the original df_prods)
df_prods_clean.shape

(49677, 5)

## Duplicates

In [19]:
# Collect the fully duplicated rows of df_prods_clean into their own subset;
# duplicated() marks every repeat of a row after its first occurrence
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [20]:
# Display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [21]:
# Check the number of rows in "df_prods_clean" before removing the duplicates,
# to compare against the row count after deduplication
df_prods_clean.shape

(49677, 5)

In [22]:
# Build a deduplicated copy of df_prods_clean, keeping the first occurrence of each repeated row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [23]:
# Check the number of rows in the new dataframe (it should have 5 fewer rows than df_prods_clean)
df_prods_clean_no_dups.shape

(49672, 5)

## Exporting the freshly cleaned Products dataframe

In [24]:
# Perform a final check of the dataframe before exporting
print(df_prods_clean_no_dups.head())
# info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit a stray "None")
df_prods_clean_no_dups.info()
print(df_prods_clean_no_dups.shape)

   product_id                                       product_name  aisle_id  \
0           1                         Chocolate Sandwich Cookies        61   
1           2                                   All-Seasons Salt       104   
2           3               Robust Golden Unsweetened Oolong Tea        94   
3           4  Smart Ones Classic Favorites Mini Rigatoni Wit...        38   
4           5                          Green Chile Anytime Sauce         5   

   department_id  prices  
0             19     5.8  
1             13     9.3  
2              7     4.5  
3              1    10.5  
4             13     4.3  
<class 'pandas.core.frame.DataFrame'>
Index: 49672 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  int64  
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  int64  
 3   department_id  49672 non-null  int64  
 4

In [25]:
# Export the cleaned products data as "products_checked.csv".
# index=False keeps the dataframe index out of the file; otherwise it is written as an
# unnamed first column and comes back as "Unnamed: 0" when the file is re-imported.
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Prepared Data', 'products_checked.csv'), index=False)

# 4.5 TASK: Data Consistency Checks on the "df_ords" dataframe

In [26]:
# Start by running the "df.describe()" function to get summary statistics
# (count, mean, std, min, quartiles, max) for each numeric column of df_ords
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### Days Since Prior Order: The count is less than other columns, which could mean missing values.
#### Order Number: The maximum order number is 100, suggesting that only up to 100 orders per customer are kept in the data.
#### Order Day of the Week: Ranges from 0 to 6, correctly representing a full week.
#### Order Hour of Day: Ranges from 0 to 23, representing each hour of the day.

## Check for mixed-type data in your df_ords dataframe

In [27]:
# Report, for every column of "df_ords", whether any value's Python type
# differs from the type of the value in the first row
for col in df_ords.columns.tolist():
    first_row_type = df_ords[[col]].iloc[0].apply(type)
    type_mismatch = (df_ords[[col]].map(type) != first_row_type).any(axis = 1)
    if type_mismatch.any():
        print (f"Mixed-type data found in column: {col}")
    else:
        print(f"No mixed-type data in column: {col}")

No mixed-type data in column: Unnamed: 0
No mixed-type data in column: order_id
No mixed-type data in column: user_id
No mixed-type data in column: order_number
No mixed-type data in column: orders_day_of_week
No mixed-type data in column: order_hour_of_day
No mixed-type data in column: days_since_prior_order


In [28]:
# Alternative check: a column is mixed-type when its values have more than one distinct Python type
for col in df_ords.columns:
    n_types = df_ords[col].map(type).nunique()
    if not n_types > 1:
        print(f"No mixed-type data in column: {col}")
    else:
        print(f"Mixed-type data found in column: {col}")

No mixed-type data in column: Unnamed: 0
No mixed-type data in column: order_id
No mixed-type data in column: user_id
No mixed-type data in column: order_number
No mixed-type data in column: orders_day_of_week
No mixed-type data in column: order_hour_of_day
No mixed-type data in column: days_since_prior_order


### "df_ords" appears to have no columns with mixed-type data to correct

## Run a check for missing values in your df_ords dataframe

In [29]:
# Count the missing values in each column of df_ords
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

#### The "days_since_prior_order" column has missing values as was suspected after running the "df_ords.describe()" function above.
#### Since "days_since_prior_order" has a MAX value of 30, the missing values could represent customers who haven't placed an order in the last 30 days. Another explanation is that the missing values belong to orders with no prior order to measure from (i.e. a customer's first order).

In [30]:
# Isolate the rows with a missing "days_since_prior_order" so they can be examined more closely
# (the boolean mask from isnull() is used directly as the row selector)
df_ords_nan = df_ords.loc[df_ords['days_since_prior_order'].isnull()]

In [31]:
# Display the orders whose "days_since_prior_order" is missing
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


#### It appears that all the "days_since_prior_order" NaNs occur when the "order_number" is 1 (a customer places a first order and since the system has no prior order to reference it fills this column with "NaN").

## Address the missing values using an appropriate method

#### To address the missing values in "days_since_prior_order" (assuming all NaNs correspond to an "order_number" of 1), I will create a new column to flag the first time orders, then count the number of flags to see if it equals the number of "NaN"s in the column.

In [32]:
# Creating a new column to flag first orders with either 1 (it is a first time order) or 0 (it is not a first time order).
# The flag is derived from "order_number" rather than from the NaNs themselves, so that counting
# the flags is an independent check against the number of missing "days_since_prior_order" values.
df_ords['is_first_order'] = (df_ords['order_number'] == 1).astype(int)

In [33]:
# Count the number of first time orders (sum of the 1/0 flags), to compare with the
# 206209 missing "days_since_prior_order" values reported by isnull().sum() above
print(df_ords['is_first_order'].sum())

206209


#### The number of flagged rows (206,209) equals the number of "NaN" values in the "days_since_prior_order" column, which is consistent with the NaNs marking each customer's first order.

In [34]:
# Check the top rows to confirm the new "is_first_order" flag column is present
print(df_ords.head())

   Unnamed: 0  order_id  user_id  order_number  orders_day_of_week  \
0           0   2539329        1             1                   2   
1           1   2398795        1             2                   3   
2           2    473747        1             3                   3   
3           3   2254736        1             4                   4   
4           4    431534        1             5                   4   

   order_hour_of_day  days_since_prior_order  is_first_order  
0                  8                     NaN               1  
1                  7                    15.0               0  
2                 12                    21.0               0  
3                  7                    29.0               0  
4                 15                    28.0               0  


## Run a check for duplicate values in your df_ords data

In [35]:
# Collect any fully duplicated rows of df_ords into their own subset;
# duplicated() marks every repeat of a row after its first occurrence
df_ords_dups = df_ords.loc[df_ords.duplicated(keep='first')]

In [36]:
# Display the duplicated order rows (an empty result means no duplicates were found)
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,is_first_order


### No duplicate values were found in the dataframe

## Export your final, cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folder

#### Export of the final, cleaned "df_prods" data was already performed above. New csv file created "products_checked.csv"

In [37]:
# Perform a final check of the dataframe before exporting
print(df_ords.head())
# info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit a stray "None")
df_ords.info()
print(df_ords.shape)

   Unnamed: 0  order_id  user_id  order_number  orders_day_of_week  \
0           0   2539329        1             1                   2   
1           1   2398795        1             2                   3   
2           2    473747        1             3                   3   
3           3   2254736        1             4                   4   
4           4    431534        1             5                   4   

   order_hour_of_day  days_since_prior_order  is_first_order  
0                  8                     NaN               1  
1                  7                    15.0               0  
2                 12                    21.0               0  
3                  7                    29.0               0  
4                 15                    28.0               0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0

In [38]:
# Export the final cleaned "df_ords" data as "orders_checked.csv".
# index=False keeps the dataframe index out of the file; writing it is what produced the
# "Unnamed: 0" column seen when "orders_wrangled.csv" was loaded. Note that this existing
# "Unnamed: 0" column is still part of df_ords and is exported as a regular column.
df_ords.to_csv(os.path.join(path, 'Prepared Data', 'orders_checked.csv'), index=False)

# End of Exercise 4.5