### 4.5: Data Consistency Checks

In [1]:
# Import libraries
import pandas as pd
import numpy as nm
import os

In [2]:
# Configure how pandas renders dataframes in cell output
display_settings = {
    'display.width': 1000,
    'display.max_columns': 100,  # cap on the number of columns shown
    'display.max_rows': 50,      # cap on the number of rows shown
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [3]:
# Project root folder used to build every data file path in this notebook.
# The location can be overridden through the INSTACART_PROJECT_DIR environment
# variable so the notebook can run on another machine; when the variable is
# not set, the original local folder is used unchanged.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/anjanpakhrin/Documents/Instacart Basket Analysis')

In [4]:
# Load the wrangled orders and the original products files into dataframes
# (index_col=False tells pandas not to use the first column as the index)
data_dir = os.path.join(path, '02 Data')
df_ords = pd.read_csv(os.path.join(data_dir, 'Prepared Data', 'orders_wrangled.csv'), index_col=False)
df_prods = pd.read_csv(os.path.join(data_dir, 'Original Data', 'products.csv'), index_col=False)

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for df_ords,
# which was loaded from orders_wrangled.csv
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### Mixed Data Types: Create New dataframe

In [6]:
# Create an empty dataframe that will hold the mixed-type test column
df_test = pd.DataFrame()

In [7]:
# Create a column with mixed data types
# [column name: mix; 'a' and 'b': strings, 1: integer, True: bool]
df_test['mix'] = ['a', 'b', 1, True]

In [8]:
# Preview the test dataframe to confirm the mixed-type column was added
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


#### Mixed Data Types: Check for Mixed Data Types

In [9]:
# Check for mixed data types: report every column in which at least one value
# has a different Python type than the value in the column's first row
for column_name in df_test.columns.tolist():
    column_frame = df_test[[column_name]]
    first_row_type = column_frame.iloc[0].apply(type)
    differs_from_first = (column_frame.map(type) != first_row_type).any(axis=1)
    if differs_from_first.any():
        print('Data type:', column_name)

Data type: mix


In [10]:
# Alternative check: a column is mixed when its non-missing values
# span more than one Python type
for column_name in df_test.columns:
    # Distinct Python types among the column's non-missing values
    distinct_types = set(map(type, df_test[column_name].dropna()))

    if len(distinct_types) > 1:
        print('Data type:', column_name)

Data type: mix


#### Mixed Data Types: Fixing Mixed Data Types

In [11]:
# Fixing data types: cast every value in the "mix" column to a string
# so the column holds a single data type
df_test['mix'] = df_test['mix'].astype('str')

In [12]:
# Checking the data type of the "mix" column after the cast
# (the recorded output reports "object" for the string column)
print('Data type after fix:',df_test['mix'].dtype)

Data type after fix: object


#### Missing Values: Finding Missing Values

In [13]:
# Finding missing values in "products.csv": count the null entries per column of df_prods
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

#### Missing Values: Creating Subset of Missing Values

In [14]:
# Subset of products whose product_name is missing (NaN)
missing_name_mask = df_prods['product_name'].isnull()
df_nan = df_prods[missing_name_mask]

In [15]:
# Display the products that have no product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


#### Addressing Missing Values
1. Create a new variable that acts like a flag based on the missing value
2. Impute the value with the mean or median of the column (if the variable is numeric)
3. Remove or filter out the missing data

In [16]:
# Return the (rows, columns) dimensions of df_prods before filtering out missing values
df_prods.shape

(49693, 5)

In [17]:
# Keep only the products that have a product_name (drops rows where it is NaN)
has_product_name = df_prods['product_name'].notnull()
df_prods_clean = df_prods[has_product_name]

In [18]:
# Dimensions after filtering; the row count should drop by the number of missing product names
df_prods_clean.shape

(49677, 5)

#### Duplicates: Finding duplicates

In [19]:
# Subset of rows in df_prods_clean that fully duplicate an earlier row
full_duplicate_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean[full_duplicate_mask]

In [20]:
# Display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


#### Addressing Duplicates
Remove the duplicates

In [21]:
# Checking the dimensions of df_prods_clean before removing duplicates,
# to compare against the dimensions after removal
print('Rows, Columns before removing duplicates:',df_prods_clean.shape)

Rows, Columns before removing duplicates: (49677, 5)


In [22]:
# Remove full-row duplicates; the result is stored in a new dataframe,
# so df_prods_clean itself is left unchanged
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [23]:
# Dimensions after removing duplicates
print('Rows, Columns after removing duplicates:',df_prods_clean_no_dups.shape)

Rows, Columns after removing duplicates: (49672, 5)


In [24]:
# Write the cleaned, de-duplicated products dataframe to the "Prepared Data" folder
# as "products_checked.csv" (the dataframe index is not written to the file)
products_checked_path = os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
df_prods_clean_no_dups.to_csv(products_checked_path, index=False)

### Task

#### Step 2

In [25]:
# Descriptive statistics for df_ords, used for the column-by-column observations in this step
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


1. **order_day_of_week:** Values range from min = 0 to max = 6, giving exactly 7 values for the 7 days of the week (assumed here to be 0 = Monday and 6 = Sunday; this mapping should be confirmed against the data dictionary). The column looks clean; for example, the 50th percentile is 3, the midpoint between min and max.
2. **order_hour_of_day:** Values seem consistent representing a 24-hour clock (0 = 12:00 AM, 23 = 11:00 PM)
3. **days_since_prior_order:** The **count** for this column (3.21 million) is lower than for all other columns (3.42 million). This indicates that there are missing values in this column. The **max** value of 30 days seems high for a gap between orders, so it might be worth checking whether it is a data entry error.
4. **order_number:** The suspiciously round max value (100) in this column might be a data cap.

#### Step 3: Check mixed-type data in df_ords

In [26]:
# Show the first few rows of df_ords for a visual check
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


Since these few rows cannot reveal whether any mixed-type columns exist in the dataframe, a programmatic check is required.

In [27]:
# Tracks whether any mixed-type column was found in df_ords
mixed_type_found = False
# Report every column in which at least one value has a different Python type
# than the value in the column's first row
for column_name in df_ords.columns.tolist():
    column_frame = df_ords[[column_name]]
    first_row_type = column_frame.iloc[0].apply(type)
    differs_from_first = (column_frame.map(type) != first_row_type).any(axis=1)
    if differs_from_first.any():
        print('Mixed data types found in:', column_name)
        mixed_type_found = True
# Report the all-clear only when no column was flagged
if not mixed_type_found:
    print('No mixed data types found in df_ords.')

No mixed data types found in df_ords.


#### Step 4: No mixed-type data found in dataframe **df_ords**

#### Step 5: Checking for missing values

Finding missing values in **df_ords**:
`df_ords.isnull().sum()`

As noted in step 2, there are 206209 missing values in the column **days_since_prior_order** (3421083 − 3214874).
A missing value in this column marks a customer's **first-ever order**, i.e., there is **no prior order**
placed by that customer. Therefore, missing values in this column are logical and expected, and they
cannot be calculated.
To give a visual hint, creating a subset of the missing values might be helpful.

In [28]:
# Subset of orders whose days_since_prior_order value is missing
missing_prior_mask = df_ords['days_since_prior_order'].isnull()
df_ords_nan = df_ords[missing_prior_mask]

In [29]:
# Return the first few rows of the missing-values subset
df_ords_nan.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,


In [30]:
# Sample the order history of two users (20 and 59) to support the statement above
sample_users = [20, 59]
display_columns = ['order_id', 'user_id', 'order_number', 'order_day_of_week',
                   'order_hour_of_day', 'days_since_prior_order']
df_sample = df_ords[df_ords['user_id'].isin(sample_users)].sort_values(by=['user_id', 'order_number'])

# Print the sampled orders as plain text, without the index
print(df_sample[display_columns].to_string(index=False))

 order_id  user_id  order_number  order_day_of_week  order_hour_of_day  days_since_prior_order
   947722       20             1                  1                 10                     NaN
   157550       20             2                  3                 16                     2.0
  3408319       20             3                  2                 11                     6.0
  2741696       20             4                  2                 11                     7.0
  1980631       20             5                  1                 11                    30.0
   584586       59             1                  5                 15                     NaN
  1385031       59             2                  1                 10                    24.0
  2149689       59             3                  0                 10                     6.0
  3050891       59             4                  2                 12                     2.0
  2395967       59             5                  

#### Step 6: Addressing missing values
As mentioned in step 5, these missing values are logical and expected, i.e., not an error. Deleting these rows would remove crucial data on customer acquisition. Filling with the mean would be illogical, as a first order has no prior order. Using 0 would wrongly imply that the customer placed another order on the exact same day, i.e., that the customer placed more than one order on that day. Therefore, creating a flag column is the most appropriate method: the new column **first_order** simply marks which orders have no prior order (where **days_since_prior_order** is NaN), while the NaN values themselves are left unchanged.

In [31]:
# Build a copy of df_ords with a boolean first_order column that is True
# wherever days_since_prior_order is missing; df_ords itself is not modified
df_ords_flag = df_ords.assign(first_order=df_ords['days_since_prior_order'].isnull())

In [32]:
# Show the first 5 rows to confirm the first_order column was added
df_ords_flag.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


#### Step 6: Addressing missing values - **No Action**
NaN is not a missing value in this case; rather, it represents the very first order placed by the customer, so no further action is required.

In [33]:
# Show the first 5 rows of df_ords_flag again as a check
df_ords_flag.head(5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


#### Step 7: Checking Duplicates

In [34]:
# Subset of rows in df_ords_flag that fully duplicate an earlier row
duplicate_row_mask = df_ords_flag.duplicated()
df_ords_dups = df_ords_flag[duplicate_row_mask]

In [35]:
# Display the duplicated order rows (an empty result means no full duplicates exist)
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order


#### Step 8: No duplicates found --> Therefore, no further action required.
Since no rows were deleted, the number of rows remains the same. However, the number of columns is one more than in the original orders data (orders_wrangled.csv), because of the added **first_order** flag column.
If duplicates were found, duplicates could be removed with **df_ords_flag_no_dups = df_ords_flag.drop_duplicates()**.

#### Step 9: Exporting final, cleaned dataframe

In [36]:
# Export the cleaned/checked products dataframe to "Prepared Data" as "products_checked.csv"
# NOTE(review): this repeats the identical products export made earlier in this notebook
# and rewrites the same file; it can be dropped if the earlier export is kept.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index = False)

In [37]:
# Export the checked orders dataframe (df_ords_flag, which includes the first_order
# flag column) to "Prepared Data" as "orders_checked.csv"
df_ords_flag.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'), index = False)