# CONTENTS

- IMPORTING DATA
- EXPLORING DATA
- DATA WRANGLING
- DATA INTEGRITY CHECKS
- EXPORTING DATA

# IMPORTING DATA

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# initialize path variable
# The project root can be overridden by setting the INSTACART_PROJECT_DIR environment
# variable, so the notebook can run on another machine without editing this cell.
# When the variable is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/smac_/Documents/courses/CF Data Analytics Program/Data Immersion/Achievement 4/05-2023 Instacart Basket Analysis',
)

In [3]:
# load the products and orders tables from the project's '02 Data' folder
data_dir = os.path.join(path, '02 Data')
df_prods = pd.read_csv(os.path.join(data_dir, 'Original Data', 'products.csv'))
df_ords = pd.read_csv(os.path.join(data_dir, 'Prepared Data', 'a_orders_wrangled.csv'))

# EXPLORING DATA

## `'df_prods'` DATAFRAME

In [4]:
# first 5 rows of the products table (head() returns 5 rows by default)
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [5]:
# last 5 rows of the products table (tail() returns 5 rows by default)
df_prods.tail()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49692,49688,Fresh Foaming Cleanser,73,11,13.5


In [6]:
# column names and data types of the products table as loaded from CSV
df_prods.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [7]:
# products table size as (rows, columns)
df_prods.shape

(49693, 5)

## `'df_ords'` DATAFRAME

In [8]:
# first 5 rows of the orders table (head() returns 5 rows by default)
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [9]:
# last 5 rows of the orders table (tail() returns 5 rows by default)
df_ords.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0
3421082,272231,206209,14,6,14,30.0


In [10]:
# column names and data types of the orders table as loaded from CSV
df_ords.dtypes

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [11]:
# orders table size as (rows, columns)
df_ords.shape

(3421083, 6)

# DATA WRANGLING

## `'df_prods'` DATAFRAME

In [12]:
# identifier columns are labels, not quantities: store each of them as strings
for id_col in ('product_id', 'aisle_id', 'department_id'):
    df_prods[id_col] = df_prods[id_col].astype(str)

## `'df_ords'` DATAFRAME

In [13]:
# identifier columns are labels, not quantities: store each of them as strings
for id_col in ('order_id', 'user_id'):
    df_ords[id_col] = df_ords[id_col].astype(str)


# DATA INTEGRITY CHECKS

## ACCURACY

### `'df_prods'` DATAFRAME

In [14]:
# summary stats of the products table; describe() reports numeric columns only by default,
# so the identifier columns cast to string earlier are not included
df_prods.describe()

Unnamed: 0,prices
count,49693.0
mean,9.994136
std,453.519686
min,1.0
25%,4.1
50%,7.1
75%,11.2
max,99999.0


The maximum price (99,999.0) lies far outside the range suggested by the quartiles (75% of prices are at or below 11.2) and inflates the standard deviation to roughly 453.5, indicating a potential anomaly. Further investigation is necessary to understand the underlying cause.

### `'df_ords'` DATAFRAME

In [15]:
# summary stats of the orders table; describe() reports numeric columns only by default,
# so the identifier columns cast to string earlier are not included
df_ords.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


The 'orders_day_of_week' column stores the day of the week on which an order was placed as a number from 0 to 6. Although the maximum is 6 rather than 7, counting from 0 still yields a **total of 7 unique values**, matching the actual number of days in a week. The Instacart project brief provides further information on this matter.

In the 'order_hour_of_day' column, the hours are presented in a **24-hour format**, as opposed to a 12-hour format. This practice ensures that the column exclusively contains numeric values, enabling calculations to be performed seamlessly.

The *'days_since_prior_order'* column has a maximum value of '30', suggesting a **30-day limit** imposed by the data collection system. Its minimum value is '0', implying that **some customers placed an order on the same day as their previous order**.


## CONSISTENCY

### `'df_prods'` DATAFRAME

#### MIXED-TYPE DATA

In [16]:
# identify mixed-type columns
# A column is reported when its values have more than one Python type.
# Series.map(type) is used per column because DataFrame.applymap is deprecated
# since pandas 2.1; counting distinct types also works on an empty dataframe.
for col in df_prods.columns:
    if df_prods[col].map(type).nunique() > 1:
        print(col)

product_name


The `product_name` column was identified as mixed-type. It will be addressed after dealing with missing values.

#### MISSING VALUES

In [17]:
# identify missing values: number of null entries in each column of the products table
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [18]:
# initialize subset dataframe containing rows with missing values in 'product_name'
# (isnull() already returns a boolean mask, so it is used directly as the row filter)
df_nan = df_prods[df_prods['product_name'].isnull()]

In [19]:
# 'df_nan' dataframe size as (rows, columns); the row count is the number of missing product names
df_nan.shape

(16, 5)

In [20]:
# all rows in 'df_nan' dataframe (head(16) covers every row because 'df_nan' has 16 rows)
df_nan.head(16)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


The `product_name` column contains a total of 16 missing values. The absence of any discernible pattern suggests that these missing values are likely a result of errors during the data collection process.

In [21]:
# 'df_prods' dataframe size as (rows, columns), recorded before rows with missing names are excluded
df_prods.shape

(49693, 5)

In [22]:
# initialize subset dataframe excluding rows with missing values in 'product_name'
# (notnull() gives the keep-mask directly instead of comparing isnull() to False)
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [23]:
# 'df_prods_clean' dataframe size as (rows, columns), after excluding rows with a missing 'product_name'
df_prods_clean.shape

(49677, 5)

To address the missing values, a new dataframe was created by including all the observations that do not have missing values. This approach was implemented to prevent any overwrite of the original data.

In [24]:
# identify mixed-type columns in 'df_prods_clean' dataframe
# A column is reported when its values have more than one Python type.
# Series.map(type) is used per column because DataFrame.applymap is deprecated
# since pandas 2.1; counting distinct types also works on an empty dataframe.
for col in df_prods_clean.columns:
    if df_prods_clean[col].map(type).nunique() > 1:
        print(col)

No mixed-type data columns were identified after addressing missing values.

#### DUPLICATE VALUES

In [25]:
# identify full duplicates
# The check runs on 'df_prods_clean' (rows with missing product names already excluded),
# the same dataframe the following cells subset and de-duplicate.
print(df_prods_clean[df_prods_clean.duplicated()])

      product_id                                       product_name aisle_id  \
462          462                  Fiber 4g Gummy Dietary Supplement       70   
18459      18458                                         Ranger IPA       27   
26810      26808               Black House Coffee Roasty Stout Beer       27   
35309      35306  Gluten Free Organic Peanut Butter & Chocolate ...      121   
35495      35491                            Adore Forever Body Wash      127   

      department_id  prices  
462              11     4.8  
18459             5     9.2  
26810             5    13.4  
35309            14     6.8  
35495            11     9.9  


In [26]:
# collect the fully duplicated rows of 'df_prods_clean' in their own dataframe
dup_mask = df_prods_clean.duplicated()
df_prods_clean_dups = df_prods_clean[dup_mask]

In [27]:
# first 5 rows in 'df_prods_clean_dups' dataframe (the fully duplicated rows)
df_prods_clean_dups.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [28]:
# 'df_prods_clean' dataframe size as (rows, columns), recorded before duplicates are removed
df_prods_clean.shape

(49677, 5)

In [29]:
# build a de-duplicated copy of 'df_prods_clean': the first occurrence of each row is kept
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [30]:
# 'df_prods_clean_no_dups' dataframe size as (rows, columns), after removing full duplicates
df_prods_clean_no_dups.shape

(49672, 5)

To handle duplicate values, the duplicated rows were dropped from the clean dataframe and the result was stored in a separate final dataframe (`df_prods_clean_no_dups`). This approach was employed to prevent any accidental modification of the original data.

### `'df_ords'` DATAFRAME

#### MIXED-TYPE DATA

In [31]:
# check for mixed-type data columns in the 'df_ords' dataframe
# A column is reported when its values have more than one Python type.
# Series.map(type) is used per column because DataFrame.applymap is deprecated
# since pandas 2.1; counting distinct types also works on an empty dataframe.
for col in df_ords.columns:
    if df_ords[col].map(type).nunique() > 1:
        print(col)

No mixed-type data columns were found.

#### MISSING VALUES

In [32]:
# identify missing values: number of null entries in each column of the orders table
df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

Upon examining the consistency of the `days_since_prior_order` column, a **considerable number of missing values (206,209)** was discovered. Because a customer's first order (`order_number` = 1) has no prior order to measure from, these missing values most likely correspond to each customer's first order, for which no value can be assigned in this column.

If the missing values have meaningful implications, it is recommended to retain them in the dataframe as they could hold relevance for the analysis. By preserving these values, valuable insights or patterns that may contribute to the analysis can be effectively captured.

#### DUPLICATE VALUES

In [33]:
# list any rows of 'df_ords' that repeat an earlier row in every column
print(df_ords.loc[df_ords.duplicated()])


Empty DataFrame
Columns: [order_id, user_id, order_number, orders_day_of_week, order_hour_of_day, days_since_prior_order]
Index: []


No duplicated rows were found.

# EXPORTING DATA

In [None]:
# write the checked products and orders tables to the 'Prepared Data' folder, without the index column
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')
df_prods_clean_no_dups.to_csv(os.path.join(prepared_dir, 'b_products_checked.csv'), index = False)
df_ords.to_csv(os.path.join(prepared_dir, 'b_orders_checked.csv'), index = False)