# CONTENTS

- IMPORTING DATA
- EXPLORING DATA
- COMBINING DATA
- EXPORTING DATA

# IMPORTING DATA

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# initialize path variable
# The project root can be overridden through the INSTACART_PROJECT_PATH
# environment variable so the notebook also runs on machines with a different
# folder layout; when the variable is not set, the original local path is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/smac_/Documents/Courses/CF Data Analytics Program/Data Immersion/Achievement 4/05-2023 Instacart Basket Analysis',
)

In [3]:
# load the two prepared inputs from '<path>/02 Data/Prepared Data'
prepared_data_dir = os.path.join(path, '02 Data', 'Prepared Data')

# product catalogue (CSV)
df_prods = pd.read_csv(os.path.join(prepared_data_dir, 'b_products_checked.csv'))

# combined orders / order-products table (pickle written by an earlier step)
df_ords_prods_combined = pd.read_pickle(
    os.path.join(prepared_data_dir, 'c_orders_products_combined.pkl')
)

# EXPLORING DATA

## `'df_prods'` DATAFRAME

In [4]:
# column names & data types of 'df_prods' as loaded from the CSV
# (used to spot identifier columns that were read as numeric)
df_prods.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [5]:
# preview the top 5 rows of 'df_prods'
df_prods.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [6]:
# preview the bottom 5 rows of 'df_prods'
df_prods.tail(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49667,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49668,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49669,49686,Artisan Baguette,112,3,7.8
49670,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49671,49688,Fresh Foaming Cleanser,73,11,13.5


In [7]:
# dataframe size as a (rows, columns) tuple
df_prods.shape

(49672, 5)

In [8]:
# identifier columns are labels, not quantities: store them as strings so they
# are excluded from numeric summaries
for id_column in ('product_id', 'aisle_id', 'department_id'):
    df_prods[id_column] = df_prods[id_column].astype(str)


In [9]:
# summary stats for the remaining numeric column(s) of 'df_prods'
# NOTE(review): the displayed output reports a max price of 99999.0 with a
# std of ~453 against a median of 7.1 — looks like a placeholder/outlier value;
# confirm and handle it before using 'prices' in analysis.
df_prods.describe()

Unnamed: 0,prices
count,49672.0
mean,9.993282
std,453.615536
min,1.0
25%,4.1
50%,7.1
75%,11.1
max,99999.0


## `'df_ords_prods_combined'` DATAFRAME

In [10]:
# column names & data types of 'df_ords_prods_combined' as loaded from the pickle
df_ords_prods_combined.dtypes

order_id                    object
user_id                     object
order_number                 int64
orders_day_of_week           int64
order_hour_of_day            int64
days_since_prior_order     float64
product_id                  object
add_to_cart_order            int64
reordered                   object
_merge_prods_ords         category
dtype: object

In [11]:
# preview the top 5 rows of 'df_ords_prods_combined'
df_ords_prods_combined.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge_prods_ords
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [12]:
# preview the bottom 5 rows of 'df_ords_prods_combined'
df_ords_prods_combined.tail(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge_prods_ords
32434484,2977660,206209,13,1,12,7.0,14197,5,1,both
32434485,2977660,206209,13,1,12,7.0,38730,6,0,both
32434486,2977660,206209,13,1,12,7.0,31477,7,0,both
32434487,2977660,206209,13,1,12,7.0,6567,8,0,both
32434488,2977660,206209,13,1,12,7.0,22920,9,0,both


In [13]:
# dataframe size as a (rows, columns) tuple
df_ords_prods_combined.shape

(32434489, 10)

In [14]:
# nominal columns (identifiers and the reorder flag) are labels, not
# quantities: make sure each one is stored as a string
for nominal_column in ('order_id', 'user_id', 'product_id', 'reordered'):
    df_ords_prods_combined[nominal_column] = df_ords_prods_combined[nominal_column].astype(str)

In [15]:
# summary stats for the numeric columns of 'df_ords_prods_combined'
# NOTE(review): in the displayed output 'days_since_prior_order' has a lower
# count than the other columns, i.e. it contains missing values (the preview
# rows with order_number 1 show NaN there) — presumably first orders; confirm.
df_ords_prods_combined.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order
count,32434490.0,32434490.0,32434490.0,30356420.0,32434490.0
mean,17.14205,2.738818,13.42498,11.10407,8.351076
std,17.53504,2.090049,4.246365,8.778914,7.126671
min,1.0,0.0,0.0,0.0,1.0
25%,5.0,1.0,10.0,5.0,3.0
50%,11.0,3.0,13.0,8.0,6.0
75%,24.0,5.0,16.0,15.0,11.0
max,99.0,6.0,23.0,30.0,145.0


# COMBINING DATA

In [16]:
# inner-join 'df_ords_prods_combined' with 'df_prods' on 'product_id', keeping
# only rows whose key exists in both dataframes; the merge indicator is stored
# in the '_merge_prods_info' column
df_orders_products_merged = pd.merge(
    df_ords_prods_combined,
    df_prods,
    how='inner',
    on='product_id',
    indicator='_merge_prods_info',
)

In [17]:
# merge flag frequency counts
# NOTE(review): the merge uses pandas' default inner join, so the flag can only
# ever be 'both'; unmatched rows are dropped rather than flagged. The displayed
# counts (32404859) are lower than the pre-merge row count (32434489), so some
# order rows had no matching product — confirm this loss is intended.
df_orders_products_merged['_merge_prods_info'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge_prods_info, dtype: int64

In [18]:
# preview the top 5 rows of the merged 'df_orders_products_merged' dataframe
df_orders_products_merged.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge_prods_ords,product_name,aisle_id,department_id,prices,_merge_prods_info
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,both


In [19]:
# 'df_orders_products_merged' dataframe size as a (rows, columns) tuple
df_orders_products_merged.shape

(32404859, 15)

# EXPORTING DATA

In [None]:
# write 'df_orders_products_merged' to '<path>/02 Data/Prepared Data' as a pickle
merged_pickle_path = os.path.join(path, '02 Data', 'Prepared Data', 'c_orders_products_merged.pkl')
df_orders_products_merged.to_pickle(merged_pickle_path)