# CONTENTS

- IMPORTING DATA
- EXPLORING DATA
- DATA WRANGLING
- SUBSETTING
- BUSINESS QUESTIONS
- EXPORTING DATA

# IMPORTING DATA

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# initialize path variable
path = r'/Users/smac_/Documents/courses/CF Data Analytics Program/Data Immersion/Achievement 4/05-2023 Instacart Basket Analysis'

In [3]:
# import data
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data','orders.csv'))
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'))
df_depts = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'))

# EXPLORING DATA

## `'df_ords'` DATAFRAME

In [4]:
# column names & datatypes
df_ords.dtypes

order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [5]:
# first 5 rows
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [6]:
# last 5 rows
df_ords.tail()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0
3421082,272231,206209,train,14,6,14,30.0


In [7]:
# dataframe size
df_ords.shape

(3421083, 7)

## `'df_prods'` DATAFRAME

In [8]:
# column names & data types
df_prods.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [9]:
# first 5 rows
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [10]:
# last 5 rows
df_prods.tail()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49692,49688,Fresh Foaming Cleanser,73,11,13.5


In [11]:
# dataframe size
df_prods.shape

(49693, 5)

## `'df_depts'` DATAFRAME

In [12]:
# column names & data types
df_depts.dtypes

department_id    object
1                object
2                object
3                object
4                object
5                object
6                object
7                object
8                object
9                object
10               object
11               object
12               object
13               object
14               object
15               object
16               object
17               object
18               object
19               object
20               object
21               object
dtype: object

In [13]:
# first 5 rows
df_depts.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [14]:
# last 5 rows
df_depts.tail()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [15]:
# dataframe size
df_depts.shape

(1, 22)

# DATA WRANGLING

## `'df_ords'` DATAFRAME

In [16]:
# drop 'eval_set' column
df_ords = df_ords.drop(columns = ['eval_set'])

In [17]:
# frequency counts in 'days_since_prior_order' (including NaN)
df_ords['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

In [18]:
# rename 'order_dow' column
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [19]:
# rename 'days_since_prior_order' without overwriting dataframe
df_ords.rename(columns = {'days_since_prior_order' : 'days_since_last_order'})

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [20]:
# cast nominal columns as string
df_ords['order_id'] = df_ords['order_id'].astype('str')
df_ords['user_id'] = df_ords['user_id'].astype('str')

## `'df_depts'` DATAFRAME

In [21]:
# transposing df_depts to long format
df_depts_t = df_depts.T
df_depts_t.reset_index() # add an index
new_header = df_depts_t.iloc[0] # initialize variable containing the row to act as the new header
df_depts_final = df_depts_t [1:] # initialize a new dataframe excluding the header row
df_depts_final.columns = new_header # assign the new header row
df_depts_final

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# SUBSETTING

In [22]:
# initialize data dictionary from 'df_depts_final' dataframe using index as keys for entries
data_dict = df_depts_final.to_dict('index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [23]:
# initialize subset from 'df_prods' dataframe only containing data from the breakfast department
df_bkfst = df_prods[df_prods['department_id'] == 14]
df_bkfst.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


In [24]:
# initialize a subset from the 'df_prods' dataframe ony containing records from the alcohol, deli, beverages, and meat/seafood departments
df_dinner_party_prods = df_prods.loc[df_prods['department_id'].isin([5, 20, 7, 12])]

In [25]:
# count of records in 'df_dinner_party_prods' dataframe
df_dinner_party_prods.shape

(7650, 5)

# BUSINESS QUESTIONS

### WHAT IS THE BUSIEST HOUR FOR PLACING ORDERS AMONGST COSTUMERS?

In [26]:
# frequency counts of 'order_hour_of_day' column
df_ords['order_hour_of_day'].value_counts()

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

The busiest hour for placing orders amongst customers is **10:00am**.

### WHAT IS THE MEANING OF THE VALUE 4 IN `'department_id'`COLUMN FROM `'df_prods'` DATAFRAME?

In [27]:
# retrieve data dictionary
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [28]:
# determine the meaning of departmment_id = 4 using data dictionary
print(data_dict.get('4'))

{'department': 'produce'}


The department_id '4' refers to the **Produce** department.

### INVESTIGATE CUSTOMER '1'

In [29]:
# investigate customer with id '1'
df_customer_1 = df_ords[df_ords['user_id'] == '1']
print(df_customer_1)

   order_id user_id  order_number  orders_day_of_week  order_hour_of_day  \
0   2539329       1             1                   2                  8   
1   2398795       1             2                   3                  7   
2    473747       1             3                   3                 12   
3   2254736       1             4                   4                  7   
4    431534       1             5                   4                 15   
5   3367565       1             6                   2                  7   
6    550135       1             7                   1                  9   
7   3108588       1             8                   1                 14   
8   2295261       1             9                   1                 16   
9   2550362       1            10                   4                  8   
10  1187899       1            11                   4                  8   

    days_since_prior_order  
0                      NaN  
1                     15.0  


In [30]:
# cast nominal columns from 'df_customer_1' dataframe as string
df_customer_1['order_id'] = df_customer_1['order_id'].astype(str)
df_customer_1['user_id'] = df_customer_1['user_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_customer_1['order_id'] = df_customer_1['order_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_customer_1['user_id'] = df_customer_1['user_id'].astype(str)


In [31]:
# return descriptive statistics from 'df_customer_1' dataframe
df_customer_1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


Customer #1 has made **11 orders** in total. All the orders were made among the days **Sunday and Wednesday**, between **7:00am and 4:00pm**. It waits **19 days on average** before making another order.

# EXPORTING DATA

In [None]:
# export the 'df_ords' & 'df_depts_final' dataframes as CSV
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'a_orders_wrangled.csv'), index = False)
df_depts_final.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'a_departments_wrangled.csv'))