# CONTENTS

- IMPORTING DATA
- EXPLORING DATA
- GROUPING AND AGGREGATING DATA
- SPENDING HABITS ANALYSIS
- EXPORTING DATA

# IMPORTING DATA

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# initialize path variable
path = r'/Users/smac_/Documents/Courses/CF Data Analytics Program/Data Immersion/Achievement 4/05-2023 Instacart Basket Analysis'

In [3]:
# import data
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'd_orders_products_derived.pkl'))

# EXPLORING DATA

In [4]:
# column names and data types
ords_prods_merge.dtypes

order_id                   object
user_id                    object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
product_id                 object
add_to_cart_order           int64
reordered                  object
product_name               object
aisle_id                   object
department_id              object
prices                    float64
price_range                object
busiest_day                object
busiest_days               object
busiest_period_of_day      object
dtype: object

In [5]:
# first 5 rows
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Fewest orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Fewest orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders


In [6]:
# last 5 rows
ords_prods_merge.tail()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day
32404854,1320836,202557,17,2,15,1.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range product,Regularly busy,Regularly busy,Most orders
32404855,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range product,Regularly busy,Regularly busy,Most orders
32404856,758936,203436,1,2,7,,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,Mid-range product,Regularly busy,Regularly busy,Fewest orders
32404857,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,Mid-range product,Regularly busy,Least busy,Fewest orders
32404858,3093936,205420,1,4,14,,28818,8,0,Hot Oatmeal Multigrain Raisin,130,14,10.3,Mid-range product,Least busy,Least busy,Most orders


In [7]:
# dataframe size
ords_prods_merge.shape

(32404859, 17)

# GROUPING & AGGREGATING DATA

## COMPARE AGGREGATED VALUES BETWEEN SUBSET DATAFRAME AND FULL DATAFRAME

In [8]:
# aggregate the mean from 'order_number' grouped by 'department_id'
ords_prods_merge.groupby('department_id').agg({'order_number' : ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.457838
10,20.197148
11,16.170638
12,15.887671
13,16.583536
14,16.773669
15,16.165037
16,17.665606
17,15.694469
18,19.310397


The results of the calculation on the entire dataframe show additional `'department_id'` values that were not included in the subset dataframe due to its size limitations. Additionally, the mean values of `'order_number'` for each `'department_id'` in the entire dataframe are slightly different when compared to the subset dataframe.

## CREATE A LOYALTY FLAG BASED ON MAXIMUN NUMBER OF ORDERS FOR EACH CUSTOMER

In [9]:
# add aggregated column to dataframe with maximum values from 'order_number' grouped by 'user_id'
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)

### CRITERIA

- If the maximum orders the user has made is over 40, then the customer will be labeled a “Loyal customer.”
- If the maximum orders the user has made is over 10 but less than or equal to 40, then the customer will be labeled a “Regular customer.”
- If the maximum orders the user has made is less than or equal to 10, then the customer will be labeled a “New customer.”

In [10]:
# add flag column to dataframe based on 'max_order' values using conditional statements

In [11]:
ords_prods_merge.loc[ords_prods_merge['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'

In [12]:
ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'

In [13]:
ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [14]:
# frequency counts in 'loyalty_flag'
ords_prods_merge['loyalty_flag'].value_counts()

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

## CREATE SPENDING FLAG BASED ON AVERAGE PRODUCT PRICE ACROSS ALL ORDERS FROM EACH CUSTOMER

In [15]:
# add aggregated column to dataframe with mean values from 'price' groupeed by 'user_id'
ords_prods_merge['mean_spending'] = ords_prods_merge.groupby(['user_id'])['prices'].transform(np.mean)

### CRITERIA

- If the mean of the prices of products purchased by a user is lower than 10, then flag them as a “Low spender.”
- If the mean of the prices of products purchased by a user is higher than or equal to 10, then flag them as a “High spender.”

In [16]:
# add flag column to dataframe based on 'mean_spending' values using conditional statements

In [17]:
ords_prods_merge.loc[ords_prods_merge['mean_spending'] < 10, 'spending_flag'] = 'Low spender'

In [18]:
ords_prods_merge.loc[ords_prods_merge['mean_spending'] >= 10, 'spending_flag'] = 'High spender'

In [19]:
# frequency counts in 'spending_flag'
ords_prods_merge['spending_flag'].value_counts()

Low spender     32285150
High spender      119709
Name: spending_flag, dtype: int64

## CREATE FREQUENCY FLAG BASED ON MEDIAN NUMBER OF DAYS THAT HAVE PASSED SINCE EACH CUSTOMER MADE THEIR LAST ORDER

In [20]:
# add aggregated column to dataframe with median values from 'days_since_prior_order' grouped by 'user_id'
ords_prods_merge['median_frequency'] = ords_prods_merge.groupby(['user_id'])['days_since_prior_order'].transform(np.median)

### CRITERIA

- If the median of “days_since_prior_order” is higher than 20, then the customer should be labeled a “Non-frequent customer.”
- If the median is higher than 10 and lower than or equal to 20, then the customer should be labeled a “Regular customer.”
- If the median is lower than or equal to 10, then the customer should be labeled a “Frequent customer.”


In [21]:
# add flag column to dataframe based on 'median_frequency' values using conditional statements

In [22]:
ords_prods_merge.loc[ords_prods_merge['median_frequency'] > 20, 'frequency_flag'] = 'Non-frequent customer'

In [23]:
ords_prods_merge.loc[(ords_prods_merge['median_frequency'] <= 20) & (ords_prods_merge['median_frequency'] > 10), 'frequency_flag'] = 'Regular customer'

In [24]:
ords_prods_merge.loc[ords_prods_merge['median_frequency'] <= 10, 'frequency_flag'] = 'Frequent customer'

In [25]:
# frequency counts in 'frequency_flag'
ords_prods_merge['frequency_flag'].value_counts()

Frequent customer        21559853
Regular customer          7208564
Non-frequent customer     3636437
Name: frequency_flag, dtype: int64

# SPENDING HABITS ANALYSIS

In [26]:
# display first 5 rows
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_spending,spending_flag,median_frequency,frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy,Fewest orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy,Fewest orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [27]:
# calculate basic stats in 'prices' grouped by 'loyalty_flag'
ords_prods_merge.groupby('loyalty_flag').agg({'prices' : ['min', 'max', 'mean', 'std']})

Unnamed: 0_level_0,prices,prices,prices,prices
Unnamed: 0_level_1,min,max,mean,std
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Loyal customer,1.0,25.0,7.773526,4.205881
New customer,1.0,25.0,7.801132,4.266445
Regular customer,1.0,25.0,7.798195,4.254534


The summary statistics for all three customer types appear to be remarkably similar, indicating no significant differences in their spending habits. However, it is important to note that the relatively high standard deviation implies that the mean value might not be fully representative of the overall trends.

# EXPORTING DATA

In [None]:
# export manipulated dataframe as pickle
ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'e_orders_products_grouped.pkl'))