# Project: Instacart Basket Analysis
## Author: Cassy Stunkel
## Task 4.10, Part 1 - Customer Profiling

## Table of Contents
## 01. Import Libraries and Dataset
## 02. Check columns and Reduce DF
## 03. Customer Profiling Based on Age
## 04. Customer Profiling Based on Dependents
## 05. Customer Profiling Based on Income

## 01. Import Libraries and Dataset

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Define path
path = r'/Users/cassystunkel/Documents/Instacart Basket Analysis'

In [3]:
# Import most recent dataset (excluding low-activity customers)
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'no_low_activity.pkl'))

## 02. Check columns and reduce df to avoid RAM issues

In [6]:
# Check df size
df.shape

(24436791, 32)

In [5]:
# Check columns
df.dtypes

product_id                  int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
first_order                  bool
add_to_cart_order           int64
reordered                   int64
Busiest day                object
Busiest days               object
busiest_period_of_day      object
max_order                   int64
loyalty_flag               object
mean_spend                float64
spend_flag                 object
median_order_frequency    float64
customer_frequency         object
gender                     object
state                      object
age                         int64
date_joined                object
number_of_dependants        int64
fam_status                 object
income        

In [7]:
# Create subset to avoid RAM issues
variables = ['order_id', 'user_id', 'days_since_prior_order', 'department_id', 'prices', 'age', 'number_of_dependants', 'income', 'region']
df_prof = df.loc[:, variables]

In [8]:
# Check dataframe size
df_prof.shape

(24436791, 9)

In [9]:
# Check column heads
df_prof.head()

Unnamed: 0,order_id,user_id,days_since_prior_order,department_id,prices,age,number_of_dependants,income,region
0,3139998,138,3.0,19,5.8,81,1,49620,Midwest
1,1977647,138,20.0,19,5.8,81,1,49620,Midwest
3,2254091,138,6.0,12,20.0,81,1,49620,Midwest
4,505689,138,7.0,10,12.9,81,1,49620,Midwest
5,960220,138,19.0,4,1.7,81,1,49620,Midwest


## 03. Customer profiling based on age

* For the 'age_profile' variable, there will be three categories:
    * Young adult (ages 18-39)
    * Adult (ages 40-64)
    * Senior (ages 65+)

In [11]:
# Evaluate the 'age' column
df_prof['age'].describe()

count    2.443679e+07
mean     4.945715e+01
std      1.848524e+01
min      1.800000e+01
25%      3.300000e+01
50%      4.900000e+01
75%      6.500000e+01
max      8.100000e+01
Name: age, dtype: float64

* Minimum age of 18 and maximum age of 81

In [12]:
# Create age conditions

In [13]:
#If the user is <= 39 years of age, 'age_profile' will reflect 'Young adult'
df_prof.loc[df_prof['age'] <= 39, 'age_profile'] = 'Young adult'

In [14]:
# If the user is between the ages of 40 and 64 (or equal to either of the 2 ages), 'age_profile' will reflect 'Adult'
df_prof.loc[(df_prof['age'] >= 40) & (df_prof['age'] <= 64), 'age_profile'] = 'Adult'

In [15]:
# If the user is of age 65 years or older, 'age_profile' will reflect 'Senior'
df_prof.loc[df_prof['age'] >= 65, 'age_profile'] = 'Senior'

In [16]:
# Check the output of new column 'age_profile'
df_prof['age_profile'].value_counts(dropna = False)

age_profile
Adult          9546341
Young adult    8426468
Senior         6463982
Name: count, dtype: int64

In [18]:
# Check flags
df_prof.groupby('age_profile').agg({'age' : ['min', 'max']})

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,min,max
age_profile,Unnamed: 1_level_2,Unnamed: 2_level_2
Adult,40,64
Senior,65,81
Young adult,18,39


In [19]:
# Check results (head and tail)
df_prof[['age', 'age_profile']].head()

Unnamed: 0,age,age_profile
0,81,Senior
1,81,Senior
3,81,Senior
4,81,Senior
5,81,Senior


In [20]:
df_prof[['age', 'age_profile']].tail()

Unnamed: 0,age,age_profile
32434196,68,Senior
32434197,68,Senior
32434198,68,Senior
32434199,68,Senior
32434200,68,Senior


#### Age profiles correctly assigned to all users

## 04. Customer profiling based on dependents

* For the 'dependents_profile' variable, there will be two categories:
    * No children (0 dependents)
    * Children (1 or more dependents)

In [22]:
# Evaluate the 'number_of_dependants' column
df_prof['number_of_dependants'].describe()

count    2.443679e+07
mean     1.501974e+00
std      1.119177e+00
min      0.000000e+00
25%      0.000000e+00
50%      2.000000e+00
75%      3.000000e+00
max      3.000000e+00
Name: number_of_dependants, dtype: float64

* Minimum number of dependents is 0 and maximum is 3

In [23]:
# Create 'dependents_profile' conditions

In [26]:
# If the user's 'number_of_dependants' is equal to 0, 'dependents_profile' will reflect 'No children'
df_prof.loc[df_prof['number_of_dependants'] == 0, 'dependents_profile'] = 'No children'

In [27]:
# If the user's 'number_of_dependants' is greater than or equal to 1, 'dependents_profile' will reflect 'Children'
df_prof.loc[df_prof['number_of_dependants'] >= 1, 'dependents_profile'] = 'Children'

In [28]:
# Check the output of new column 'dependents_profile'
df_prof['dependents_profile'].value_counts(dropna = False)

dependents_profile
Children       18326004
No children     6110787
Name: count, dtype: int64

In [29]:
# Check flags
df_prof.groupby('dependents_profile').agg({'number_of_dependants' : ['min', 'max']})

Unnamed: 0_level_0,number_of_dependants,number_of_dependants
Unnamed: 0_level_1,min,max
dependents_profile,Unnamed: 1_level_2,Unnamed: 2_level_2
Children,1,3
No children,0,0


In [30]:
# Check results (head and tail)
df_prof[['number_of_dependants', 'dependents_profile']].head()

Unnamed: 0,number_of_dependants,dependents_profile
0,1,Children
1,1,Children
3,1,Children
4,1,Children
5,1,Children


In [31]:
df_prof[['number_of_dependants', 'dependents_profile']].tail()

Unnamed: 0,number_of_dependants,dependents_profile
32434196,3,Children
32434197,3,Children
32434198,3,Children
32434199,3,Children
32434200,3,Children


#### Dependents profile correctly assigned to all users

## 05. Customer profiling based on income

* For the 'income_profile' variable, there will be three categories:
    * Income less than or equal to 60K will be assigned 'Low income'
    * Income greater than 60k but less than 250K will be be assigned 'Middle income'
    * Income equal to or greater than 250K will be assigned 'High income'

In [32]:
# Evaluate the 'income' column
df_prof['income'].describe()

count    2.443679e+07
mean     9.989217e+04
std      4.333079e+04
min      2.590300e+04
25%      6.752100e+04
50%      9.683200e+04
75%      1.281600e+05
max      5.939010e+05
Name: income, dtype: float64

* User income ranges from 26K to approximately 600K

In [33]:
# Create 'income_profile' conditions
# If the user's income is less than or equal to 60K, 'income_profile' will reflect 'Low income'
# If the user's income is between 60K and 250K, 'income_profile' will reflect 'Middle income'
# If the user's income is 250K or more, 'income_profile' will reflect 'High income'

result = []
for value in df_prof['income']:
    if value <= 60000:
        result.append('Low income')
    elif value >= 250000:
        result.append('High income')
    else:
        result.append('Middle income')

In [34]:
# Check results
result

['Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low income',
 'Low inco

In [35]:
# Adding results to df
df_prof['income_profile'] = result

In [36]:
# Check results to make sure new column 'income_profile' added
df_prof.head()

Unnamed: 0,order_id,user_id,days_since_prior_order,department_id,prices,age,number_of_dependants,income,region,age_profile,dependents_profile,income_profile
0,3139998,138,3.0,19,5.8,81,1,49620,Midwest,Senior,Children,Low income
1,1977647,138,20.0,19,5.8,81,1,49620,Midwest,Senior,Children,Low income
3,2254091,138,6.0,12,20.0,81,1,49620,Midwest,Senior,Children,Low income
4,505689,138,7.0,10,12.9,81,1,49620,Midwest,Senior,Children,Low income
5,960220,138,19.0,4,1.7,81,1,49620,Midwest,Senior,Children,Low income


In [37]:
# Check output of new column 'income_profile'
df_prof['income_profile'].value_counts(dropna = False)

income_profile
Middle income    19767835
Low income        4519867
High income        149089
Name: count, dtype: int64

In [38]:
# Check flags
df_prof.groupby('income_profile').agg({'income' : ['min', 'max']})

Unnamed: 0_level_0,income,income
Unnamed: 0_level_1,min,max
income_profile,Unnamed: 1_level_2,Unnamed: 2_level_2
High income,250190,593901
Low income,25903,60000
Middle income,60004,249904


In [39]:
# Check results (head and tail)
df_prof[['income', 'income_profile']].head()

Unnamed: 0,income,income_profile
0,49620,Low income
1,49620,Low income
3,49620,Low income
4,49620,Low income
5,49620,Low income


In [40]:
df_prof[['income', 'income_profile']].tail()

Unnamed: 0,income,income_profile
32434196,37867,Low income
32434197,37867,Low income
32434198,37867,Low income
32434199,37867,Low income
32434200,37867,Low income


#### Income profile correctly assigned to all users

In [41]:
# Export dataframe
df_prof.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'customer_profiling.pkl'))