1) Import Libraries
2) Import Data
3) Wrangling, Consistency and Quality Checks
    - Dropping date_joined column
    - renaming columns
        - Surname
        - State
        - First Name
        - Gender
        - Age
    - Checking for Duplicates
    - Checking for Null values
    - Checking for mixed data types
4) Merge Dataframes
    - Merge df_cust with ords_prods_merge
    - Check Null values
    - Check merge with merge flag value counts
    - Drop merge flag
    - Export new combined dataframe

# 01 Import Libraries

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02 Import Data

In [2]:
# Root folder of the project; the data files read and written in this notebook
# are located relative to it via os.path.join(path, 'Data', ...).
# NOTE(review): absolute, machine-specific path - the notebook only runs as-is
# on a machine where this folder exists.
path = r'C:\Users\benar\CareerFoundry\09-2023 Instacart Basket Analysis'

In [3]:
df_cust = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'customers.csv'))

In [4]:
ords_prods_merge = pd.read_pickle(os.path.join(path, 'Data', 'Prepared Data', 'ords_prods_merged_updated_2.pkl'))

In [5]:
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
df_cust.shape

(206209, 10)

# 03 Wrangling, Consistency and Quality Checks

In [7]:
#Checking unique counts for date_joined column
df_cust['date_joined'].describe()

count        206209
unique         1187
top       9/17/2018
freq            213
Name: date_joined, dtype: object

Dropping the 'date_joined' column: it will not be useful for the analysis because there are no other date columns to compare it against.

In [8]:
# Remove 'date_joined' from the customer data (it is not needed for the analysis).
# Running this cell a second time raises a KeyError because the column is already gone.
df_cust = df_cust.drop('date_joined', axis = 'columns')

In [9]:
#Checking data types
df_cust.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [10]:
#Checking values of fam_status
df_cust['fam_status'].value_counts()

fam_status
married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: count, dtype: int64

In [11]:
# Fix the misspelled 'Surnam' header and bring it to lowercase.
df_cust.rename(columns = dict(Surnam = 'surname'), inplace = True)

In [12]:
df_cust.head()

Unnamed: 0,user_id,First Name,surname,Gender,STATE,Age,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1,married,40374


In [13]:
df_cust['STATE'].describe()

count      206209
unique         51
top       Florida
freq         4044
Name: STATE, dtype: object

In [14]:
# Bring the upper-case 'STATE' header to lowercase.
df_cust.rename(columns = dict(STATE = 'state'), inplace = True)

In [15]:
# Rename the First Name, Gender, and Age columns in a single pass so that all
# customer columns use the same lowercase naming as the ords_prods_merge dataframe.
# The result is reassigned instead of using inplace = True.
df_cust = df_cust.rename(columns = {
    'First Name': 'first_name',
    'Gender': 'gender',
    'Age': 'age',
})

In [16]:
df_cust.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1,married,40374


In [17]:
#Checking for outliers in numeric columns
df_cust.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [18]:
#Checking null values
df_cust.isnull().sum()

user_id             0
first_name      11259
surname             0
gender              0
state               0
age                 0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [19]:
# Show the customer rows whose 'first_name' value is missing.
df_cust.loc[df_cust['first_name'].isnull()]

Unnamed: 0,user_id,first_name,surname,gender,state,age,n_dependants,fam_status,income
53,76659,,Gilbert,Male,Colorado,26,2,married,41709
73,13738,,Frost,Female,Louisiana,39,0,single,82518
82,89996,,Dawson,Female,Oregon,52,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1,married,155673
105,29778,,Dawson,Female,Utah,63,3,married,151819
...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,1,married,45275
206162,187532,,Floyd,Female,California,39,0,single,56325


Not dropping the rows with a missing first name: all of their other data is usable, and 'first_name' is not needed for the analysis.

In [20]:
# Inspect the customers labelled 'living with parents and siblings' to see
# whether they have dependants recorded.
df_cust.loc[df_cust['fam_status'].eq('living with parents and siblings')]

Unnamed: 0,user_id,first_name,surname,gender,state,age,n_dependants,fam_status,income
10,26441,Gloria,Stafford,Female,Nevada,19,2,living with parents and siblings,43443
64,58111,Michelle,Hayes,Female,Nebraska,20,1,living with parents and siblings,45285
91,202887,John,Harris,Male,District of Columbia,20,3,living with parents and siblings,84380
97,180919,Jacqueline,Roach,Female,Nebraska,21,3,living with parents and siblings,56831
135,173192,Anne,Santana,Female,Virginia,21,2,living with parents and siblings,78404
...,...,...,...,...,...,...,...,...,...
206134,143976,Juan,Benson,Male,Arizona,19,2,living with parents and siblings,66098
206143,196680,Jack,Love,Male,New Hampshire,20,2,living with parents and siblings,76384
206145,29801,Patricia,Hart,Female,Kansas,20,1,living with parents and siblings,63370
206152,15246,Brenda,Santiago,Female,Washington,18,2,living with parents and siblings,66108


In [21]:
df_cust['gender'].value_counts()

gender
Male      104067
Female    102142
Name: count, dtype: int64

In [22]:
df_cust['income'].describe()

count    206209.000000
mean      94632.852548
std       42473.786988
min       25903.000000
25%       59874.000000
50%       93547.000000
75%      124244.000000
max      593901.000000
Name: income, dtype: float64

In [23]:
# Show every row that is an exact repeat of an earlier row (all columns equal).
df_cust.loc[df_cust.duplicated(keep = 'first')]

Unnamed: 0,user_id,first_name,surname,gender,state,age,n_dependants,fam_status,income


No duplicate rows found.

In [24]:
#Checking for mixed type data
# A column is printed when at least one of its values has a Python type that
# differs from the type of the column's first value.
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1 and emits a FutureWarning.
for col in df_cust.columns.tolist():
    value_types = df_cust[col].map(type)
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

first_name


This is expected: the 'first_name' column is flagged as mixed-type because its missing values (NaN) have a different type than the strings in the rest of the column.

In [25]:
#Checking data types of ords_prods_merge for merging dataframes
ords_prods_merge.dtypes

product_id                  int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
add_to_cart_order           int64
reordered                   int64
price_range_loc            object
busiest_day                object
busiest_days               object
busiest_period_of_day      object
max_order                   int64
loyalty_flag               object
mean_spend                float64
spender_flag               object
median_order_frequency    float64
order_frequency_flag       object
dtype: object

The 'user_id' column is the key shared by both dataframes, and it has the same data type (int64) in each.

In [26]:
ords_prods_merge.shape

(32404859, 23)

In [27]:
df_cust.shape

(206209, 9)

Documenting shape of dataframes before merge.

# 04 Merge Dataframes

In [28]:
#6 Combine customer data with other instacart data.
# Inner join on the shared 'user_id' key (how = 'inner' is the pandas default,
# spelled out here for clarity).
# validate = 'one_to_many' makes pandas raise a MergeError if a user_id occurs
# more than once in df_cust, which would otherwise silently multiply order rows.
# Note: with an inner join the '_merge' indicator can only ever be 'both',
# because unmatched keys are dropped; compare the resulting row count with the
# shapes of the inputs to detect lost rows.
df_merge_large = df_cust.merge(
    ords_prods_merge,
    on = 'user_id',
    how = 'inner',
    validate = 'one_to_many',
    indicator = True,
)

In [29]:
df_merge_large.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,n_dependants,fam_status,income,product_id,...,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_spend,spender_flag,median_order_frequency,order_frequency_flag,_merge
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,196,...,Regularly busy,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
1,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,196,...,Regularly busy,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
2,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,196,...,Regularly busy,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
3,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,6184,...,Regularly busy,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
4,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,6184,...,Regularly busy,Slowest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both


In [30]:
# Checking for null values
df_merge_large.isnull().sum()

user_id                         0
first_name                1775118
surname                         0
gender                          0
state                           0
age                             0
n_dependants                    0
fam_status                      0
income                          0
product_id                      0
product_name                    0
aisle_id                        0
department_id                   0
prices                       5127
order_id                        0
order_number                    0
orders_day_of_week              0
order_hour_of_day               0
days_since_prior_order    2076096
add_to_cart_order               0
reordered                       0
price_range_loc                 0
busiest_day                     0
busiest_days                    0
busiest_period_of_day           0
max_order                       0
loyalty_flag                    0
mean_spend                      0
spender_flag                    0
median_order_f

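The null values are expected: the missing 'first_name' values carry over from the customer data, and the nulls in 'prices' and 'days_since_prior_order' belong to columns that come from ords_prods_merge. No other column contains nulls.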

In [31]:
# Checking shape of new dataframe
df_merge_large.shape

(32404859, 32)

In [33]:
# Confirm the results of the merge using the merge flag.
df_merge_large['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [34]:
# Create a copy of the merged dataframe without the '_merge' indicator column.
df_merge_large_no_flag = df_merge_large.drop('_merge', axis = 'columns')

In [35]:
df_merge_large_no_flag.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,n_dependants,fam_status,income,product_id,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_spend,spender_flag,median_order_frequency,order_frequency_flag
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,196,...,Mid-range product,Regularly busy,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
1,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,196,...,Mid-range product,Regularly busy,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
2,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,196,...,Mid-range product,Regularly busy,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
3,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,6184,...,Low-range product,Regularly busy,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
4,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665,6184,...,Low-range product,Regularly busy,Slowest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer


In [36]:
df_merge_large_no_flag.shape

(32404859, 31)

In [38]:
#8 Export Data
# Save the merged dataframe (without the merge flag) as a pickle file in the
# 'Prepared Data' folder of the project.
df_merge_large_no_flag.to_pickle(
    os.path.join(path, 'Data', 'Prepared Data', 'ords_prods_cust_merged.pkl')
)