# Importing libraries

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
path = r"C:\Users\cavba\Documents\Instacart Basket Analysis"

In [3]:
customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv' ), index_col = False)

In [4]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [5]:
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
customers.tail()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095


In [7]:
customers.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

### Updating column headings

In [8]:
#Changing heading from Surname to LastName
customers = customers.rename(columns={'Surnam': 'LastName'})

In [9]:
# Changing columns name from n_dependents to number_of_dependants
customers = customers.rename(columns={'n_dependants':'number_of_dependants'})

In [10]:
#changing fam_status to family_status
customers = customers.rename(columns={'fam_status':'family_status'})

In [11]:
customers.head()

Unnamed: 0,user_id,First Name,LastName,Gender,STATE,Age,date_joined,number_of_dependants,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


Changing the headings makes it easier to understand what each field represents. I'm choosing to leave all exisitng columns in the data set because they could be used for different comparisons later. 

In [12]:
customers.describe()

Unnamed: 0,user_id,Age,number_of_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


## Data Consistency Checks

In [13]:
# Check for mixed types
for col in customers.columns.tolist():
  weird = (customers[[col]].applymap(type) != customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (customers[weird]) > 0:
    print (col)

First Name


Since the First Name data type is already set as a string I will leave it as is

In [14]:
# Checking for empty fields
customers.isnull().sum()

user_id                     0
First Name              11259
LastName                    0
Gender                      0
STATE                       0
Age                         0
date_joined                 0
number_of_dependants        0
family_status               0
income                      0
dtype: int64

In [15]:
# Creating a subset of customers to view missing values
df_nan = customers[customers['First Name'].isnull()==True]

In [16]:
df_nan

Unnamed: 0,user_id,First Name,LastName,Gender,STATE,Age,date_joined,number_of_dependants,family_status,income
53,76659,,Gilbert,Male,Colorado,26,1/1/2017,2,married,41709
73,13738,,Frost,Female,Louisiana,39,1/1/2017,0,single,82518
82,89996,,Dawson,Female,Oregon,52,1/1/2017,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1/1/2017,1,married,155673
105,29778,,Dawson,Female,Utah,63,1/1/2017,3,married,151819
...,...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3/31/2020,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,4/1/2020,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,4/1/2020,1,married,45275
206162,187532,,Floyd,Female,California,39,4/1/2020,0,single,56325


In [17]:
customers.shape

(206209, 10)

In [18]:
#Creating a subset without null First Name values
customers_clean = customers[customers['First Name'].isnull()==False]

In [19]:
customers_clean.shape

(194950, 10)

The shape if the subset is accurate excluding all of the empty First Name fields

In [20]:
#Checking to make sure all null values are gone
customers_clean.isnull().sum()

user_id                 0
First Name              0
LastName                0
Gender                  0
STATE                   0
Age                     0
date_joined             0
number_of_dependants    0
family_status           0
income                  0
dtype: int64

In [21]:
# Checking for duplicates
df_dups = customers_clean[customers_clean.duplicated()]

In [22]:
df_dups

Unnamed: 0,user_id,First Name,LastName,Gender,STATE,Age,date_joined,number_of_dependants,family_status,income


In [23]:
df_dups.shape

(0, 10)

There are no duplicates in the custoemrs_clean data set

In [24]:
#Exporting cleaned customers data set to prepared data folder
customers_clean.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'customers_cleaned'))

In [24]:
customers_clean.head()

Unnamed: 0,user_id,First Name,LastName,Gender,STATE,Age,date_joined,number_of_dependants,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [30]:
ords_prods_merge = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_frequencies.csv'))

In [31]:
ords_prods_merge.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,...,price_range_loc,busiest day,Result,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_order,frequency_flag
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,prior,28,...,Mid-range product,Regular busy,regular days,Most orders,32,Regular customer,14.790541,High Spender,8.0,Frequent Customer
1,1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,prior,30,...,Mid-range product,Regular busy,regular days,Most orders,32,Regular customer,14.790541,High Spender,8.0,Frequent Customer
2,2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,prior,2,...,Mid-range product,Busiest day,busiest days,Average orders,5,New customer,3.0625,Low Spender,8.0,Frequent Customer
3,3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,prior,1,...,Mid-range product,Regular busy,slowest days,Most orders,3,New customer,2.017241,Low Spender,9.0,Frequent Customer
4,4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,prior,3,...,Mid-range product,Least busy,slowest days,Most orders,3,New customer,2.017241,Low Spender,9.0,Frequent Customer


In [32]:
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19403302 entries, 0 to 19403301
Data columns (total 26 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0              int64  
 1   product_id              int64  
 2   product_name            object 
 3   aisle_id                int64  
 4   department_id           int64  
 5   prices                  float64
 6   order_id                int64  
 7   user_id                 int64  
 8   eval_set                object 
 9   order_number            int64  
 10  orders_day_of_the_week  int64  
 11  order_hour_of_day       int64  
 12  days_since_prior_order  float64
 13  add_to_cart_order       int64  
 14  reordered               int64  
 15  _merge                  object 
 16  price_range_loc         object 
 17  busiest day             object 
 18  Result                  object 
 19  busiest_period_of_day   object 
 20  max_order               int64  
 21  loyalty_flag            objec

In [33]:
ords_prods_merge.isnull().sum()

Unnamed: 0                      0
product_id                      0
product_name                    0
aisle_id                        0
department_id                   0
prices                          0
order_id                        0
user_id                         0
eval_set                        0
order_number                    0
orders_day_of_the_week          0
order_hour_of_day               0
days_since_prior_order    1238282
add_to_cart_order               0
reordered                       0
_merge                          0
price_range_loc                 0
busiest day                     0
Result                          0
busiest_period_of_day           0
max_order                       0
loyalty_flag                    0
avg_price                       0
spending_flag                   0
median_order                    4
frequency_flag                  4
dtype: int64

In [34]:
ords_prods_merge.drop(columns = ['Unnamed: 0'])

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,orders_day_of_the_week,...,price_range_loc,busiest day,Result,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_order,frequency_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,prior,28,6,...,Mid-range product,Regular busy,regular days,Most orders,32,Regular customer,14.790541,High Spender,8.0,Frequent Customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,prior,30,6,...,Mid-range product,Regular busy,regular days,Most orders,32,Regular customer,14.790541,High Spender,8.0,Frequent Customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,prior,2,0,...,Mid-range product,Busiest day,busiest days,Average orders,5,New customer,3.062500,Low Spender,8.0,Frequent Customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,prior,1,3,...,Mid-range product,Regular busy,slowest days,Most orders,3,New customer,2.017241,Low Spender,9.0,Frequent Customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,prior,3,4,...,Mid-range product,Least busy,slowest days,Most orders,3,New customer,2.017241,Low Spender,9.0,Frequent Customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19403297,29662,Potato Yukon Gold Organic,83,4,3.6,194185,203834,prior,33,6,...,Low-range product,Regular busy,regular days,Most orders,53,Loyal customer,26.995283,High Spender,5.0,Frequent Customer
19403298,29662,Potato Yukon Gold Organic,83,4,3.6,2082979,203834,prior,36,4,...,Low-range product,Least busy,slowest days,Most orders,53,Loyal customer,26.995283,High Spender,5.0,Frequent Customer
19403299,29662,Potato Yukon Gold Organic,83,4,3.6,239800,203834,prior,43,2,...,Low-range product,Regular busy,regular days,Most orders,53,Loyal customer,26.995283,High Spender,5.0,Frequent Customer
19403300,29662,Potato Yukon Gold Organic,83,4,3.6,2073065,203834,prior,52,3,...,Low-range product,Regular busy,slowest days,Most orders,53,Loyal customer,26.995283,High Spender,5.0,Frequent Customer


In [30]:
ords_prods_merge.shape

(32434489, 28)

In [36]:
df_nan=ords_prods_merge[ords_prods_merge['median_order'].isnull()==True]

In [37]:
df_nan

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,...,price_range_loc,busiest day,Result,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_order,frequency_flag
6234909,6234909,10749,Organic Red Bell Pepper,83,4,5.7,895835,159838,prior,1,...,Mid-range product,Busiest day,busiest days,Most orders,1,New customer,1.0,Low Spender,,
12947653,12947653,21334,Organic Peeled Garlic,123,4,10.2,895835,159838,prior,1,...,Mid-range product,Busiest day,busiest days,Most orders,1,New customer,1.0,Low Spender,,
13839012,13839012,22198,4X Ultra Concentrated Natural Laundry Detergen...,75,17,1.7,895835,159838,prior,1,...,Low-range product,Busiest day,busiest days,Most orders,1,New customer,1.0,Low Spender,,
14758536,14758536,23695,California Veggie Burger,42,1,4.7,895835,159838,prior,1,...,Low-range product,Busiest day,busiest days,Most orders,1,New customer,1.0,Low Spender,,


In [38]:
df_clean = ords_prods_merge[ords_prods_merge['median_order'].isnull()==False]

In [39]:
#Confirming shape is correct after removing null values in median_order field
df_clean.shape

(19403298, 26)

In [40]:
df_nan.shape

(4, 26)

In [41]:
ords_prods_merge.shape

(19403302, 26)

## Merging customers with orders_products_merge

In [42]:
customers_clean.shape

(194950, 10)

In [43]:
df_all = customers_clean.merge(df_clean, on =['user_id'])

In [44]:
df_all.head()

Unnamed: 0,user_id,First Name,LastName,Gender,STATE,Age,date_joined,number_of_dependants,family_status,income,...,price_range_loc,busiest day,Result,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_order,frequency_flag
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regular busy,busiest days,Most orders,8,New customer,5.111111,Low Spender,19.0,Regular Customer
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regular busy,regular days,Most orders,8,New customer,5.111111,Low Spender,19.0,Regular Customer
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regular busy,busiest days,Most orders,8,New customer,5.111111,Low Spender,19.0,Regular Customer
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Low-range product,Regular busy,regular days,Most orders,8,New customer,5.111111,Low Spender,19.0,Regular Customer
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Low-range product,Regular busy,slowest days,Most orders,8,New customer,5.111111,Low Spender,19.0,Regular Customer


In [45]:
df_all.isnull().sum()

user_id                         0
First Name                      0
LastName                        0
Gender                          0
STATE                           0
Age                             0
date_joined                     0
number_of_dependants            0
family_status                   0
income                          0
Unnamed: 0                      0
product_id                      0
product_name                    0
aisle_id                        0
department_id                   0
prices                          0
order_id                        0
eval_set                        0
order_number                    0
orders_day_of_the_week          0
order_hour_of_day               0
days_since_prior_order    1170531
add_to_cart_order               0
reordered                       0
_merge                          0
price_range_loc                 0
busiest day                     0
Result                          0
busiest_period_of_day           0
max_order     

In [46]:
df_all.shape

(18339472, 35)

In [47]:
df_all.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merge.pkl'))