# IC 4.9 Part 1_Data Wrangling, Consistency on customer data, Merge final data sets

# Table of Contents
##### 1. Import libraries
##### 2. Import data
##### 3. Data Wrangling on customer data
##### 4. Data quality and consistency checks on customer data
    4.1 Check for mixed-type data
    4.2 Histogram
    4.2 Check for missing values
    4.3 Check for duplicates
##### 5. Combine customer data with the ords_products_merged data
    5.1 Before merging, ensure each dataframe is prepared
    5.2 Merge ords_products_merged data with customer data
##### 6. Export data

## 1. Import Libraries

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os

## 2. Import Data

In [4]:
# Load the raw customer file and the prepared orders/products dataframe
# from the project folder

path = r'C:\Users\ashle\Documents\07-2024 Instacart Basket Analysis'

cu = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv'),
    index_col=False,
)
ords_prods_merge = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merged_with_flags_checked.pkl')
)

## 3. Data Wrangling on customer data

In [6]:
# Observe dataframe: preview the first rows of the raw customer data to see
# which columns are available and how their values are formatted
cu.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [7]:
# Check shape of cu as (rows, columns) to record the size of the raw data
cu.shape

(206209, 10)

The 'First Name' and 'Surnam' columns are not relevant to the analysis, so they are dropped. All other columns are kept.

In [12]:
# Remove the two name columns ('First Name', 'Surnam'); the result is stored
# in a new dataframe so the raw cu dataframe stays untouched
cu_wrangled = cu.drop(['First Name', 'Surnam'], axis='columns')

In [18]:
# Confirm unnecessary columns were dropped: 'First Name' and 'Surnam' should
# no longer appear in the preview
cu_wrangled.head()

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [20]:
# Determine dimensions of the dataframe as (rows, columns); after the drop the
# row count should match cu and the column count should be two lower
cu_wrangled.shape

(206209, 8)

In [22]:
# View data types for each variable to spot columns that need a type conversion
cu_wrangled.dtypes

user_id          int64
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [24]:
# Update date_joined datatype to pandas datetime64[ns] data type.
# The format is stated explicitly so the text is never parsed by guesswork.
# This assumes every raw value is month/day/year like the '1/1/2017' values in
# the preview; a value in any other layout raises an error instead of being
# silently interpreted differently.
cu_wrangled['date_joined'] = pd.to_datetime(cu_wrangled['date_joined'], format='%m/%d/%Y')

In [26]:
# Confirm data type change was successful: date_joined should now be
# datetime64[ns] instead of object
cu_wrangled.dtypes

user_id                  int64
Gender                  object
STATE                   object
Age                      int64
date_joined     datetime64[ns]
n_dependants             int64
fam_status              object
income                   int64
dtype: object

In [28]:
# Give 'n_dependants' the more descriptive name 'number_of_dependents'
# (applied in place on cu_wrangled)
cu_wrangled.rename({'n_dependants': 'number_of_dependents'}, axis='columns', inplace=True)

In [30]:
# Give 'fam_status' the more descriptive name 'family_status'
# (applied in place on cu_wrangled)
cu_wrangled.rename({'fam_status': 'family_status'}, axis='columns', inplace=True)

In [32]:
# Confirm column names were successfully updated: the preview should show
# 'number_of_dependents' and 'family_status' in place of the old names
cu_wrangled.head()

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,number_of_dependents,family_status,income
0,26711,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Female,Maryland,26,2017-01-01,1,married,40374


In [6]:
# Export wrangled customer data

In [34]:
# Write the wrangled customer data to the 'Prepared Data' folder.
# No index argument is passed, so to_csv's default of writing the row index applies.
cu_wrangled.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'customers_wrangled.csv')
)

## 4. Data quality and consistency checks on customer data

### 4.1 Check for mixed-type data

In [36]:
# Check for mixed types on cu_wrangled: a column is reported by name when any
# of its values has a different type than the first value in that column;
# otherwise the "no mixed-type data" message is printed for that column

for column_name in cu_wrangled.columns:
    value_types = cu_wrangled[column_name].map(type)
    if (value_types != value_types.iloc[0]).any():
        print (column_name)
    else:
        print('There is no mixed-type data')

There is no mixed-type data
There is no mixed-type data
There is no mixed-type data
There is no mixed-type data
There is no mixed-type data
There is no mixed-type data
There is no mixed-type data
There is no mixed-type data


There is no mixed-type data in the cu_wrangled dataframe. 

###  4.2 Check for missing values

In [38]:
# Count the missing values in each column of cu_wrangled

cu_wrangled.isna().sum()

user_id                 0
Gender                  0
STATE                   0
Age                     0
date_joined             0
number_of_dependents    0
family_status           0
income                  0
dtype: int64

There are no missing values in the cu_wrangled dataframe. 

### 4.3 Check for duplicates

In [41]:
# Collect fully duplicated rows (every column equal to an earlier row) in the
# subset cu_wrangled_dups; the first occurrence of each row is not flagged
cu_wrangled_dups = cu_wrangled.loc[cu_wrangled.duplicated(keep='first')]

In [43]:
# Display the duplicates subset. If the subset returns empty, that means there
# are no fully duplicated rows in cu_wrangled
cu_wrangled_dups

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,number_of_dependents,family_status,income


There are no duplicates in the cu_wrangled dataframe. 

In [45]:
# Confirm dimensions of cu_wrangled dataframe. The consistency checks removed
# no rows or columns, so we expect the same shape as before the checks.
cu_wrangled.shape

(206209, 8)

Data wrangling and cleaning is complete. 

In [13]:
# Export cu_wrangled dataframe with consistency checks complete

In [47]:
# Write the checked customer data to the 'Prepared Data' folder.
# No index argument is passed, so to_csv's default of writing the row index applies.
cu_wrangled.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'customers_checked.csv')
)

## 5. Combine customer data with the ords_products_merged data

### 5.1 Before merging, ensure each dataframe is prepared

In [49]:
# Review the columns and data types of ords_prods_merge before merging
ords_prods_merge.dtypes

order_id                    int64
user_id                     int64
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
first_order                  bool
product_id                  int64
add_to_cart_order           int64
reordered                   int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
price_label                object
busiest_day                object
busiest_days               object
busiest_period_of_day      object
max_order                   int64
loyalty_flag               object
avg_purchase_price        float64
spending_flag              object
median_days               float64
order_frequency_flag       object
dtype: object

In [51]:
# Review the columns and data types of cu_wrangled; the key 'user_id' should
# have the same data type here as in ords_prods_merge
cu_wrangled.dtypes

user_id                          int64
Gender                          object
STATE                           object
Age                              int64
date_joined             datetime64[ns]
number_of_dependents             int64
family_status                   object
income                           int64
dtype: object

Common key: 'user_id'

In [53]:
# Summary statistics for cu_wrangled, used as a sanity check on value ranges
# (e.g. Age and income) before merging
cu_wrangled.describe()

Unnamed: 0,user_id,Age,date_joined,number_of_dependents,income
count,206209.0,206209.0,206209,206209.0,206209.0
mean,103105.0,49.501646,2018-08-17 03:06:30.029532928,1.499823,94632.852548
min,1.0,18.0,2017-01-01 00:00:00,0.0,25903.0
25%,51553.0,33.0,2017-10-23 00:00:00,0.0,59874.0
50%,103105.0,49.0,2018-08-16 00:00:00,1.0,93547.0
75%,154657.0,66.0,2019-06-10 00:00:00,3.0,124244.0
max,206209.0,81.0,2020-04-01 00:00:00,3.0,593901.0
std,59527.555167,18.480962,,1.118433,42473.786988


Note: the max income of $593,901 is very high compared with the 75th percentile of $124,244. Treat it as a potential outlier and monitor it in later analysis.

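Before merging, rename the column 'order_dow' to 'order_day_of_week'. The ords_prods_merge dataframe no longer contains a merge flag column from the previous merge (see the data types listed above), so nothing needs to be dropped before a new merge flag is created for this merge.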

In [55]:
# Give 'order_dow' the more descriptive name 'order_day_of_week'
# (applied in place on ords_prods_merge)
ords_prods_merge.rename({'order_dow': 'order_day_of_week'}, axis='columns', inplace=True)

In [57]:
# Confirm column name change was successful: 'order_day_of_week' should appear
# in the column index in place of 'order_dow'
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'price_label', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_purchase_price', 'spending_flag', 'median_days',
       'order_frequency_flag'],
      dtype='object')

### 5.2 Merge ords_products_merged data with customer data

In [67]:
# Merge ords_prods_merge with cu_wrangled using an inner join on the common key 'user_id' and add a merge flag.
# validate='many_to_one' makes pandas raise a MergeError if a 'user_id' appears more than once in
# cu_wrangled; without it, repeated customer keys would silently duplicate order rows.
# Note: an inner join keeps only keys found in both dataframes, so the '_merge' flag can only be 'both'.

orders_products_all = ords_prods_merge.merge(
    cu_wrangled,
    on='user_id',
    how='inner',
    validate='many_to_one',
    indicator=True,
)

In [68]:
# Preview the merged dataframe: the customer columns and the '_merge' flag
# should appear after the order/product columns
orders_products_all.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,median_days,order_frequency_flag,Gender,STATE,Age,date_joined,number_of_dependents,family_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2539329,1,1,2,8,,True,14084,2,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,2539329,1,1,2,8,,True,12427,3,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2539329,1,1,2,8,,True,26088,4,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,2539329,1,1,2,8,,True,26405,5,0,...,20.5,Non-frequent customer,Female,Alabama,31,2019-02-17,3,married,40423,both


In [69]:
# Check frequency of merge flag
# NOTE(review): the merge is called without a `how` argument, so pandas uses its default inner join.
# An inner join keeps only matched keys, so 'both' is the only value that can appear here;
# an outer join would be needed to surface unmatched rows.
orders_products_all['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

Data successfully merged.

In [70]:
# Check dimensions of the merged dataframe as (rows, columns)
orders_products_all.shape

(32404859, 32)

## 6. Export data

In [75]:
# Save the fully merged dataframe as a pickle in the 'Prepared Data' folder
orders_products_all.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl')
)