## 4.9.1 Part 1 Visualization Task

### This script contains the following points:

#### 1. Import Customer Data & Check Dimensions
#### 2. Wrangle Customer Data
#### 3. Consistency Check Customer Data
#### 4. Combine Customer Data with current project data (ords_prods_merge_agg_cln.pkl)
#### 5. Export file

### 01. Import data & Check dimensions

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Create string path for main project folder.
# The folder can be overridden with the INSTACART_PROJECT_PATH environment variable so the
# notebook can run on another machine; the original local folder remains the default.

path = os.environ.get('INSTACART_PROJECT_PATH', r'/Users/AngieUS/Desktop/Instacart Project')

# Import Customer Data

df_cust = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index_col = False)

# Check Customer Data

df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [3]:
# Show the size of df_cust as a (rows, columns) tuple

(len(df_cust.index), len(df_cust.columns))

(206209, 10)

In [4]:
# Load the main project data (ords_prods_merge_agg_cln.pkl) from the Prepared Data folder

main_data_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_agg_cln.pkl')
df_main = pd.read_pickle(main_data_file)

# Preview the first rows of the main project data

df_main.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,_merge,price_range,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_prices,spending_flag,median_days,order_freq_flag
0,2398795,1,2,3,7,15.0,196,1,1,Soda,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,...,both,Low-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer
2,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,...,both,Low-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer
3,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,...,both,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer
4,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,...,both,Low-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer


In [5]:
# Show the size of df_main as a (rows, columns) tuple

(len(df_main.index), len(df_main.columns))

(30323987, 23)

#### Display exploratory info for Customer Data

In [7]:
# Summarize df_cust: column names, non-null counts and dtypes

df_cust.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [11]:
# Descriptive statistics for the numeric customer columns, rounded to 2 decimals

numeric_cols = ['Age', 'n_dependants', 'income']
df_cust[numeric_cols].describe().round(2)

Unnamed: 0,Age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.5,1.5,94632.85
std,18.48,1.12,42473.79
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


### 02. Wrangle Customer Data

In [12]:
# Count how often each 'date_joined' value occurs (missing values included)

date_joined_counts = df_cust['date_joined'].value_counts(dropna = False)
date_joined_counts

date_joined
9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: count, Length: 1187, dtype: int64

#### Checked the 'date_joined' column for a variety of values. If it contained only one value, it would not have been viable for any analysis and could have been dropped. However, it does have potential value, so it will remain in the dataframe. There are no columns that need to be removed.

In [13]:
# Rename columns for logic and format consistency (lower-case snake_case, corrected spelling).
# All six renames are applied in a single call instead of six copy-pasted cells.
# Reassigning the result (rather than using inplace = True) keeps the step easy to re-run:
# labels that are no longer present are skipped by rename's default errors = 'ignore'.

column_renames = {
    'First Name' : 'first_name',
    'Surnam' : 'last_name',
    'Gender' : 'gender',
    'STATE' : 'state',
    'Age' : 'age',
    'n_dependants' : 'dependents',
}

df_cust = df_cust.rename(columns = column_renames)

In [21]:
# Display columns

df_cust.columns

Index(['user_id', 'first_name', 'last_name', 'gender', 'state', 'age',
       'date_joined', 'dependents', 'fam_status', 'income'],
      dtype='object')

In [23]:
# user_id is an identifier (a merge key), not a quantity, so store it as a string;
# it must match the key in the main project data

user_id_as_text = df_cust['user_id'].astype(str)
df_cust['user_id'] = user_id_as_text

# Confirm the resulting data type of df_cust 'user_id'

df_cust['user_id'].dtype

dtype('O')

In [24]:
# Show the size of df_cust (rows, columns) after wrangling

n_rows, n_cols = df_cust.shape
(n_rows, n_cols)

(206209, 10)

### 03. Consistency Check Customer Data

In [25]:
# Report every df_cust column holding values whose Python type differs from
# the type of that column's first value (i.e. mixed-type columns)

for column_name in df_cust.columns.tolist():
    single_column = df_cust[[column_name]]
    first_value_type = single_column.iloc[0].apply(type)
    type_mismatch = (single_column.map(type) != first_value_type).any(axis = 1)
    if len(df_cust[type_mismatch]) > 0:
        print(column_name)

first_name


In [26]:
# Count the Python types present in the first_name column

first_name_types = df_cust['first_name'].map(type)
first_name_types.value_counts()

first_name
<class 'str'>      194950
<class 'float'>     11259
Name: count, dtype: int64

#### 11259 matches the number of null values in the original .info() display (206209 rows - 194950 non-null first names) - these are records missing first names

In [27]:
# Count missing values per column in df_cust

df_cust.isna().sum()

user_id            0
first_name     11259
last_name          0
gender             0
state              0
age                0
date_joined        0
dependents         0
fam_status         0
income             0
dtype: int64

In [31]:
# Build df_cust_clean by keeping only the records that have a first_name

has_first_name = df_cust['first_name'].notna()
df_cust_clean = df_cust[has_first_name]

# Show the size of df_cust_clean as (rows, columns)

df_cust_clean.shape

(194950, 10)

In [33]:
# Descriptive statistics for the numeric columns of df_cust_clean, rounded to 2 decimals

summary_cols = ['age', 'dependents', 'income']
df_cust_clean[summary_cols].describe().round(2)

Unnamed: 0,age,dependents,income
count,194950.0,194950.0,194950.0
mean,49.51,1.5,94664.81
std,18.48,1.12,42477.26
min,18.0,0.0,25903.0
25%,33.0,0.0,59924.0
50%,49.0,1.0,93572.5
75%,66.0,2.0,124261.0
max,81.0,3.0,593901.0


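#### Removing the records with missing values in first_name did not materially change the descriptive results for the numeric variables: the means and standard deviations are nearly identical, and the only visible shift is the 75th percentile of dependents, which moved from 3 to 2. In addition, the 11259 removed records made up only about 5.5% of the customer records.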

In [34]:
# Collect fully duplicated rows of df_cust_clean (an empty result means no duplicates)
duplicate_rows = df_cust_clean.duplicated()
df_dups = df_cust_clean[duplicate_rows]

df_dups

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,dependents,fam_status,income


#### No duplicates found.

### 04. Combine Customer Data with current project data

In [36]:
# Remove previous indicator column (_merge) from df_main so the next merge can create a fresh one.
# errors = 'ignore' keeps this cell re-runnable: once '_merge' is gone, a second run is a no-op
# instead of raising KeyError.

df_main = df_main.drop(columns = ['_merge'], errors = 'ignore')

In [37]:
# Show every column when displaying wide dataframes (no truncation)

pd.options.display.max_columns = None

In [38]:
# Create new df that combines df_main with df_cust_clean, using 'user_id' as the key.
# how = 'inner' is the pandas default, spelled out here on purpose: only user_ids present in BOTH
# dataframes are kept, so the '_merge' indicator can only ever read 'both' for this join.
# validate = 'many_to_one' makes pandas raise a MergeError if a user_id occurs more than once in
# df_cust_clean, which would otherwise silently duplicate rows from df_main.

df_all = df_main.merge(df_cust_clean, on = 'user_id', how = 'inner', validate = 'many_to_one', indicator = True)

# Check df_all
df_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_prices,spending_flag,median_days,order_freq_flag,first_name,last_name,gender,state,age,date_joined,dependents,fam_status,income,_merge
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,117,19,3.0,Low-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,23,19,4.4,Low-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,24,4,10.3,Mid-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Least busy,Average orders,10,New customer,6.372222,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [40]:
# Show the size of df_all as a (rows, columns) tuple

(len(df_all.index), len(df_all.columns))

(28662590, 32)

In [41]:
# Count how many df_all rows carry each merge flag value

merge_flags = df_all['_merge']
merge_flags.value_counts()

_merge
both          28662590
left_only            0
right_only           0
Name: count, dtype: int64

### 05. Export file

In [42]:
# Save the combined dataframe as orders_products_all.pkl in the Prepared Data folder

export_file = os.path.join(path, '02 Data','Prepared Data', 'orders_products_all.pkl')
df_all.to_pickle(export_file)