# Exercise 4.9 
## Part 1 Working with Customer Data

In [3]:
## Importing required packages

import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [4]:
## Importing the customer data

# Project root folder. It can be overridden with the INSTACART_PROJECT_DIR
# environment variable so the notebook also runs on other machines; the
# original local folder is kept as the default.
path = os.environ.get('INSTACART_PROJECT_DIR',
                      r'/Users/balachandark/Desktop/Instacart Basket Analysis')

# Raw customer records (CSV) and the previously prepared orders/products
# frame (pickle).
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
customer = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_4.8.pkl'))

In [5]:
## Checking the column names as loaded from the CSV, before any renaming

customer.columns

Index(['user_id', 'First Name', 'Surnam', 'Gender', 'STATE', 'Age',
       'date_joined', 'n_dependants', 'fam_status', 'income'],
      dtype='object')

In [6]:
## Previewing the first 10 rows of the customer data

customer.head(10)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Cynthia,Noble,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Chris,Walton,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Joseph,Hickman,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Jeremy,Vang,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Shawn,Chung,Male,Virginia,26,1/1/2017,2,married,32072


In [7]:
## Renaming the columns to consistent lower-case snake_case names

# Original label -> new label; columns not listed here keep their names.
column_renames = {
    'First Name': 'first_name',
    'Surnam': 'surname',
    'STATE': 'state',
    'Age': 'age',
    'Gender': 'gender',
    'n_dependants': 'no_of_dependants',
    'fam_status': 'family_status',
}
customer = customer.rename(columns=column_renames)

#### **Column checks (naming and dropping)**

In [9]:
## Checking the column names after the renaming step

customer.columns

Index(['user_id', 'first_name', 'surname', 'gender', 'state', 'age',
       'date_joined', 'no_of_dependants', 'family_status', 'income'],
      dtype='object')

In [10]:
## Checking the data type pandas assigned to each column

customer.dtypes

user_id              int64
first_name          object
surname             object
gender              object
state               object
age                  int64
date_joined         object
no_of_dependants     int64
family_status       object
income               int64
dtype: object

##### Decision on dropping the columns: All the columns are important for data analysis. Hence, it is better to retain all of them. 

#### **Data Quality Checks**
##### **Missing values assessment**

In [13]:
## Counting the missing (null) values in each column

customer.isnull().sum()

user_id                 0
first_name          11259
surname                 0
gender                  0
state                   0
age                     0
date_joined             0
no_of_dependants        0
family_status           0
income                  0
dtype: int64

##### There are 11,259 customers whose first name is missing. This should not pose a problem: the missing field is a text (object) column rather than a numeric value used in calculations, so no imputation is needed.

##### **Duplicate values assessment**

In [16]:
## Checking for fully duplicated rows

# duplicated() marks each row that repeats an earlier row across all columns.
is_repeat_row = customer.duplicated()
customer_dups = customer.loc[is_repeat_row]
customer_dups

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,no_of_dependants,family_status,income


##### There are no duplicate rows in the data frame.

##### **Mixed data type values assessment**

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
customer.describe()

Unnamed: 0,user_id,age,no_of_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [20]:
# Print the name of every column that holds values of more than one Python
# type, using the type of the column's first value as the reference.
for col in customer.columns:
    value_types = customer[col].map(type)
    first_type = value_types.iloc[0]
    if (value_types != first_type).any():
        print(col)

first_name


##### The first_name column has mixed data types, most likely because of its missing values. We will convert the whole column to string so that its type is consistent.

In [22]:
# Missing first names are NaN (float) inside an otherwise string column.
# A bare astype('str') turns them into the literal text 'nan', which looks
# like a real name and hides them from later null checks. Fill them with an
# explicit placeholder first, then cast so every value is a str.
customer['first_name'] = customer['first_name'].fillna('Unknown').astype('str')

In [23]:
# Re-checking the column dtypes after converting first_name to string
customer.dtypes

user_id              int64
first_name          object
surname             object
gender              object
state               object
age                  int64
date_joined         object
no_of_dependants     int64
family_status       object
income               int64
dtype: object

In [24]:
### Reassessing the data frame
# Print any column whose values are not all of the same Python type as the
# column's first value; no printed output means no mixed-type columns remain.
for col in customer.columns:
    types_in_col = customer[col].map(type)
    mismatched = types_in_col.ne(types_in_col.iloc[0])
    if mismatched.any():
        print(col)

##### Now there are no mixed data types in the data frame.

#### **Merging data frames**

In [27]:
## Listing the columns of the orders and products data frame before the customer data is merged into it

ords_prods_merge.columns

Index(['order_id', 'user_id', 'evaluation_set', 'order_number',
       'orders_day_of_week', 'order_hour_of_day', 'days_since_prior_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'busiest_day',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'spending_behavior', 'spending_flag', 'order_frequent',
       'order_freq_flag'],
      dtype='object')

In [28]:
# Listing the customer columns to identify the key shared with ords_prods_merge
customer.columns

Index(['user_id', 'first_name', 'surname', 'gender', 'state', 'age',
       'date_joined', 'no_of_dependants', 'family_status', 'income'],
      dtype='object')

In [29]:
## Merging the customer columns onto the orders/products rows based on user_id

# how='inner' is the pandas default, written out so the join type is explicit.
# validate='many_to_one' makes pandas raise a MergeError if user_id is not
# unique in `customer`, which would otherwise silently duplicate order rows.
ords_prods_customer = ords_prods_merge.merge(
    customer, on='user_id', how='inner', validate='many_to_one'
)

In [30]:
# (rows, columns) of the orders/products frame, for comparison with the merged frame
ords_prods_merge.shape

(32404859, 22)

In [31]:
# (rows, columns) of the merged frame
ords_prods_customer.shape

(32404859, 31)

In [32]:
# Listing the merged frame's columns to confirm the customer columns were added
ords_prods_customer.columns

Index(['order_id', 'user_id', 'evaluation_set', 'order_number',
       'orders_day_of_week', 'order_hour_of_day', 'days_since_prior_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'busiest_day',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'spending_behavior', 'spending_flag', 'order_frequent',
       'order_freq_flag', 'first_name', 'surname', 'gender', 'state', 'age',
       'date_joined', 'no_of_dependants', 'family_status', 'income'],
      dtype='object')

##### After merging the data frames, the columns are checked to confirm that all the columns from both data frames are intact. Looking at the above output, I confirm that all the required columns are present in the merged data frame.

#### **Exporting data as pickle**

In [35]:
## Export the final data frame

# Destination file inside the project's '02 Data/Prepared Data' folder.
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_customer.pkl')
ords_prods_customer.to_pickle(export_file)

<center><b>End of part 1 report</b></center>