# Exercise 4.9 PART 1: Intro to Data Visualization with Python

## Contents:

    0. Import Libraries

    1. Loading and Checking the Data

    2. Wrangling the Data

    3. Data Quality Checks

    4. Combining Customer Data with Previously Prepared Data

    5. Export the New Dataframe as a Pickle

## 0. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## 1. Loading and Checking the Data

In [2]:
# Define the path to the data files
path = '/Users/aaronkibler/CF Project 4 - Instacart Basket Analysis/02 Data'

In [3]:
# Load "customers.csv" from the "Original Data" folder as "df_customers"
df_customers = pd.read_csv(os.path.join(path, 'Original Data', 'customers.csv'), index_col = False)

In [4]:
# Checking "customers.csv" data is correctly loaded
print(df_customers.head())
print(df_customers.info())
df_customers.shape

   user_id First Name    Surnam  Gender       STATE  Age date_joined  \
0    26711    Deborah  Esquivel  Female    Missouri   48    1/1/2017   
1    33890   Patricia      Hart  Female  New Mexico   36    1/1/2017   
2    65803    Kenneth    Farley    Male       Idaho   35    1/1/2017   
3   125935   Michelle     Hicks  Female        Iowa   40    1/1/2017   
4   130797        Ann   Gilmore  Female    Maryland   26    1/1/2017   

   n_dependants fam_status  income  
0             3    married  165665  
1             0     single   59285  
2             2    married   99568  
3             0     single   42049  
4             1    married   40374  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        

(206209, 10)

## 2. Wrangling the Data

In [5]:
# Renaming columns for consistency and clarity
df_customers.rename(columns={'First Name': 'first_name', 'Surnam': 'surname', 'Gender': 'gender', 'STATE': 'state', 'Age': 'age', 'n_dependants': 'dependants', 'fam_status': 'family_status', 'income': 'income'}, inplace=True)

In [6]:
# Verify the names of the columns after making changes
print(df_customers.columns)

Index(['user_id', 'first_name', 'surname', 'gender', 'state', 'age',
       'date_joined', 'dependants', 'family_status', 'income'],
      dtype='object')


In [7]:
# Check data types and make adjustments as needed
print(df_customers.dtypes)

user_id           int64
first_name       object
surname          object
gender           object
state            object
age               int64
date_joined      object
dependants        int64
family_status    object
income            int64
dtype: object


In [8]:
# Convert "user_id" data type to "string"
df_customers['user_id'] = df_customers['user_id'].astype(str)

In [9]:
# Convert "date_joined" to data type datetime
df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'])

In [10]:
# Recheck data types after making changes
print(df_customers.dtypes)

user_id                  object
first_name               object
surname                  object
gender                   object
state                    object
age                       int64
date_joined      datetime64[ns]
dependants                int64
family_status            object
income                    int64
dtype: object


## 3. Data Quality Checks

In [11]:
# Check for duplicates
print("Duplicates:", df_customers.duplicated().sum())

Duplicates: 0


In [12]:
# Check for mixed data types in each column
for col in df_customers.columns.tolist():
    weird = (df_customers[[col]].map(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (df_customers[weird]) > 0:
        print (col)

first_name


In [13]:
# The "first_name" column should have a data type "string"
df_customers['first_name'] = df_customers['first_name'].astype('str')

In [14]:
# Recheck data types after making changes
print(df_customers.dtypes)

user_id                  object
first_name               object
surname                  object
gender                   object
state                    object
age                       int64
date_joined      datetime64[ns]
dependants                int64
family_status            object
income                    int64
dtype: object


In [15]:
# Recheck for mixed data types
for col in df_customers.columns.tolist():
    weird = (df_customers[[col]].map(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (df_customers[weird]) > 0:
        print (col)

In [16]:
# Check for missing values
print(df_customers.isnull().sum())

user_id          0
first_name       0
surname          0
gender           0
state            0
age              0
date_joined      0
dependants       0
family_status    0
income           0
dtype: int64


In [17]:
# Check the descriptive stats for anything unusal
df_customers.describe()

Unnamed: 0,age,date_joined,dependants,income
count,206209.0,206209,206209.0,206209.0
mean,49.501646,2018-08-17 03:06:30.029532928,1.499823,94632.852548
min,18.0,2017-01-01 00:00:00,0.0,25903.0
25%,33.0,2017-10-23 00:00:00,0.0,59874.0
50%,49.0,2018-08-16 00:00:00,1.0,93547.0
75%,66.0,2019-06-10 00:00:00,3.0,124244.0
max,81.0,2020-04-01 00:00:00,3.0,593901.0
std,18.480962,,1.118433,42473.786988


### All quality checks now appear okay

## 4. Combining Customer Data with Previously Prepared Data

In [18]:
# Load the most up-to-date version of the previously prepared data as "df_ords_prods_new"
df_ords_prods_new = pd.read_pickle(os.path.join(path, 'Prepared Data', 'ords_prods_merge_new_var_group_agg.pkl'))


In [19]:
# Checking "ords_prods_merge_new_var_group_agg.pkl" data is correctly loaded
print(df_ords_prods_new.head())
print(df_ords_prods_new.info())
df_ords_prods_new.shape

   product_id                product_name  aisle_id  department_id  prices  \
0           1  Chocolate Sandwich Cookies        61             19     5.8   
1           1  Chocolate Sandwich Cookies        61             19     5.8   
2           1  Chocolate Sandwich Cookies        61             19     5.8   
3           1  Chocolate Sandwich Cookies        61             19     5.8   
4           1  Chocolate Sandwich Cookies        61             19     5.8   

   order_id  user_id  order_number  orders_day_of_week  order_hour_of_day  \
0   3139998      138            28                   6                 11   
1   1977647      138            30                   6                 17   
2    389851      709             2                   0                 21   
3    652770      764             1                   3                 13   
4   1813452      764             3                   4                 17   

   ...    price_range_loc     busiest_day  busiest_days  \
0  ...  M

(32404859, 25)

### Both "df_customers" and "df_ords_prods_new" have a "user_id" column which we can use to merge the two datasets together.
### "user_id" in the "df_ords_prods_new" dataframe must first be converted to "string" to match the "user_id" column in "df_customers"
### At the same time, the other identifier columns in the dataframe can also be converted to "string"
### Identifier columns to convert to "string": "product_id", "aisle_id", "department_id", "order_id", and "user_id"

In [20]:
# Convert the identifier columns in "df_ords_prods_new" to "string"
df_ords_prods_new[['product_id', 'aisle_id', 'department_id', 'order_id', 'user_id']] = df_ords_prods_new[['product_id', 'aisle_id', 'department_id', 'order_id', 'user_id']].astype(str)

In [21]:
# Check the results of the change
print(df_ords_prods_new.dtypes)

product_id                  object
product_name                object
aisle_id                    object
department_id               object
prices                     float64
order_id                    object
user_id                     object
order_number                 int64
orders_day_of_week           int64
order_hour_of_day            int64
days_since_prior_order     float64
is_first_order               int64
add_to_cart_order            int64
reordered                    int64
_merge                    category
price_range_loc             object
busiest_day                 object
busiest_days                object
busiest_period_of_day       object
max_order                    int64
loyalty_flag                object
mean_product_price         float64
spending_flag               object
order_frequency            float64
frequency_flag              object
dtype: object


In [22]:
# Drop the existing "_merge" column from "df_ords_prods_new"
df_ords_prods_new = df_ords_prods_new.drop(columns=['_merge'])

In [23]:
# Check the columns after the change
print(df_ords_prods_new.columns)

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'is_first_order',
       'add_to_cart_order', 'reordered', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'mean_product_price', 'spending_flag', 'order_frequency',
       'frequency_flag'],
      dtype='object')


In [24]:
# Merge the two dataframes using the default inner join
df_final_merged = df_ords_prods_new.merge(df_customers, on='user_id', indicator=True)

In [25]:
# Check "value_counts" after inner join
df_final_merged['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [26]:
# Check the details of the new "df_final_merged" dataframe
print(df_final_merged.head())
print(df_final_merged.info())
df_final_merged.shape

  product_id                product_name aisle_id department_id  prices  \
0          1  Chocolate Sandwich Cookies       61            19     5.8   
1          1  Chocolate Sandwich Cookies       61            19     5.8   
2        907        Premium Sliced Bacon      106            12    20.0   
3        907        Premium Sliced Bacon      106            12    20.0   
4       1000                    Apricots       18            10    12.9   

  order_id user_id  order_number  orders_day_of_week  order_hour_of_day  ...  \
0  3139998     138            28                   6                 11  ...   
1  1977647     138            30                   6                 17  ...   
2  3160996     138             1                   5                 13  ...   
3  2254091     138            10                   5                 14  ...   
4   505689     138             9                   6                 12  ...   

   first_name  surname  gender      state age date_joined dependants

(32404859, 34)

## 5. Export the New Dataframe as a Pickle

In [27]:
# Export the "df_final_merged" dataframe as "ords_prods_cust_merge" for use in PART 2
df_final_merged.to_pickle(os.path.join(path, 'Prepared Data', 'ords_prods_cust_merge.pkl'))

# End of Exercise 4.9 PART 1