# Table of Contents

## 1. Importing Libraries
## 2. Importing Data
## 3. Wrangling Data
### 3.1 Renaming Columns
### 3.2 Deleting Columns
## 4. Consistency Checks
### 4.1 Missing Values
### 4.2 Duplicates
### 4.3 Mixed-type Data
## 5. Combining Data
## 6. Exporting Data


# 1. Importing Libraries


In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy


# 2. Importing Data


In [4]:
# Load the raw customer records from customers.csv.
# index_col=False tells pandas not to use the first CSV column as the index.
customers_csv = r'/Users/xxx/Documents/Instacart Basket Analysis - 2025-01-05/02 - Data/Original Data/customers.csv'
df_cust = pd.read_csv(customers_csv, index_col=False)

In [5]:
# Display the customer data frame as loaded (bare last expression -> rich cell output)
df_cust

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...,...,...
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799


In [6]:
# Count the missing values in each column of the freshly loaded data
df_cust.isnull().sum()

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64


# 3. Wrangling Data



## 3.1 Renaming Columns


In [9]:
# Map the original column labels to consistent snake_case names.
# Columns not listed here (user_id, date_joined, income) keep their labels.
column_renames = {
    "First Name": "first_name",
    "Surnam": "surname",
    "Gender": "gender",
    "STATE": "state",
    "Age": "age",
    "n_dependants": "num_dependants",
    "fam_status": "family_status",
}
df_cust = df_cust.rename(columns=column_renames)

# Print the resulting column labels to confirm the rename
print(df_cust.columns)

Index(['user_id', 'first_name', 'surname', 'gender', 'state', 'age',
       'date_joined', 'num_dependants', 'family_status', 'income'],
      dtype='object')



## 3.2 Deleting Columns that don't add value to analysis


In [11]:
# Individual first names and surnames are unlikely to contribute to analytical insights, 
# as they are not relevant to patterns, trends, or relationships in the data. 
# They are generally useful only for personalization purposes.

In [12]:
# List the current column labels before removing the name columns
print(df_cust.columns)

Index(['user_id', 'first_name', 'surname', 'gender', 'state', 'age',
       'date_joined', 'num_dependants', 'family_status', 'income'],
      dtype='object')


In [13]:
# Delete the 'first_name' and 'surname' columns from the data frame.
# The result is reassigned instead of dropped in place, and errors='ignore'
# lets this cell be re-run after the columns are already gone without
# raising a KeyError.
df_cust = df_cust.drop(columns=['first_name', 'surname'], errors='ignore')

# Display the data frame
df_cust

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,family_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...
206204,168073,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Female,California,27,4/1/2020,1,married,99799



# 4. Consistency Checks


In [15]:
# Simple check: summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_cust.describe()

Unnamed: 0,user_id,age,num_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0



## 4.1 Missing Values


In [17]:
# Count the missing values per column now that the name columns have been removed
df_cust.isnull().sum()

user_id           0
gender            0
state             0
age               0
date_joined       0
num_dependants    0
family_status     0
income            0
dtype: int64

In [18]:
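# No missing values remain: the 11,259 missing first names found earlier left the data when the 'first_name' column was dropped in section 3.2.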


## 4.2 Duplicates


In [20]:
# Flag every row that exactly repeats an earlier row, then keep only the flagged rows
is_full_duplicate = df_cust.duplicated()
df_dups = df_cust[is_full_duplicate]

In [21]:
# Show the fully duplicated rows stored in df_dups (an empty frame means none were found)
df_dups

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,family_status,income


In [22]:
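# No fully duplicated rows. Note that this check compares entire rows; it does not test whether 'user_id' on its own is unique.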


## 4.3 Mixed-type Data


In [24]:
# Inspect the data type pandas assigned to each column
df_cust.dtypes

user_id            int64
gender            object
state             object
age                int64
date_joined       object
num_dependants     int64
family_status     object
income             int64
dtype: object

In [25]:
# Check for mixed data types in df_cust:
# a column is reported when any of its values has a Python type that differs
# from the type of the value in the column's first row.
for column_name in df_cust.columns:
    column_frame = df_cust[[column_name]]
    first_value_type = column_frame.iloc[0].apply(type)
    has_other_type = (column_frame.map(type) != first_value_type).any(axis=1)
    if has_other_type.any():
        print(column_name)

In [26]:
# No rows with mismatched types but "gender", "state", and "family_status" should be categorical, while "date_joined" should be datetime

In [27]:
# Convert gender, state, and family_status to 'category'
for category_col in ('gender', 'state', 'family_status'):
    df_cust[category_col] = df_cust[category_col].astype('category')

# Convert date_joined to 'datetime'.
# The raw strings look like '1/1/2017' and '4/1/2020', which are ambiguous between
# month/day and day/month. The explicit format pins the month-first reading that
# pandas applies by default, so parsing no longer depends on format inference and
# a value that does not fit month/day/year raises instead of being guessed.
# NOTE(review): confirm with the data source that the file really is month-first.
df_cust['date_joined'] = pd.to_datetime(df_cust['date_joined'], format='%m/%d/%Y')

# Verify changes
print(df_cust.dtypes)

user_id                    int64
gender                  category
state                   category
age                        int64
date_joined       datetime64[ns]
num_dependants             int64
family_status           category
income                     int64
dtype: object


In [28]:
# Display the data frame with the converted column types
df_cust

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,family_status,income
0,26711,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Female,Maryland,26,2017-01-01,1,married,40374
...,...,...,...,...,...,...,...,...
206204,168073,Female,North Carolina,44,2020-04-01,1,married,148828
206205,49635,Male,Hawaii,62,2020-04-01,3,married,168639
206206,135902,Female,Missouri,66,2020-04-01,2,married,53374
206207,81095,Female,California,27,2020-04-01,1,married,99799



# 5. Combining Data


In [30]:
# Load the prepared data frame stored in ords_prods_deps_48.pkl
ords_prods_deps_pkl = r'/Users/xxx/Documents/Instacart Basket Analysis - 2025-01-05/02 - Data/Prepared Data/ords_prods_deps_48.pkl'
ords_prods_deps = pd.read_pickle(ords_prods_deps_pkl)

In [31]:
# Inspect the column data types of ords_prods_deps; 'user_id' is the key shared with df_cust
ords_prods_deps.dtypes

order_id                    int64
user_id                     int64
order_number                int64
order_weekday               int64
order_hour                  int64
days_since_last_order       int64
product_id                  int64
add_to_cart_order           int64
reordered                   int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
department                 object
final_merge_indicator    category
price_range              category
busiest_day                object
busiest_days               object
slowest_days               object
order_time_of_day          object
busiest_period_of_day      object
max_order                   int64
loyalty_flag               object
spending_flag              object
order_frequency_flag       object
dtype: object

In [32]:
# Merge df_cust and ords_prods_deps on the 'user_id' column.
# how='inner' keeps only the user_ids that appear in both frames.
# validate='one_to_many' makes pandas raise a MergeError if a 'user_id' is repeated
# in df_cust: the duplicate check in section 4.2 only looked at fully identical rows,
# and a repeated customer id would silently multiply the order rows here.
ords_prods_deps_cust = pd.merge(
    df_cust,
    ords_prods_deps,
    on='user_id',
    how='inner',
    validate='one_to_many',
)

# Display the data frame
ords_prods_deps_cust

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,family_status,income,order_id,order_number,...,price_range,busiest_day,busiest_days,slowest_days,order_time_of_day,busiest_period_of_day,max_order,loyalty_flag,spending_flag,order_frequency_flag
0,26711,Female,Missouri,48,2017-01-01,3,married,165665,518967,1,...,Low-range product,Regular day,Regular day,Regular day,Average orders,Average orders,8,New customer,Low spender,Regular customer
1,26711,Female,Missouri,48,2017-01-01,3,married,165665,423547,2,...,Mid-range product,Regular day,Regular day,Regular day,Average orders,Average orders,8,New customer,Low spender,Regular customer
2,26711,Female,Missouri,48,2017-01-01,3,married,165665,2524893,3,...,Mid-range product,Regular day,Regular day,Slowest days,Most orders,Most orders,8,New customer,Low spender,Regular customer
3,26711,Female,Missouri,48,2017-01-01,3,married,165665,2524893,3,...,Low-range product,Regular day,Regular day,Slowest days,Most orders,Most orders,8,New customer,Low spender,Regular customer
4,26711,Female,Missouri,48,2017-01-01,3,married,165665,2524893,3,...,Mid-range product,Regular day,Regular day,Slowest days,Most orders,Most orders,8,New customer,Low spender,Regular customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32434207,80148,Female,New York,55,2020-04-01,1,married,57095,2859858,3,...,Mid-range product,Regular day,Regular day,Regular day,Most orders,Most orders,4,New customer,Low spender,Regular customer
32434208,80148,Female,New York,55,2020-04-01,1,married,57095,2859858,3,...,Mid-range product,Regular day,Regular day,Regular day,Most orders,Most orders,4,New customer,Low spender,Regular customer
32434209,80148,Female,New York,55,2020-04-01,1,married,57095,2859858,3,...,Low-range product,Regular day,Regular day,Regular day,Most orders,Most orders,4,New customer,Low spender,Regular customer
32434210,80148,Female,New York,55,2020-04-01,1,married,57095,3209855,4,...,Low-range product,Regular day,Regular day,Regular day,Average orders,Average orders,4,New customer,Low spender,Regular customer


In [62]:
# Drop the 'order_time_of_day' column from the merged frame.
# NOTE(review): in the rows displayed after the merge its values match
# 'busiest_period_of_day', so it looks like a redundant copy — confirm.
# The result is reassigned (no inplace=True) and errors='ignore' is set so that
# re-running this cell after the column is gone does not raise a KeyError.
ords_prods_deps_cust = ords_prods_deps_cust.drop(columns=['order_time_of_day'], errors='ignore')

In [65]:
# Display the merged data frame after 'order_time_of_day' has been removed
ords_prods_deps_cust

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,family_status,income,order_id,order_number,...,final_merge_indicator,price_range,busiest_day,busiest_days,slowest_days,busiest_period_of_day,max_order,loyalty_flag,spending_flag,order_frequency_flag
0,26711,Female,Missouri,48,2017-01-01,3,married,165665,518967,1,...,both,Low-range product,Regular day,Regular day,Regular day,Average orders,8,New customer,Low spender,Regular customer
1,26711,Female,Missouri,48,2017-01-01,3,married,165665,423547,2,...,both,Mid-range product,Regular day,Regular day,Regular day,Average orders,8,New customer,Low spender,Regular customer
2,26711,Female,Missouri,48,2017-01-01,3,married,165665,2524893,3,...,both,Mid-range product,Regular day,Regular day,Slowest days,Most orders,8,New customer,Low spender,Regular customer
3,26711,Female,Missouri,48,2017-01-01,3,married,165665,2524893,3,...,both,Low-range product,Regular day,Regular day,Slowest days,Most orders,8,New customer,Low spender,Regular customer
4,26711,Female,Missouri,48,2017-01-01,3,married,165665,2524893,3,...,both,Mid-range product,Regular day,Regular day,Slowest days,Most orders,8,New customer,Low spender,Regular customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32434207,80148,Female,New York,55,2020-04-01,1,married,57095,2859858,3,...,both,Mid-range product,Regular day,Regular day,Regular day,Most orders,4,New customer,Low spender,Regular customer
32434208,80148,Female,New York,55,2020-04-01,1,married,57095,2859858,3,...,both,Mid-range product,Regular day,Regular day,Regular day,Most orders,4,New customer,Low spender,Regular customer
32434209,80148,Female,New York,55,2020-04-01,1,married,57095,2859858,3,...,both,Low-range product,Regular day,Regular day,Regular day,Most orders,4,New customer,Low spender,Regular customer
32434210,80148,Female,New York,55,2020-04-01,1,married,57095,3209855,4,...,both,Low-range product,Regular day,Regular day,Regular day,Average orders,4,New customer,Low spender,Regular customer



# 6. Exporting Data


In [68]:
# Build the destination path inside the 'Prepared Data' folder,
# then write the merged frame there as ords_prods_deps_cust.pkl
export_path = os.path.join('/Users', 'xxx', 'Documents', 'Instacart Basket Analysis - 2025-01-05', '02 - Data', 'Prepared Data', 'ords_prods_deps_cust.pkl')
ords_prods_deps_cust.to_pickle(export_path)