### Step 3: Import libraries and data

In [13]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [14]:
#import the orders_products_combined
ords_prods_merge = pd.read_pickle(...\02 Data\Prepared Data\orders_products_combined.pkl')
path = ...'
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

In [15]:
#import the customer data
customers = pd.read_csv(...\02 Data\Original Data\customers.csv')
path = ...'
customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

### Step 4 & 5: Wrangle the customers data and run consistency checks

In [16]:
# Wrangling the customers dataset to ensure it is clean, logical, and consistent for analysis.

# 1. View the structure of the customers dataset
# Only a cell's final expression is rendered automatically, so the preview is passed to
# display() explicitly; a bare customers.head() on this line would be evaluated and discarded.
display(customers.head())  # Show the first few rows of the dataset
customers.info()  # Print column names, data types, and non-null counts


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [17]:
# 2. Check for missing values
# Count the missing entries per column; the result shows which columns need handling
# (e.g., filling or dropping).
customers.isna().sum()  # Number of missing values in each column

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [18]:
# 3. Check for duplicate rows
# A row is flagged when it repeats an earlier row in every column; the sum is the number of such rows.
customers.duplicated(keep='first').sum()

0

In [19]:
# 4. Rename columns for clarity
# All renames are applied in a single pass from one mapping (old name -> new name).
# Columns not listed here (user_id, date_joined, income) keep their names.
customers = customers.rename(columns={
    'First Name': 'first_name',
    'Surnam': 'last_name',
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'num_dependants',
    'fam_status': 'civil_status',
})

In [20]:
# Substitute the placeholder 'Unknown' wherever 'first_name' is missing
customers['first_name'] = customers['first_name'].mask(customers['first_name'].isna(), 'Unknown')

In [21]:
# Re-inspect column names, dtypes, and non-null counts after the renaming and the first_name fill
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   user_id         206209 non-null  int64 
 1   first_name      206209 non-null  object
 2   last_name       206209 non-null  object
 3   gender          206209 non-null  object
 4   state           206209 non-null  object
 5   age             206209 non-null  int64 
 6   date_joined     206209 non-null  object
 7   num_dependants  206209 non-null  int64 
 8   civil_status    206209 non-null  object
 9   income          206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [22]:
# 5. Check for mixed data types in each column
# For every column, tally the Python type of each value; more than one type listed
# for a column means that column holds mixed data types.
for col, column_values in customers.items():
    type_counts = column_values.apply(type).value_counts()
    print(f"{col}: {type_counts}")

user_id: user_id
<class 'int'>    206209
Name: count, dtype: int64
first_name: first_name
<class 'str'>    206209
Name: count, dtype: int64
last_name: last_name
<class 'str'>    206209
Name: count, dtype: int64
gender: gender
<class 'str'>    206209
Name: count, dtype: int64
state: state
<class 'str'>    206209
Name: count, dtype: int64
age: age
<class 'int'>    206209
Name: count, dtype: int64
date_joined: date_joined
<class 'str'>    206209
Name: count, dtype: int64
num_dependants: num_dependants
<class 'int'>    206209
Name: count, dtype: int64
civil_status: civil_status
<class 'str'>    206209
Name: count, dtype: int64
income: income
<class 'int'>    206209
Name: count, dtype: int64


### Step 6: Merge the customers data with the orders/products data

In [15]:
# 1. Check the data
# Only a cell's final expression is rendered automatically, so the preview is passed to
# display() explicitly; a bare ords_prods_merge.head() on this line would be evaluated and discarded.
display(ords_prods_merge.head())  # Show the first few rows
ords_prods_merge.info()  # Print column names and data types

<class 'pandas.core.frame.DataFrame'>
Index: 30328763 entries, 5 to 32406040
Data columns (total 20 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   order_id                       int64   
 1   user_id                        int64   
 2   order_number                   int64   
 3   orders_day_of_week             int32   
 4   order_hour_of_day              int32   
 5   days_since_prior_order         float64 
 6   product_id                     int64   
 7   add_to_cart_order              int64   
 8   reordered                      int64   
 9   product_name                   category
 10  aisle_id                       int64   
 11  department_id                  int64   
 12  prices                         float64 
 13  busiest_day                    category
 14  day_summary                    category
 15  busiest_period_of_day          category
 16  loyalty_flag                   category
 17  spending_flag                 

In [23]:
# 2. Merge ords_prods_merge with customers on the 'user_id' column
# how='left' keeps every row of ords_prods_merge; rows whose user_id has no match in
# customers receive NaN in the customer columns. indicator=True adds a '_merge' column
# recording, for each row, whether a match was found.
df_merged = pd.merge(ords_prods_merge, customers, on='user_id', how='left', indicator=True)

# Inspect the indicator before it is dropped in the next step: any 'left_only' rows are
# order rows without customer information.
df_merged['_merge'].value_counts()

In [24]:
# 3. Remove the '_merge' indicator column from the merged DataFrame (modifies df_merged in place)
del df_merged['_merge']

In [25]:
# Inspect the merged DataFrame: row count, column names, and dtypes after the merge and the '_merge' drop
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30328763 entries, 0 to 30328762
Data columns (total 29 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   order_id                       int64   
 1   user_id                        int64   
 2   order_number                   int64   
 3   orders_day_of_week             int32   
 4   order_hour_of_day              int32   
 5   days_since_prior_order         float64 
 6   product_id                     int64   
 7   add_to_cart_order              int64   
 8   reordered                      int64   
 9   product_name                   category
 10  aisle_id                       int64   
 11  department_id                  int64   
 12  prices                         float64 
 13  busiest_day                    category
 14  day_summary                    category
 15  busiest_period_of_day          category
 16  loyalty_flag                   category
 17  spending_flag            

In [26]:
# Save the merged DataFrame as a pickle in the 'Prepared Data' folder so later steps can reload it
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_data.pkl')
df_merged.to_pickle(export_file)