In [12]:
import pandas as pd
import numpy as np
import os


# Load customer data
customers = pd.read_csv('/Users/cem/Desktop/Data Immersion/Achievement 4_Project/Data/customers.csv')

# Load the main merged dataset
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that were produced by this project.
ords_prods_merge = pd.read_pickle('/Users/cem/Desktop/Data Immersion/Achievement 4_Project/Data/Prepared Data/ords_prods_merged_labeled.pkl')


# A notebook cell renders only its last expression automatically, so the
# intermediate summaries are displayed explicitly; otherwise the results of
# head() and describe() would be computed and silently thrown away.
display(customers.head())
customers.info()  # prints its report directly
display(customers.describe())
customers.isnull().sum()  # missing values per column (rendered as the cell output)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [13]:
# Map the raw column headers to consistent snake_case names. Columns whose
# name is already in the desired form (date_joined, income) need no entry.
column_renames = {
    'First Name': 'first_name',
    'Surnam': 'last_name',  # assuming 'Surnam' is a typo of 'Surname'
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'num_dependents',
    'fam_status': 'marital_status',
}

# Columns that are not used in the analysis.
columns_to_drop = ['first_name', 'last_name', 'date_joined']

# Build the cleaned frame without inplace mutation. rename() ignores labels
# that are absent, and errors='ignore' makes drop() do the same, so running
# this cell a second time no longer raises KeyError for the columns that
# were already removed.
customers = (
    customers
    .rename(columns=column_renames)
    .drop(columns=columns_to_drop, errors='ignore')
)

print(customers.columns)

Index(['user_id', 'gender', 'state', 'age', 'num_dependents', 'marital_status',
       'income'],
      dtype='object')


## Wrangling Customer Data

I decided to clean the customer dataset by:
- Renaming inconsistent or unclear columns (First Name, Surnam, STATE, etc.) to standard lowercase names.
- Dropping unnecessary columns that I think don't add value to the analysis (first_name, last_name, date_joined).
- Ensuring consistent naming conventions (snake_case) across the entire project.

This prepares the customer data for merging with the transaction-level dataset.


In [14]:
## Q5: count the missing values left in each column
customers.isna().sum()


user_id           0
gender            0
state             0
age               0
num_dependents    0
marital_status    0
income            0
dtype: int64

In [15]:
## Count the rows that exactly repeat an earlier row
duplicate_row_count = customers.duplicated().sum()
duplicate_row_count


np.int64(0)

In [16]:
# Remove fully duplicated rows, keeping the first occurrence of each,
# then list the dtype of every column.
customers = customers.drop_duplicates(keep='first')
customers.dtypes


user_id            int64
gender            object
state             object
age                int64
num_dependents     int64
marital_status    object
income             int64
dtype: object

In [17]:
# Print the frequency table of each categorical column so that misspelled
# or inconsistently cased labels would stand out.
for column_name in ('gender', 'marital_status', 'state'):
    print(customers[column_name].value_counts())


gender
Male      104067
Female    102142
Name: count, dtype: int64
marital_status
married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: count, dtype: int64
state
Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts     

## Data Quality Checks on Customer Data

I checked the customers dataset for:
- **Missing values** using `.isnull().sum()`
- **Duplicate rows** using `.duplicated().sum()`
- **Correct data types** using `.dtypes`
- **Inconsistent categorical entries** in `gender`, `state`, and `marital_status`

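The only missing values were in `First Name`, a column that was dropped; no duplicate rows were found, the data types needed no changes, and no inconsistent labels were apparent in the categorical columns, so the data is clean and ready for merging.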


In [18]:
## Q6: print the dtype of the join key in each frame before merging
for frame in (ords_prods_merge, customers):
    print(frame['user_id'].dtype)


int64
int64


In [19]:
## Merging two dataframes

# Each order row should match at most one customer row. A repeated user_id in
# `customers` would silently duplicate order rows in the left merge, so fail
# early if the key is not unique.
assert customers['user_id'].is_unique, 'customers contains duplicate user_id values'

# Left merge keeps every row of the order/product data and attaches the
# customer attributes that share its user_id.
ords_prods_merged_final = pd.merge(ords_prods_merge, customers, on='user_id', how='left')

# A notebook cell renders only its last expression automatically, so the
# intermediate checks are shown explicitly instead of being discarded.
display(ords_prods_merged_final.head())
print(ords_prods_merged_final.shape)

# Spot-check a few merged rows; the fixed seed shows the same rows on every run.
ords_prods_merged_final[['user_id', 'gender', 'age', 'income']].sample(5, random_state=42)


Unnamed: 0,user_id,gender,age,income
11780905,74570,Male,57,125083
14705658,93101,Male,62,126907
26492706,168446,Male,44,145092
14023620,88797,Male,56,109013
19138114,121180,Male,41,159032


In [None]:
## Save the merged dataframe as a pickle file in the prepared-data folder

# Adjacent string literals are concatenated by Python, so this is one path.
output_path = (
    '/Users/cem/Desktop/Data Immersion/Achievement 4_Project/Data/'
    'Prepared Data/ords_prods_merged_final.pkl'
)

ords_prods_merged_final.to_pickle(output_path)
