# Mall Customers
- Andrea Cohen
- 04.25.2023

## Task:
- To segment the customers based on age, gender, and interest.

## Data Source:
- https://www.kaggle.com/datasets/shwetabh123/mall-customers

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## Load and inspect the data

In [2]:
# load the dataset, using CustomerID as the row index so the ID is not treated as a feature
df = pd.read_csv('Data/Mall_Customers.csv', index_col = 'CustomerID')
display(df.head())
# info() prints its report itself and returns None, so call it directly;
# wrapping it in display() would also render a stray "None" below the report
df.info()

Unnamed: 0_level_0,Genre,Age,Annual Income (k$),Spending Score (1-100)
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Genre                   200 non-null    object
 1   Age                     200 non-null    int64 
 2   Annual Income (k$)      200 non-null    int64 
 3   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 7.8+ KB


None

In [3]:
# size of the dataset as a (rows, columns) tuple
df.shape

(200, 4)

- There are 200 rows and 4 columns.

In [4]:
# list the dtype of every column to see which are numeric and which are categorical (object)
df.dtypes

Genre                     object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object

- Genre (the dataset's column name for the customer's gender) is datatype object.
- Age, Annual Income (k$), and Spending Score (1-100) are all datatype int64.

In [5]:
# summary statistics for the numeric columns; scan min/max and the quartiles
# for outliers and obviously impossible values
df.describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


- There are no obvious outliers or errors in the data.

In [6]:
# count rows whose column values all repeat an earlier row
# (CustomerID is the index, so it is not part of the comparison)
df.duplicated().sum()

0

- There are 0 duplicates.

In [7]:
# count missing values in each column
df.isna().sum()

Genre                     0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

- There are 0 missing values.

In [8]:
# tally each distinct Genre label to spot misspellings or inconsistent capitalization
df['Genre'].value_counts()

Female    112
Male       88
Name: Genre, dtype: int64

- There are no inconsistencies in the categorical data.

## One-Hot Encode the categorical data

In [9]:
#make column selector that picks out the object-dtype (categorical) columns
cat_selector = make_column_selector(dtype_include = 'object')
#select categorical columns
cat_data = df[cat_selector(df)]
#instantiate one hot encoder with dense (ndarray) output
#scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name,
#so try the current keyword first and fall back to the old one on older installs
try:
    ohe = OneHotEncoder(sparse_output = False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse = False, handle_unknown='ignore')
#fit and transform categorical data
df_ohe = ohe.fit_transform(cat_data)
#extract column names (one per category, prefixed with the source column name)
ohe_column_names = ohe.get_feature_names_out(cat_data.columns)
#convert to a dataframe (no index is passed, so it gets a fresh 0-based RangeIndex)
ohe_df = pd.DataFrame(df_ohe, columns = ohe_column_names)
ohe_df.head()

Unnamed: 0,Genre_Female,Genre_Male
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


## Scale the numeric data

In [10]:
#selector that matches every numeric-dtype column
num_selector = make_column_selector(dtype_include = 'number')
#resolve the selector once; the names are reused to label the scaled frame
num_cols = num_selector(df)
#pull out just the numeric columns
num_data = df[num_cols]
#standardize each numeric column with a StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(num_data)
#wrap the scaled array back into a dataframe under the original column names
scaled_df = pd.DataFrame(df_scaled, columns = num_cols)
scaled_df.head()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
0,-1.424569,-1.738999,-0.434801
1,-1.281035,-1.738999,1.195704
2,-1.352802,-1.70083,-1.715913
3,-1.137502,-1.70083,1.040418
4,-0.563369,-1.66266,-0.39598


## Concatenate the transformed dataframes

In [12]:
# join the scaled numeric columns and the one-hot encoded columns side by side;
# axis=1 aligns rows on the index, which both frames share
df_processed = pd.concat([scaled_df, ohe_df], axis=1)
display(df_processed.head())
# info() prints its report itself and returns None, so call it directly;
# wrapping it in display() would also render a stray "None" below the report
df_processed.info()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Genre_Female,Genre_Male
0,-1.424569,-1.738999,-0.434801,0.0,1.0
1,-1.281035,-1.738999,1.195704,0.0,1.0
2,-1.352802,-1.70083,-1.715913,1.0,0.0
3,-1.137502,-1.70083,1.040418,1.0,0.0
4,-0.563369,-1.66266,-0.39598,1.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     200 non-null    float64
 1   Annual Income (k$)      200 non-null    float64
 2   Spending Score (1-100)  200 non-null    float64
 3   Genre_Female            200 non-null    float64
 4   Genre_Male              200 non-null    float64
dtypes: float64(5)
memory usage: 7.9 KB


None