Dataset: [Customer Personality Analysis](https://drive.google.com/file/d/19TUlAkMBRQi4MKfimeYBxCrFSeYk0ZGr/view?usp=sharing)

## Importing Libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
import warnings
import inflection

from matplotlib import rcParams
rcParams['figure.figsize'] = (12, 5)
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['axes.labelsize'] = 14
rcParams['axes.titlesize'] = 14
sns.set_style('whitegrid')

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from tqdm.auto import tqdm

nK = 10
SEED = 10

COLOR = '#319DA0'

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')



def load_dataset(path):
    df = pd.read_csv(path)
    return df

# Task 2 - Data Cleaning & Preprocessing

**Tahapan Pengerjaan**
1. Mengatasi data null dan duplicated
2. Membuang data yang tidak diperlukan
3. Melakukan feature encoding
4. Melakukan standardisasi pada feature

## Handling Missing Value

In [25]:
total_null = pd.DataFrame(df.isna().sum()).T.rename({0:'total_null'})
percentage_null = pd.DataFrame(100 * df.isna().sum() / df.shape[0]).T.rename({0:'percentage_null'})
data_null = pd.concat([total_null, percentage_null]).T
mask = data_null['total_null'] != 0
data_null.loc[mask].sort_values('total_null', ascending=True)

Unnamed: 0,total_null,percentage_null
income,24.0,1.071429


We have null values in Income column, with 24 rows (1.07%). we will remove it

In [26]:
df = df.dropna()

## Handling Duplicated Data
There is no duplicated data

In [27]:
df.duplicated().sum()

0

## Feature Encoding

In [28]:
# Parsing categorical columns into integer
def encode(df):
    
    df = df.copy()
    
    def edu(enc):
        if enc == 'SMA':
            return 1
        elif enc == 'D3':
            return 2
        elif enc == 'S1':
            return 3
        elif enc == 'S2':
            return 4
        else:
            return 5
    df['education'] = df['education'].apply(edu)
    
    def marsta(sta):
        if sta == 'lajang':
            return 1
        elif sta == 'bertunangan':
            return 2
        elif sta == 'menikah':
            return 3
        elif sta == 'cerai':
            return 4
        elif sta == 'janda':
            return 5
        else:
            return 6
    df['marital_status'] = df['marital_status'].apply(marsta)

    def age_map(age):
        if age == 'child':
            return 1
        elif age == 'teens':
            return 2
        elif age == 'mature':
            return 3
        elif age == 'middle_aged':
            return 4
        else:
            return 5
    df['map_age'] = df['map_age'].apply(age_map)
    
    return df

df = encode(df)
df.head()

Unnamed: 0,id,year_birth,age,map_age,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mnt_coke,mnt_fruits,mnt_meat_products,mnt_fish_products,mnt_sweet_products,mnt_gold_prods,num_deals_purchases,num_web_purchases,num_catalog_purchases,num_store_purchases,num_web_visits_month,accepted_cmp3,accepted_cmp4,accepted_cmp5,accepted_cmp1,accepted_cmp2,complain,z_cost_contact,z_revenue,response,join_at_age,total_children,is_parents,total_spent,total_accepted_camp,total_transaction,cvr
0,5524,1957,65,5,3,6,58138000.0,0,0,2012-04-09,58,635000,88000,546000,172000,88000,88000,3,8,10,4,7,0,0,0,0,0,0,3,11,1,55,0,0,1617000,0,25,3.571429
1,2174,1954,68,5,3,6,46344000.0,1,1,2014-08-03,38,11000,1000,6000,2000,1000,6000,2,1,1,2,5,0,0,0,0,0,0,3,11,0,60,2,1,27000,0,6,1.2
2,4141,1965,57,5,3,6,71613000.0,0,0,2013-08-21,26,426000,49000,127000,111000,21000,42000,1,8,2,10,4,0,0,0,0,0,0,3,11,0,48,0,0,776000,0,21,5.25
3,6182,1984,38,4,3,6,26646000.0,1,0,2014-10-02,26,11000,4000,20000,10000,3000,5000,2,2,0,4,6,0,0,0,0,0,0,3,11,0,30,1,1,53000,0,8,1.333333
4,5324,1981,41,4,5,6,58293000.0,1,0,2014-01-19,94,173000,43000,118000,46000,27000,15000,5,5,3,6,5,0,0,0,0,0,0,3,11,0,33,1,1,422000,0,19,3.8


## Feature Standardization

In [29]:
def data_scaling(df, numerical, scaler):
    df[numerical] = scaler.fit_transform(df[numerical])
    return df[numerical]

df_sum = df.copy()
df_scaled = df.copy()

df_scaled[numerical] = data_scaling(df, numerical, scaler=StandardScaler())
df_scaled.head(3)

Unnamed: 0,id,year_birth,age,map_age,education,marital_status,income,kidhome,teenhome,dt_customer,recency,mnt_coke,mnt_fruits,mnt_meat_products,mnt_fish_products,mnt_sweet_products,mnt_gold_prods,num_deals_purchases,num_web_purchases,num_catalog_purchases,num_store_purchases,num_web_visits_month,accepted_cmp3,accepted_cmp4,accepted_cmp5,accepted_cmp1,accepted_cmp2,complain,z_cost_contact,z_revenue,response,join_at_age,total_children,is_parents,total_spent,total_accepted_camp,total_transaction,cvr
0,5524,1957,0.986443,5,3,6,0.234063,-0.823039,-0.928972,2012-04-09,0.310532,0.978226,1.549429,1.690227,2.454568,1.484827,0.850031,0.351713,1.428553,2.504712,-0.554143,0.693232,0,0,0,0,0,0,0.0,0.0,1,0.897752,-1.264803,0,1.675488,0,25,3.571429
1,2174,1954,1.236801,5,3,6,-0.234559,1.039938,0.909066,2014-08-03,-0.380509,-0.872024,-0.637328,-0.717986,-0.651038,-0.63388,-0.732867,-0.168231,-1.125881,-0.571082,-1.169518,-0.131574,0,0,0,0,0,0,0.0,0.0,0,1.313688,1.405806,1,-0.962358,0,6,1.2
2,4141,1965,0.318822,5,3,6,0.769478,-0.823039,-0.928972,2013-08-21,-0.795134,0.358511,0.569159,-0.178368,1.340203,-0.146821,-0.037937,-0.688176,1.428553,-0.229327,1.291982,-0.543978,0,0,0,0,0,0,0.0,0.0,0,0.315443,-1.264803,0,0.28025,0,21,5.25


## Drop Unused Features

In [30]:
print(f'before drop redundant features {df_scaled.shape[1]}')

#Dropping some of the redundant features
to_drop = ['id', 'year_birth', 'map_age', 'education', 'marital_status', 'dt_customer', 'z_cost_contact', 'z_revenue',]
X = df_scaled.drop(to_drop, axis=1).copy()

print(f'after drop redundant features {X.shape[1]}')

before drop redundant features 38
after drop redundant features 30
