In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cycler import cycler

%matplotlib inline

# Set color cycle for all plots
plt.rcParams["axes.prop_cycle"] = cycler(
    color=[plt.get_cmap("Set2")(i) for i in range(9)]
)

## Overview
1. load the data from df_train.csv
2. Extract the numerical features from the data (df_numeric)
3. Apply DBSCAN and compare the shilouette score
 - df_numeric
 - standardized df_numeric
 - scaled df_numeric
 - standardized and scaled df_numeric

## 1. load the data

In [2]:
from utils import get_df_description

# load df
df = pd.read_csv('../datasets/df_train.csv')

# load json file
description_json = json.load(open('df_train_description.json', 'r'))
description_json_df = pd.DataFrame(description_json)

# get df_description
df_description = get_df_description(df, description_json_df)
df_description

Unnamed: 0,column,dtype,missing_values,source,description
0,customer_unique_id,object,0,df_customers,PK
1,frequency,int64,0,Calculated,number of orders
2,repeater,int64,0,Calculated,"1:repeater, 0:non-repeater"
3,fo_order_id,object,0,df_orders,FK
4,fo_customer_id,object,0,df_orders,FK (first order's customer_id)
5,fo_order_status,object,0,df_orders,
6,fo_order_purchase_timestamp,object,0,df_orders,
7,fo_order_approved_at,object,0,df_orders,
8,fo_order_delivered_carrier_date,object,0,df_orders,
9,fo_order_delivered_customer_date,object,0,df_orders,


## 2. Prepare df_numeric

In [3]:
## Selecting numeric columns
df_numeric = df.select_dtypes(include=['number']).dropna()
print(df_numeric.shape)
df_numeric.head()

(3456, 12)


Unnamed: 0,frequency,repeater,customer_zip_code_prefix,recency,monetary,fo_payment_value,fo_is_daytime,fo_is_weekday,fo_delivery_delay_days,fo_review_score_mean,fo_voucher_payment_value,fo_voucher_used
19,1,0,72872,114,78.42,78.42,1,1,-15.0,3.0,69.89,1
55,1,0,13273,308,354.87,354.87,1,1,-6.0,3.0,293.54,1
73,1,0,77600,482,109.78,109.78,1,1,-17.0,5.0,31.73,1
165,1,0,3904,113,79.51,79.51,1,1,-21.0,5.0,79.51,1
232,1,0,13920,289,66.91,66.91,1,0,-20.0,4.0,43.92,1


## 3. Apply DBSCAN and compare the shilouette score
 - df_numeric
 - standardized df_numeric
 - scaled df_numeric
 - standardized and scaled df_numeric

In [9]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from utils import standardize, normalize

# DBSCANの初期設定例（epsとmin_samplesはデータに応じて調整）
dbscan = DBSCAN(eps=0.5, min_samples=5)

## 4-1. Original data
dbscan.fit(df_numeric)
label_nums_orig = len(set(dbscan.labels_))
if label_nums_orig > 1:
    silhouette_orig = silhouette_score(df_numeric, dbscan.labels_)
else:
    silhouette_orig = 'N/A'  # Not applicable

# 4-2. Standardized data
df_standardized = standardize(df_numeric)
dbscan.fit(df_standardized)
label_nums_std = len(set(dbscan.labels_))
if label_nums_std > 1:
    silhouette_std = silhouette_score(df_standardized, dbscan.labels_)
else:
    silhouette_std = 'N/A'

# 4-3. Normalized data
df_normalized = normalize(df_numeric)
dbscan.fit(df_normalized)
label_nums_norm = len(set(dbscan.labels_))
if label_nums_norm > 1:
    silhouette_norm = silhouette_score(df_normalized, dbscan.labels_)
else:
    silhouette_norm = 'N/A'

# 4-4. Standardized and normalized data
df_std_norm = normalize(df_standardized)
dbscan.fit(df_std_norm)
label_nums_std_norm = len(set(dbscan.labels_))
if label_nums_std_norm > 1:
    silhouette_std_norm = silhouette_score(df_std_norm, dbscan.labels_)
else:
    silhouette_std_norm = 'N/A'

In [10]:
## See results
print(f'Original Data Silhouette Score: {silhouette_orig}')
print(f'Standardized Data Silhouette Score: {silhouette_std}')
print(f'Normalized Data Silhouette Score: {silhouette_norm}')
print(f'Standardized and Normalized Data Silhouette Score: {silhouette_std_norm}')

## print the number of clusters
print(f'Original Data: {label_nums_orig} clusters')
print(f'Standardized Data: {label_nums_std} clusters')
print(f'Normalized Data: {label_nums_norm} clusters')
print(f'Standardized and Normalized Data: {label_nums_std_norm} clusters')

Original Data Silhouette Score: N/A
Standardized Data Silhouette Score: -0.345096769998792
Normalized Data Silhouette Score: 0.4651226870678635
Standardized and Normalized Data Silhouette Score: 0.4651226870678635
Original Data: 1 clusters
Standardized Data: 24 clusters
Normalized Data: 9 clusters
Standardized and Normalized Data: 9 clusters


## Conclusion
- Original Data Silhouette Score (N/A):
  - Indicates DBSCAN failed to form valid clusters with the original dataset, possibly due to inappropriate eps and min_samples settings for the data's characteristics.
- Standardized Data Silhouette Score (-0.3451):
  - A negative silhouette score suggests poor clustering quality, where intra-cluster distances are greater than inter-cluster distances, indicating standardization may not have been suitable for this dataset.
- Normalized Data Silhouette Score (0.4651) and Standardized and Normalized Data Silhouette Score (0.4651):
  - Both scenarios achieving the same positive silhouette score indicate significant improvement in clustering performance with normalized data. The identical scores suggest that, in this case, standardization post-normalization did not contribute additional effects to the clustering outcome.
- Reason for Identical Scores in the Third and Fourth Scenarios:
  - Implies that after normalization, which adjusts data scales to a range between 0 and 1, subsequent standardization did not impact the clustering result. This suggests that normalization alone was sufficient to enhance DBSCAN's performance for this dataset.
- Overall Insight:
  - The results underscore the substantial impact of data preprocessing on the performance of clustering algorithms, particularly DBSCAN in this context. It highlights the importance of selecting suitable preprocessing techniques to improve clustering outcomes, with normalization proving to be key in this instance.