# 5.針對顧客的重要特徵分群，找出2~3群最有特色的顧客，並解釋其價值與意義。

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.cluster import KMeans ,DBSCAN
from sklearn import cluster,metrics
from mlxtend.frequent_patterns import apriori ,association_rules

# Notebook-wide display configuration.
sns.set(style="whitegrid")  # seaborn "whitegrid" theme for all figures

# pandas display options: never truncate columns, and treat East Asian /
# ambiguous-width characters as wide when laying out text tables.
_pandas_display_options = {
    'display.max_columns': None,
    'display.unicode.ambiguous_as_wide': True,
    'display.unicode.east_asian_width': True,
}
for _option_name, _option_value in _pandas_display_options.items():
    pd.set_option(_option_name, _option_value)

# Render the minus sign correctly on plot axes
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Colab 進行matplotlib繪圖時顯示繁體中文
# 下載台北思源黑體並命名taipei_sans_tc_beta.ttf，移至指定路徑
!wget -O TaipeiSansTCBeta-Regular.ttf https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager

# 改style要在改font之前
# plt.style.use('seaborn')

fontManager.addfont('TaipeiSansTCBeta-Regular.ttf')
mpl.rc('font', family='Taipei Sans TC Beta')

# EDA

In [None]:
# Load the customer CSV from Google Drive.
# NOTE(review): absolute Colab path — presumably requires Drive to be mounted
# at /content/drive before this cell runs; confirm.
customer_csv_path = '/content/drive/MyDrive/大三/上學期/大數據決策/期末報告/customer_data_handled.csv'
df = pd.read_csv(customer_csv_path)

# Bare last expression so the notebook renders the loaded frame.
df

# Processing Data

In [None]:
# Cluster customers on two features: total revenue and number of referrals.
# NOTE(review): the two columns are passed to KMeans unscaled; if their ranges
# differ a lot, the larger-valued column will dominate the Euclidean distance.
# Confirm this is intended before interpreting the clusters.
feature_columns = ['總收入', '推薦次數']
x = df[feature_columns]

# 評估指標

# 輪廓係數(Silhouette Evaluation)

輪廓係數的值越接近1，表示聚類的效果越好

In [None]:
# Mean silhouette score of a KMeans fit for each cluster count from 2 to 8.
candidate_counts = range(2, 9)
silhouette_avg = []
for n_clusters in candidate_counts:
    kmeans_fit = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    )
    kmeans_fit.fit(x)
    score = metrics.silhouette_score(x, kmeans_fit.labels_)
    silhouette_avg.append(score)
print(f"輪廓係數 = {silhouette_avg}")

# Score versus cluster count, drawn as a blue line with "x" markers.
plt.plot(candidate_counts, silhouette_avg, 'bx-')
plt.title('silhouette')
plt.xlabel('No of clusters')
plt.ylabel('Avg')
plt.show()

# WCSS 損失函數 (within-cluster sum of squares)

Elbow method，手肘法
透過嘗試多種類別個數，並將相應的 WCSS 記錄下來並且畫出，理想上最佳的K值應該是最大轉折處

In [None]:
# Elbow method: fit KMeans for 1..8 clusters and record each fit's inertia
# (within-cluster sum of squares).
# The loop variables are deliberately NOT named `k` / `kmeans`: those names
# hold the chosen cluster count and the final model in a later cell, and
# re-running this cell must not overwrite them.
cluster_counts = range(1, 9)
wcss = []
for n_clusters in cluster_counts:
    elbow_model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42).fit(x)
    wcss.append(elbow_model.inertia_)

plt.plot(cluster_counts, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('No of clusters')
plt.ylabel('WCSS')
# Annotate every point with "(cluster count, WCSS truncated to an integer)"
for n_clusters, inertia in zip(cluster_counts, wcss):
    plt.text(n_clusters, inertia, f'({n_clusters},{int(inertia)})', ha='center', va='bottom')
plt.show()

結合上面的輪廓係數評估，我們認為聚類數量為3是一個合適的選擇

# KMeans Clustering

In [None]:
# Final model with the cluster count chosen from the evaluation plots.
k = 3
# Use the same KMeans settings as the silhouette / elbow runs. `n_init` is
# passed explicitly because scikit-learn's default changed from 10 to 'auto'
# (a single run for k-means++) in version 1.4; relying on the default would
# make the final model differ from the configuration that was evaluated.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
clusters = kmeans.fit_predict(x)

# Attach each customer's cluster label (0 .. k-1) to the original table.
# NOTE: label numbers are assigned arbitrarily by KMeans; they carry no order.
df['Income_Recommend_Cluster'] = clusters

df.head()

In [None]:
# Per-cluster summary: mean and maximum of total revenue and referral count.
# Local names are used instead of `x` / `y` so the feature matrix `x` used by
# the KMeans cells is not overwritten with a scalar when this cell runs.
for i in range(k):
    members = df[df['Income_Recommend_Cluster'] == i]
    mean_income = members['總收入'].mean()
    mean_referrals = members['推薦次數'].mean()
    print(f'第{i+1}群 平均收入:{mean_income} 平均推薦次數:{mean_referrals}')
for i in range(k):
    members = df[df['Income_Recommend_Cluster'] == i]
    max_income = members['總收入'].max()
    max_referrals = members['推薦次數'].max()
    print(f'第{i+1}群 最大收入:{max_income} 最大推薦次數:{max_referrals}')

# Plot

In [None]:
# Scatter plot of total revenue vs. referral count, one series per cluster.
fig, ax = plt.subplots(figsize=(8, 6))
for cluster_id in range(k):
    in_cluster = df['Income_Recommend_Cluster'] == cluster_id
    ax.scatter(
        df.loc[in_cluster, '總收入'],
        df.loc[in_cluster, '推薦次數'],
        label=f'Cluster {cluster_id + 1}',  # legend uses 1-based cluster numbers
    )
ax.set_xlabel('總收入')
ax.set_ylabel('推薦次數')
ax.set_title('Clusters of Customers')
ax.legend()
plt.show()

由圖表可知，第三群的總收入與推薦次數都是最高的。

相較於前兩群，第三群的顧客平均收入高、推薦次數也高，因此第三群是最值得推銷的客群。