<a href="https://colab.research.google.com/github/awaaat/Machine_learning-Deep_learning/blob/main/Clustering_Survey_Centroid_%2C_Connectivity_and_Density_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

from yellowbrick.cluster import KElbowVisualizer
from matplotlib import pyplot as plt

import matplotlib.cm as cm
import seaborn as sns

%matplotlib inline
# Global plot styling: seaborn's 'whitegrid' axes style is set first, and the
# matplotlib 'fivethirtyeight' style sheet is applied afterwards.
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')


In [None]:
# Download the CSV (shared Google Drive file) into a DataFrame and preview
# its first rows.
csv_url = "https://drive.google.com/uc?export=download&id=1Ih40gYhUN6gHWCNJoPW6n-YcJNhVwmx4"
dataset = pd.read_csv(csv_url, sep=",")
dataset.head()

In [None]:
# Print a structural summary of the loaded frame before any transformation.
dataset.info()

In [None]:
# Scatter of the two features used for clustering, as loaded from the CSV.
plt.figure(figsize=(10, 9))
sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    data=dataset,
    alpha=0.9,
)

In [None]:
# Select the two clustering features by position (columns 3 and 4) and convert
# them to an np.ndarray.
# NOTE(review): assumes columns 3-4 are 'Annual Income (k$)' and
# 'Spending Score (1-100)' -- confirm against the CSV column order.
data_x = dataset.iloc[:, 3:5]
x_array = np.array(data_x)

# Preview the shape and the first rows only; printing the full array floods
# the cell output without adding information.
print(x_array.shape)
print(x_array[:5])

In [None]:
# Feature scaling: min-max scale each feature column so that income and
# spending score are on a comparable scale for distance-based clustering.
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x_array)

# Show a short preview instead of echoing the whole scaled array.
x_scaled[:5]

# **Centroid-based: K Means Clustering**

In [None]:
# Elbow method for finding the optimal k for k-means: fit one model per
# candidate k and record its inertia (sum of squared distances to the nearest
# centroid). The bend ("elbow") in the curve suggests a suitable k.

sum_of_squared_distances = []
K = range(1, 15)

for k in K:
    # Fixed seed so the curve is identical on every Restart & Run All
    # (same seed value the silhouette cell below uses).
    km = KMeans(n_clusters=k, n_init=10, random_state=123)
    km.fit(x_scaled)
    sum_of_squared_distances.append(km.inertia_)

plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('SSE')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Silhouette score plot: score a seeded K-Means model over the candidate k
# range and let yellowbrick mark the best-scoring k.
model = KMeans(random_state=123, n_init=10)
visualizer = KElbowVisualizer(model, k=(2,8), metric='silhouette', timings=False)

visualizer.fit(x_scaled)
# `poof()` is the deprecated name of this method; `show()` is its replacement.
visualizer.show()

In [None]:
# Min-max scale the two clustering features in place on `dataset`, then fit
# the final 5-cluster K-Means model on them. Later cells read the scaled
# values straight from these `dataset` columns.
feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
numerics = dataset[feature_cols]

# MinMaxScaler scales every column independently, so one fit over both
# columns yields the same values as fitting one scaler per column.
scaler = MinMaxScaler()
dataset[feature_cols] = scaler.fit_transform(dataset[feature_cols])

# Fixed seed: the plotting cell further down attaches segment names to
# specific cluster ids, so the ids must not change between runs.
# NOTE(review): re-check the id -> segment-name mapping against
# km.cluster_centers_ now that the seed is pinned.
km = KMeans(n_clusters=5, n_init=10, random_state=123)
y_predicted = km.fit_predict(dataset[feature_cols])
y_predicted

In [None]:
# Store the K-Means label of every row in a new 'Cluster' column and preview
# the first ten rows.
dataset["Cluster"] = y_predicted
dataset.head(10)

In [None]:
# Show the coordinates of all centroids of the fitted K-Means model.
# The model was fitted on the min-max scaled columns, so the coordinates are
# in that scaled space (one row per cluster).
km.cluster_centers_

In [None]:
# Scatter every K-Means cluster in its own colour, labelled with a segment name.
# One (cluster id, colour, segment name) entry per cluster replaces the five
# copy-pasted filter/scatter pairs; the draw order is unchanged.
# NOTE(review): the id -> segment-name mapping is hard-coded -- confirm it
# against km.cluster_centers_ for the fitted model.
cluster_styles = [
    (0, 'green', 'Target Group'),
    (1, 'magenta', 'Sensible'),
    (2, 'orange', 'Careless'),
    (3, 'red', 'Careful'),
    (4, 'blue', 'Standard'),
]

plt.figure(figsize=(12,8))
for cluster_id, colour, segment in cluster_styles:
    members = dataset[dataset['Cluster'] == cluster_id]
    plt.scatter(
        members['Annual Income (k$)'],
        members['Spending Score (1-100)'],
        color=colour,
        label=segment,
    )

plt.title('Clustering Result', fontweight='bold',fontsize=20)
plt.xlabel('Annual Income (scaled)',fontsize=15)
plt.ylabel('Spending Score (scaled)',fontsize=15)
plt.legend(fontsize=15)
plt.grid(True)
plt.show()

# **Connectivity-based: Hierarchical Clustering**

In [None]:
# Same dataset as in the K-Means section: histogram of the (scaled) annual
# income column.

fig, ax = plt.subplots(figsize=(8, 5))
ax.set_title("Annual income distribution", fontsize=15)
ax.set_xlabel("Annual income (scaled)", fontsize=13)
ax.grid(True)
ax.hist(dataset['Annual Income (k$)'], color='blue', edgecolor='k')
plt.show()

In [None]:
# Histogram of the (scaled) spending score column.
fig, ax = plt.subplots(figsize=(8, 5))
ax.set_title("Spending Score distribution", fontsize=15)
ax.set_xlabel("Spending Score (scaled)", fontsize=14)
ax.grid(True)
ax.hist(dataset['Spending Score (1-100)'], color='brown', edgecolor='k')
plt.show()

In [None]:
# Income against spending score for all rows, before hierarchical clustering.
fig, ax = plt.subplots(figsize=(11, 8))
ax.set_title("Annual Income and Spending Score Correlation", fontsize=18)
ax.set_xlabel("Annual Income (scaled)", fontsize=14)
ax.set_ylabel("Spending Score (scaled)", fontsize=14)
ax.grid(True)
ax.scatter(
    dataset['Annual Income (k$)'],
    dataset['Spending Score (1-100)'],
    color='green',
    edgecolor='k',
    alpha=0.6,
    s=100,
)
plt.show()

In [None]:
# We use the 'Ward' linkage criterion (https://en.wikipedia.org/wiki/Ward%27s_method)

# Feature matrix for hierarchical clustering, taken by position from `dataset`.
# NOTE(review): assumes columns 3-4 are the income / spending-score columns
# that the K-Means section min-max scaled in place -- confirm.
X = dataset.iloc[:,[3,4]].values

import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster import hierarchy

plt.figure(figsize=(17,10))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')

# Keep the returned plot data under its own name: assigning it to `dendrogram`
# would overwrite the `dendrogram` function imported above.
ward_linkage = sch.linkage(X, method = 'ward')
dendrogram_data = sch.dendrogram(ward_linkage)
plt.show()

In [None]:
# Draw the Ward dendrogram again, this time with a dashed horizontal cut line
# at height 1.5 and an annotation just above it.
ward_links = linkage(X, method='ward')

fig, ax = plt.subplots(figsize=(15, 6))
ax.set_title('Dendrogram')
ax.set_xlabel('Customers')
ax.set_ylabel('Euclidean distances')
ax.text(50, 1.6, 'Horizontal line crossing 5 vertical lines', fontsize=12)
ax.axhline(y=1.5, c='grey', lw=3, linestyle='dashed')
dendogram = sch.dendrogram(ward_links, ax=ax)

plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (Ward) clustering into 5 groups on the feature matrix X.
hc = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)

# (cluster id, colour, legend label), listed in the order they are drawn.
hc_segments = (
    (0, 'green', 'Target group'),
    (2, 'red', 'Careful'),
    (4, 'magenta', 'Sensible'),
    (3, 'orange', 'Careless'),
    (1, 'blue', 'Standard'),
)

plt.figure(figsize=(15,10))

for segment_id, colour, segment_name in hc_segments:
    in_segment = y_hc == segment_id
    plt.scatter(X[in_segment, 0], X[in_segment, 1], s=100, c=colour, label=segment_name)

plt.title('Clustering Result', fontweight='bold',fontsize=30)
plt.xlabel('Annual Income (scaled)',fontsize=20)
plt.ylabel('Spending Score (scaled)',fontsize=20)
plt.legend(fontsize=14, loc='center right', frameon=True, shadow=True)
plt.grid(True)

# Shade a rectangular region of the plot in yellow; the x-extent is given as a
# fraction of the axes width, the y-extent in data units.
plt.axhspan(ymin=.61, ymax=1, xmin=0.43, xmax=0.96, alpha=0.3, color='yellow')
plt.show()

# **Density-based: DBSCAN**
Density-Based Spatial Clustering of Applications with Noise (DBSCAN)

**Hyperparameters:**
- **eps**: specifies how close points must be to each other to be considered part of the same cluster. If the distance between two points is less than or equal to this value (eps), the points are considered neighbors.
- **minPoints** (called `min_samples` in scikit-learn): the minimum number of data points needed to form a dense region/cluster. For example, if we set minPoints to 5, then we need at least 5 points to form a dense region.

In [None]:
from sklearn.cluster import DBSCAN
import warnings
# Suppress all warnings from this point on.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# pandas / scikit-learn in the cells below -- consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Load, convert, and scale features.
# `.copy()` gives DBSCAN its own frame: the 'Cluster' column added below then
# lands on an independent object instead of on a slice of `dataset`
# (chained assignment).
df = dataset[['Annual Income (k$)','Spending Score (1-100)']].copy()

# The feature matrix is built before the label column is added to `df`, so it
# contains only the two features. NaNs are replaced by 0 and values are
# standardised.
X = np.nan_to_num(df.to_numpy(dtype=np.float64))
X = StandardScaler().fit_transform(X)

# Compute DBSCAN
db = DBSCAN(eps=0.4, min_samples=5).fit(X)

# Boolean mask over all rows: True where the row is a DBSCAN core sample.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
df['Cluster'] = labels

# The label -1 marks noise, so it is not counted as a real cluster.
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

# A sample of clusters: show the frame that carries the DBSCAN labels
# (previously the unlabelled `dataset` columns were printed here).
print(df.head())
# Number of Labels
print("number of labels: ", set(labels))

In [None]:
# Visualize clusters: prepare the colours and masks used by the plotting cell.
# No figure is created here; the plotting cell opens its own.

unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]

# Black (RGBA) is used for noise.
col = [0, 0, 0, 1]

# True for every point that belongs to a cluster, i.e. is not noise (label -1).
# This is set explicitly rather than from the last value of a loop over
# `unique_labels`: a set has no guaranteed order, so that result depended on
# -1 happening to be iterated last.
class_member_mask = labels != -1

In [None]:
# Plot the DBSCAN result on the standardised features.
# Point roles come from DBSCAN (core / border / noise); the colours and
# segment names come from the hierarchical labels `y_hc`, so the segments can
# be compared across the two methods.
plt.figure(figsize=(15,10))

noise_mask = labels == -1                        # DBSCAN marks noise with -1
border_mask = ~core_samples_mask & ~noise_mask   # in a cluster, but not a core sample

# (hierarchical cluster id, colour, legend label), in drawing order.
segment_styles = (
    (0, 'green', 'Target group'),
    (2, 'red', 'Careful'),
    (4, 'magenta', 'Sensible'),
    (3, 'orange', 'Careless'),
    (1, 'blue', 'Standard'),
)

for segment_id, colour, segment_name in segment_styles:
    in_segment = y_hc == segment_id

    # Core samples: large markers, one legend entry per segment.
    core_sel = core_samples_mask & in_segment
    plt.scatter(X[core_sel, 0], X[core_sel, 1], s = 60, c = colour, label = segment_name)

    # Border samples: same colour, smaller marker, no extra legend entry.
    border_sel = border_mask & in_segment
    plt.scatter(X[border_sel, 0], X[border_sel, 1], s = 20, c = colour)

# Noise: the points DBSCAN labelled -1. Previously the border points were drawn
# under this label and the real noise points were never plotted.
plt.plot(X[noise_mask, 0], X[noise_mask, 1], '.', markerfacecolor=(0, 0, 0, 1),
         markeredgecolor='k', markersize=6, label='Noise')

plt.title('Clustering of Customers, Estimated Number of Clusters: %d' % realClusterNum, fontweight='bold',fontsize=20)
plt.xlabel('Annual Income',fontsize=20)
plt.ylabel('Spending Score',fontsize=20)
plt.legend(fontsize=14, loc='center right', frameon=True, shadow=True)
plt.show()

n_noise_ = int(np.count_nonzero(noise_mask))
print('number of noise(s): ', n_noise_)