# KMeans Clustering

In [None]:
# Library and Data Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Absolute location of the credit-card CSV (path layout suggests a mounted
# Google Drive in Colab -- adjust when running elsewhere).
file_path = '/content/drive/MyDrive/INFO-614 Data Mining/HW4/card.csv'
df = pd.read_csv(filepath_or_buffer = file_path)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Attributes Selection
# Keep only the two frequency columns that the clustering below works on.
data = df.loc[:, ['BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY']]

In [None]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from `data`, then rescale those same rows.
scaler = MinMaxScaler()
scaler.fit(data)
data_scale = scaler.transform(data)

In [None]:
# K-Means
from sklearn.cluster import KMeans

# Partition the scaled features into k clusters; the fixed random_state keeps
# the result repeatable between runs.
k = 4
model = KMeans(n_clusters = k, random_state = 15)
# fit_predict() both fits the model and returns the labels, so the separate
# model.fit() call that used to precede it only trained the model twice.
df['cluster'] = model.fit_predict(data_scale)

In [None]:
# Visualization
import matplotlib.pyplot as plt

plt.figure(figsize = (8, 8))

# Draw one scatter series per cluster label 0..k-1, using the labels stored
# in df['cluster'].
for cluster_id in range(k):
    members = df[df['cluster'] == cluster_id]
    plt.scatter(members['BALANCE_FREQUENCY'], members['PURCHASES_FREQUENCY'],
                label = 'cluster ' + str(cluster_id))

plt.title('K = %d results'%k , size = 15)
plt.xlabel('BALANCE_FREQUENCY', size = 12)
plt.ylabel('PURCHASES_FREQUENCY', size = 12)
plt.legend()
plt.show()

In [None]:
# Applying Elbow Method to Find Optimal K

from yellowbrick.cluster import KElbowVisualizer

# Seed the estimator so the elbow curve is reproducible from run to run,
# matching the seeded KMeans models used in the rest of the notebook.
model = KMeans(random_state = 10)
visualizer = KElbowVisualizer(model, k=(1,10))
visualizer.fit(data_scale)
# Finalize and render the figure explicitly, as the silhouette cells do.
visualizer.show()

In [None]:
# KMeans Clustering Validation using Silhouette Coefficient

from yellowbrick.cluster import SilhouetteVisualizer

# Render one silhouette plot per candidate cluster count. `k`, `model` and
# `visualizer` end up holding the values of the last iteration (k = 6).
for k in (3, 4, 5, 6):
    model = KMeans(n_clusters = k, init = 'k-means++', random_state = 10)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(data_scale)
    visualizer.show()

In [None]:
# Optimized K Value
k = 3
model = KMeans(n_clusters = k, random_state = 10)
# fit_predict() fits and labels in one call; the extra model.fit() that used
# to run first just repeated the same training.
df['cluster'] = model.fit_predict(data_scale)

In [None]:
# Visualizing
import matplotlib.pyplot as plt

plt.figure(figsize = (8, 8))

# Draw one scatter series per cluster label 0..k-1, using the labels stored
# in df['cluster'].
for cluster_id in range(k):
    members = df[df['cluster'] == cluster_id]
    plt.scatter(members['BALANCE_FREQUENCY'], members['PURCHASES_FREQUENCY'],
                label = 'cluster ' + str(cluster_id))

plt.title('K = %d results'%k , size = 15)
plt.xlabel('BALANCE_FREQUENCY', size = 12)
plt.ylabel('PURCHASES_FREQUENCY', size = 12)
plt.legend()
plt.show()

In [None]:
# Trying Another K-Means Model

k = 5
model = KMeans(n_clusters = k, random_state = 10)
# fit_predict() fits and labels in one call; the extra model.fit() that used
# to run first just repeated the same training.
df['cluster'] = model.fit_predict(data_scale)

In [None]:
# Visualizing
import matplotlib.pyplot as plt

plt.figure(figsize = (8, 8))

# Draw one scatter series per cluster label 0..k-1, using the labels stored
# in df['cluster'].
for cluster_id in range(k):
    members = df[df['cluster'] == cluster_id]
    plt.scatter(members['BALANCE_FREQUENCY'], members['PURCHASES_FREQUENCY'],
                label = 'cluster ' + str(cluster_id))

plt.title('K = %d results'%k , size = 15)
plt.xlabel('BALANCE_FREQUENCY', size = 12)
plt.ylabel('PURCHASES_FREQUENCY', size = 12)
plt.legend()
plt.show()

In [None]:
# Trying Another K-Means Model

k = 6
model = KMeans(n_clusters = k, random_state = 10)
# fit_predict() fits and labels in one call; the extra model.fit() that used
# to run first just repeated the same training.
df['cluster'] = model.fit_predict(data_scale)

In [None]:
# Visualizing
import matplotlib.pyplot as plt

plt.figure(figsize = (8, 8))

# Draw one scatter series per cluster label 0..k-1, using the labels stored
# in df['cluster'].
for cluster_id in range(k):
    members = df[df['cluster'] == cluster_id]
    plt.scatter(members['BALANCE_FREQUENCY'], members['PURCHASES_FREQUENCY'],
                label = 'cluster ' + str(cluster_id))

plt.title('K = %d results'%k , size = 15)
plt.xlabel('BALANCE_FREQUENCY', size = 12)
plt.ylabel('PURCHASES_FREQUENCY', size = 12)
plt.legend()
plt.show()

# Hierarchical Clustering

In [None]:
# Attributes Selection
# Keep only the two frequency columns used for hierarchical clustering.
data = df.loc[:, ['BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY']]

In [None]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from `data`, then rescale those same rows.
scaler = MinMaxScaler()
scaler.fit(data)
data_scale = scaler.transform(data)
# `X` is the alias the agglomerative-clustering cells index into.
X = data_scale

In [None]:
# Agglomerative Clustering (Max)
from sklearn.cluster import AgglomerativeClustering

# Complete ("max") linkage, 5 clusters. Euclidean distance is the estimator's
# default, so the `affinity` keyword (deprecated in scikit-learn 1.2, removed
# in 1.4 in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'complete')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Agglomerative Clustering (Min)
from sklearn.cluster import AgglomerativeClustering

# Single ("min") linkage, 5 clusters. Euclidean distance is the estimator's
# default, so the `affinity` keyword (deprecated in scikit-learn 1.2, removed
# in 1.4 in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'single')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Agglomerative Clustering (Average)
from sklearn.cluster import AgglomerativeClustering

# Average linkage, 5 clusters. Euclidean distance is the estimator's default,
# so the `affinity` keyword (deprecated in scikit-learn 1.2, removed in 1.4
# in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'average')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Validation using Silhouette Coefficient
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics

# Complete linkage, 5 clusters. `affinity` is dropped (removed in
# scikit-learn 1.4; euclidean is the default), and the redundant model.fit()
# is gone because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 5, linkage = 'complete')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

In [None]:
# Single linkage, 5 clusters. `affinity` is dropped (removed in scikit-learn
# 1.4; euclidean is the default), and the redundant model.fit() is gone
# because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 5, linkage = 'single')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

In [None]:
# Average linkage, 5 clusters. `affinity` is dropped (removed in scikit-learn
# 1.4; euclidean is the default), and the redundant model.fit() is gone
# because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 5, linkage = 'average')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

# Hierarchical Clustering with Alternative Numbers of Clusters

In [None]:
# Agglomerative Clustering With 4 Clusters (Max)
from sklearn.cluster import AgglomerativeClustering

# Complete ("max") linkage, 4 clusters. Euclidean distance is the estimator's
# default, so the `affinity` keyword (deprecated in scikit-learn 1.2, removed
# in 1.4 in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 4, linkage = 'complete')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green', 'cyan']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Agglomerative Clustering With 4 Clusters (Min)
from sklearn.cluster import AgglomerativeClustering

# Single ("min") linkage, 4 clusters. Euclidean distance is the estimator's
# default, so the `affinity` keyword (deprecated in scikit-learn 1.2, removed
# in 1.4 in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 4, linkage = 'single')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green', 'cyan']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Agglomerative Clustering with 4 Clusters (Average)
from sklearn.cluster import AgglomerativeClustering

# Average linkage, 4 clusters. Euclidean distance is the estimator's default,
# so the `affinity` keyword (deprecated in scikit-learn 1.2, removed in 1.4
# in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 4, linkage = 'average')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green', 'cyan']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Validation using Silhouette Coefficient
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics

# Complete linkage, 4 clusters. `affinity` is dropped (removed in
# scikit-learn 1.4; euclidean is the default), and the redundant model.fit()
# is gone because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 4, linkage = 'complete')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

In [None]:
# Single linkage, 4 clusters. `affinity` is dropped (removed in scikit-learn
# 1.4; euclidean is the default), and the redundant model.fit() is gone
# because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 4, linkage = 'single')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

In [None]:
# Average linkage, 4 clusters. `affinity` is dropped (removed in scikit-learn
# 1.4; euclidean is the default), and the redundant model.fit() is gone
# because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 4, linkage = 'average')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

In [None]:
# Agglomerative Clustering With 3 Clusters (Max)
from sklearn.cluster import AgglomerativeClustering

# Complete ("max") linkage, 3 clusters. Euclidean distance is the estimator's
# default, so the `affinity` keyword (deprecated in scikit-learn 1.2, removed
# in 1.4 in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 3, linkage = 'complete')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Agglomerative Clustering With 3 Clusters (Min)
from sklearn.cluster import AgglomerativeClustering

# Single ("min") linkage, 3 clusters. Euclidean distance is the estimator's
# default, so the `affinity` keyword (deprecated in scikit-learn 1.2, removed
# in 1.4 in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 3, linkage = 'single')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Agglomerative Clustering With 3 Clusters (Average)
from sklearn.cluster import AgglomerativeClustering

# Average linkage, 3 clusters. Euclidean distance is the estimator's default,
# so the `affinity` keyword (deprecated in scikit-learn 1.2, removed in 1.4
# in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 3, linkage = 'average')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('BALANCE_FREQUENCY')
plt.ylabel('PURCHASES_FREQUENCY')
plt.legend()
plt.show()

In [None]:
# Validation using Silhouette Coefficient
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics

# Complete linkage, 3 clusters -- this cell follows the 3-cluster plots, so it
# scores n_clusters = 3 (it previously re-scored the 5-cluster model by
# copy-paste). `affinity` is dropped (removed in scikit-learn 1.4; euclidean
# is the default) and the redundant model.fit() is gone because fit_predict()
# already fits the model.
model = AgglomerativeClustering(n_clusters = 3, linkage = 'complete')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

In [None]:
# Single linkage, 3 clusters -- matches the 3-cluster plots above (it
# previously re-scored the 5-cluster model by copy-paste). `affinity` is
# dropped (removed in scikit-learn 1.4; euclidean is the default) and the
# redundant model.fit() is gone because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 3, linkage = 'single')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

In [None]:
# Average linkage, 3 clusters -- matches the 3-cluster plots above (it
# previously re-scored the 5-cluster model by copy-paste). `affinity` is
# dropped (removed in scikit-learn 1.4; euclidean is the default) and the
# redundant model.fit() is gone because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 3, linkage = 'average')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)

# Part 3 - Alternative Attribute Selection for Clustering

In [None]:
# Library and Data Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Re-read the CSV so Part 3 starts from a frame without the 'cluster' column
# that the K-Means cells added to `df`.
file_path = '/content/drive/MyDrive/INFO-614 Data Mining/HW4/card.csv'
df = pd.read_csv(filepath_or_buffer = file_path)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Dataframe
# Candidate feature columns, listed one group per line for readability.
data = df.loc[:, [
    'BALANCE', 'BALANCE_FREQUENCY',
    'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
    'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT', 'TENURE',
]]

In [None]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from `data`, then rescale those same rows.
scaler = MinMaxScaler()
scaler.fit(data)
data_scale = scaler.transform(data)

In [None]:
# Correlation Matrix
import seaborn as sns
plt.figure(figsize=(12,8))
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() raises
# on non-numeric columns instead of silently skipping them, so the raw frame
# would fail if the CSV carries any text/ID column (not verifiable from here).
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True,cmap='Reds', fmt='.2f')
# Render the figure explicitly rather than echoing the Axes repr.
plt.show()

In [None]:
# Feature Subset
# Narrow the working frame to the two columns clustered in Part 3.
data = df.loc[:, ['PURCHASES', 'BALANCE']]

In [None]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from `data`, then rescale those same rows.
scaler = MinMaxScaler()
scaler.fit(data)
data_scale = scaler.transform(data)
# `X` is the alias the agglomerative-clustering cell indexes into.
X = data_scale

In [None]:
# Agglomerative Clustering (Max)
from sklearn.cluster import AgglomerativeClustering

# Complete ("max") linkage, 5 clusters. Euclidean distance is the estimator's
# default, so the `affinity` keyword (deprecated in scikit-learn 1.2, removed
# in 1.4 in favour of `metric`) is omitted to run on old and new releases.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'complete')
y_hc = hc.fit_predict(X)

# One scatter series per cluster label; columns 0 and 1 of X are plotted on
# the x and y axes respectively.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color,
                label = 'Cluster ' + str(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('Purchase')
plt.ylabel('Balance')
plt.legend()
plt.show()

In [None]:
# Validation using Silhouette Coefficient
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics

# Complete linkage, 5 clusters. `affinity` is dropped (removed in
# scikit-learn 1.4; euclidean is the default), and the redundant model.fit()
# is gone because fit_predict() already fits the model.
model = AgglomerativeClustering(n_clusters = 5, linkage = 'complete')
cluster_labels = model.fit_predict(data_scale)
# Per-sample silhouette coefficients, kept for inspection.
sample_silhouette_values = silhouette_samples(data_scale, cluster_labels)

# Mean silhouette coefficient over all samples (cell output).
metrics.silhouette_score(data_scale, cluster_labels)