In [None]:
import pandas as pd
import numpy as np

# Load the dataset from 'cc.csv' (path is relative to the working directory)
# and preview its first rows.
df = pd.read_csv('cc.csv')
df.head()

In [None]:
# Keep every column from position 2 onward, i.e. drop the first two columns
# (presumably identifier columns — verify against cc.csv).
X = df.iloc[:,2:]

In [None]:
# Preview the first rows of the column subset selected above.
X.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Columns used as clustering features; listed once so the scaler input and the
# rebuilt DataFrame cannot drift apart.
feature_columns = [
    'Avg_Credit_Limit',
    'Total_Credit_Cards',
    'Total_visits_bank',
    'Total_visits_online',
    'Total_calls_made',
]

# Min-max scale the features and wrap the resulting array back into a
# DataFrame that carries the original column names.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[feature_columns])
scaled_df = pd.DataFrame(scaled_data, columns=feature_columns)

In [None]:
from sklearn.cluster import AgglomerativeClustering 
# Agglomerative clustering of the scaled features into 3 clusters, using
# Euclidean distance with average linkage.
# scikit-learn's default linkage is 'ward'; to compare other criteria, change
# the `linkage` argument below to 'complete' or 'single'.
clustering_average = AgglomerativeClustering(
    n_clusters=3,
    metric="euclidean",
    linkage='average',
)
y_pred = clustering_average.fit_predict(scaled_df)

In [None]:
# Cluster label assigned to each row by the fitted scikit-learn model.
clustering_average.labels_

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Build the average-linkage merge tree on the scaled features first,
# then draw it on a wide figure and label the axes.
link_matrix = linkage(scaled_df, method='average')

plt.figure(figsize=(30, 6))
dendrogram(link_matrix)
plt.title('Dendrogram')
plt.xlabel('data points')
plt.ylabel('Euclidean distance')

plt.show()

## Without using scikit-learn (from-scratch implementation)

In [None]:
class Agglomerative_scratch:
    """Bottom-up (agglomerative) hierarchical clustering written with NumPy only.

    The class holds no state. It provides four linkage criteria (single,
    complete, average, centroid), each taking two clusters given as sequences
    of points, and a driver that repeatedly merges the closest pair of
    clusters according to whichever linkage callable it is given.
    """

    @staticmethod
    def custom_distance(point1, point2):
        """Return the Euclidean distance between two points.

        Parameters
        ----------
        point1, point2 : array-like
            Coordinates of the two points; they must have the same shape.
        """
        # np.asarray lets plain lists/tuples work as well as NumPy arrays.
        return np.sqrt(np.sum((np.asarray(point1) - np.asarray(point2)) ** 2))

    def single_linkage(self, cluster1, cluster2):
        """Distance between the two nearest points of the two clusters.

        Returns ``float('inf')`` if either cluster is empty.
        """
        min_distance = float('inf')
        for point1 in cluster1:
            for point2 in cluster2:
                distance = self.custom_distance(point1, point2)
                if distance < min_distance:
                    min_distance = distance
        return min_distance

    def complete_linkage(self, cluster1, cluster2):
        """Distance between the two farthest points of the two clusters.

        Returns ``-1`` if either cluster is empty.
        """
        # Distances are never negative, so -1 is a safe "nothing seen yet" value.
        max_distance = -1
        for point1 in cluster1:
            for point2 in cluster2:
                distance = self.custom_distance(point1, point2)
                if distance > max_distance:
                    max_distance = distance
        return max_distance

    def average_linkage(self, cluster1, cluster2):
        """Mean distance over every (point in cluster1, point in cluster2) pair.

        Raises
        ------
        ZeroDivisionError
            If either cluster is empty (there are no pairs to average).
        """
        total_distance = 0
        count = 0
        for point1 in cluster1:
            for point2 in cluster2:
                total_distance += self.custom_distance(point1, point2)
                count += 1
        return total_distance / count

    def centroid_linkage(self, cluster1, cluster2):
        """Distance between the centroids (per-coordinate means) of the clusters."""
        centroid1 = np.mean(cluster1, axis=0)
        centroid2 = np.mean(cluster2, axis=0)
        return self.custom_distance(centroid1, centroid2)

    def agglomerative_hierarchical_clustering(self, data, linkage, n_clusters=5):
        """Cluster ``data`` by repeatedly merging the two closest clusters.

        Every point starts as its own cluster. While more than ``n_clusters``
        clusters remain, the pair with the smallest ``linkage`` distance is
        merged (ties go to the first pair found). Each merge step re-evaluates
        ``linkage`` for every pair of current clusters, so this is only
        practical for small inputs.

        Parameters
        ----------
        data : sequence of points
            Iterated once, one point per item (e.g. the rows of a 2-D array).
        linkage : callable
            ``linkage(cluster1, cluster2) -> distance`` where each cluster is
            a list of points, e.g. ``self.single_linkage``.
        n_clusters : int, default 5
            Number of clusters at which merging stops. If ``data`` has no more
            than ``n_clusters`` points, no merging happens.

        Returns
        -------
        list of int
            One label per input point, in input order. Labels are the
            positions of the final clusters, ``0 .. k-1``.

        Raises
        ------
        ValueError
            If ``n_clusters`` is less than 1, or if no pair of clusters has a
            finite linkage distance (e.g. the data contains NaN), which would
            otherwise make the merge loop run forever.
        """
        if n_clusters < 1:
            raise ValueError("n_clusters must be at least 1")

        # Track every cluster twice, in parallel: the points themselves (what
        # the linkage callable consumes) and the input positions of those
        # points. Labelling by position keeps duplicate rows distinct, which a
        # lookup keyed on the point's coordinates cannot do.
        clusters = [[point] for point in data]
        n_points = len(clusters)
        members = [[index] for index in range(n_points)]

        while len(clusters) > n_clusters:
            min_distance = float('inf')
            merge_indices = None

            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    distance = linkage(clusters[i], clusters[j])
                    if distance < min_distance:
                        min_distance = distance
                        merge_indices = (i, j)

            if merge_indices is None:
                # Nothing compared below infinity (NaN or infinite distances):
                # no merge is possible and the loop could never terminate.
                raise ValueError(
                    "no finite linkage distance between any pair of clusters; "
                    "check the data for NaN or infinite values"
                )

            i, j = merge_indices
            # j > i, so deleting position j leaves position i untouched.
            clusters[i].extend(clusters[j])
            members[i].extend(members[j])
            del clusters[j]
            del members[j]

        # Turn cluster membership back into one label per input position.
        cluster_labels = [0] * n_points
        for label, indices in enumerate(members):
            for index in indices:
                cluster_labels[index] = label

        return cluster_labels
    

In [None]:
# Instance of the from-scratch implementation; it is used below for its
# linkage methods and its clustering routine.
agg = Agglomerative_scratch()

In [None]:
# NumPy view of the scaled features, one row per data point.
data_array = scaled_df.to_numpy()

In [None]:
# Linkage criterion handed to the from-scratch clustering: single linkage
# (distance between the nearest points of two clusters).
linkage_function = agg.single_linkage

In [None]:
# Restrict the from-scratch run to the first 50 rows (presumably to keep its
# runtime manageable — confirm). The subset is taken directly from `scaled_df`
# rather than from `data_array` itself, so this cell gives the same result no
# matter how often or in which order it is executed.
data_array = scaled_df.to_numpy()[:50, :]

In [None]:
# Run the from-scratch clustering with the chosen linkage; the result is a
# list with one cluster label per row of `data_array`.
custom_labels = agg.agglomerative_hierarchical_clustering(data_array, linkage_function)

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette of the scikit-learn model, evaluated on the full scaled dataset.
silhouette_score_sklearn = silhouette_score(scaled_df, clustering_average.labels_)

# Score the from-scratch labels against exactly the rows they were computed
# from (`data_array`) instead of re-slicing `scaled_df` with a second
# hard-coded row count that could silently fall out of sync.
silhouette_score_custom = silhouette_score(data_array, custom_labels)

# NOTE: the two scores are not a like-for-like comparison — the scikit-learn
# model uses average linkage on all rows, while the from-scratch run uses
# single linkage on the 50-row subset.
print(f"Silhouette Score (scikit-learn): {silhouette_score_sklearn}")
print(f"Silhouette Score (custom): {silhouette_score_custom}")