# CustomKMeans Class

In [1]:
import numpy as np


class CustomKMeans:
    """K-means clustering using Euclidean distance.

    Parameters:
        k: Number of clusters.
        max_iter: Maximum number of assignment/update iterations.
        tol: Iterations stop once every centroid coordinate moves by less
            than this amount between two consecutive updates.
        random_state: Optional seed for the centroid initialisation. When
            None (the default) NumPy's global random state is used.

    Attributes set by ``fit``:
        centroids: Array of shape (k, n_features) with the cluster centres.
        labels: Array of shape (n_samples,) with the cluster index of each
            training point.
        dataframe: The training data as a 2-D float NumPy array.
    """

    def __init__(self, k, max_iter=300, tol=0.001, random_state=None):
        self.k = k # Number of clusters
        self.max_iter = max_iter # Maximum number of iterations for KMeans
        self.tol = tol # Tolerance level to stop iterations
        self.random_state = random_state # Seed for centroid initialisation
        self.centroids = None # Used to store centroids
        self.labels = None # Used to store the cluster assignments
        self.dataframe = None # Used to store the training data

    # Helper method that converts a DataFrame or array-like into a 2-D float array
    @staticmethod
    def _to_array(data):
        """Return ``data`` as a 2-D float NumPy array.

        Raises:
            ValueError: If the data is not two-dimensional.
        """
        values = data.to_numpy() if hasattr(data, "to_numpy") else data
        array = np.asarray(values, dtype=float)
        if array.ndim != 2:
            raise ValueError("Expected 2-D data of shape (n_samples, n_features)")
        return array

    # Fits the KMeans model to the data
    def fit(self, df):
        """Compute the centroids and training labels for ``df``.

        Parameters:
            df: pandas DataFrame or 2-D array-like of numeric features.

        Returns:
            The fitted model itself, so calls can be chained.

        Raises:
            ValueError: If ``k`` is not between 1 and the number of samples.
        """
        self.dataframe = self._to_array(df)
        n_samples = self.dataframe.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 1 and the number of samples ({n_samples}), got {self.k}"
            )

        # Randomly assign 'k' number of centroids from the data points
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.centroids = self.dataframe[rng.choice(n_samples, self.k, replace=False)]

        # Iterate for 'max_iter' number of times
        for _ in range(self.max_iter):
            # Assigns each data point to the nearest centroid
            self.labels = self._euclidean(self.dataframe)
            # Calculate new centroids based on the current assignment clusters
            new_centroids = self._calc_Centroids(self.dataframe)
            # Check if the centroids have moved less than the tolerance value
            if np.all(np.abs(new_centroids - self.centroids) < self.tol):
                break
            self.centroids = new_centroids
        else:
            # The iteration budget ran out right after a centroid update, so
            # refresh the labels to keep them consistent with the centroids.
            self.labels = self._euclidean(self.dataframe)

        return self

    # Predicts the cluster of each data point in a dataframe
    def predict(self, df):
        """Return the index of the nearest centroid for every row of ``df``.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.centroids is None:
            raise ValueError("Model has not been fitted yet")
        # Assigns the new points to the closest centroid
        return self._euclidean(self._to_array(df))

    # Method that returns the dissimilarity or Sum of Squared Errors for the clustering solution
    def sse(self):
        """Return the sum of squared distances of training points to their centroid.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.labels is None or self.centroids is None:
            raise ValueError("Model has not been fitted yet")
        # Pair each point with the centroid of its assigned cluster
        residuals = self.dataframe - self.centroids[self.labels]
        return np.sum(np.square(residuals))

    # Helper method to assign each data point to the nearest centroid based on Euclidean distance
    def _euclidean(self, df):
        """Return, for each row of the array ``df``, the index of the closest centroid."""
        # Broadcast to shape (n_samples, k, n_features) to get all pairwise differences
        differences = df[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        distances = np.linalg.norm(differences, axis=2)
        # Assigns each data point to the nearest centroid (ties go to the lowest index)
        return np.argmin(distances, axis=1)

    # Helper method to calculate the new centroids as the mean of points assigned to each cluster
    def _calc_Centroids(self, df):
        """Return the updated centroids for the current ``self.labels``.

        A cluster without any assigned point keeps its previous centroid
        instead of being moved to the origin.
        """
        # Start from a copy of the current centroids so empty clusters stay in place
        centroids = np.array(self.centroids, dtype=float)
        # Iterate over each cluster
        for j in range(self.k):
            # Accumulate all the points that are assigned to the cluster
            cluster_points = df[self.labels == j]
            # If there are points in the cluster, calculate their mean to create the new centroid
            if len(cluster_points) > 0:
                centroids[j] = cluster_points.mean(axis=0)
        return centroids


# Clustering using CustomKMeans Class

In [2]:
# Imports

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.cluster import KMeans

In [3]:
# Load the cleaned dataset that the clustering cells below sample from
df = pd.read_csv(filepath_or_buffer="../data/cleaned/cleaned_data.csv")

In [4]:
# 1000 random rows, keeping each price paired with its own product_id.
# Sampling the two columns separately would draw different rows for each
# column and pair prices with unrelated product ids.
sampled_data = (
    df[['price', 'product_id']]
    .sample(n=1000, replace=False)
    .reset_index(drop=True)
)

# Per-column views of the sampled dataframe
sampled_prices = sampled_data['price']
sampled_product_ids = sampled_data['product_id']

In [None]:
#### Custom KMeans (c_kmeans) implementation


def plot_clusters(sampled_data, kmeans):
    """Show a scatter plot of price against product id, coloured by cluster.

    The colours are read from the 'cluster' column of ``sampled_data``; the
    ``kmeans`` argument is accepted but not used by the plot.
    """
    plt.figure(figsize=(12, 6))

    prices = sampled_data['price']
    product_ids = sampled_data['product_id']
    cluster_ids = sampled_data['cluster']
    plt.scatter(prices, product_ids, c=cluster_ids, cmap='viridis', alpha=0.6)

    # Axis labels, title and colour legend
    plt.title('K-Means Clustering of Sampled Prices and Product IDs')
    plt.xlabel('Price')
    plt.ylabel('Product ID')
    plt.colorbar(label='Cluster')

    plt.show()



# Fit and plot the custom k-means for k = 1..9
for i in range(1, 10):
    # Cluster on the two feature columns only (x1 = price, x2 = product_id).
    # Selecting them explicitly keeps the 'cluster' column written by the
    # previous iteration from being used as a feature.
    features = sampled_data[['price', 'product_id']]

    kmeans = CustomKMeans(k=i)

    # Finds best centroids
    kmeans.fit(features)

    # Assign every sampled row to its nearest centroid
    sampled_data['cluster'] = kmeans.predict(features)

    plot_clusters(sampled_data, kmeans)

# Clustering using scikit-learn

In [None]:
# Compare the custom implementation with scikit-learn's, using k = 5

# Cluster on the two feature columns only (x1 = price, x2 = product_id) so an
# existing 'cluster' column is never treated as a feature by either model.
features = sampled_data[['price', 'product_id']]

# Run custom k-means with 5 clusters
kmeans = CustomKMeans(k=5)

# Finds best centroids
kmeans.fit(features)

# Assign every sampled row to its nearest centroid
sampled_data['cluster'] = kmeans.predict(features)

plot_clusters(sampled_data, kmeans)

print("Now doing sklearn version")
# Run scikit-learn k-means with 5 clusters on the same features
km = KMeans(n_clusters=5)
clusters = km.fit(features)
sklearn_centroids = km.cluster_centers_  # kept for inspection
# plot_clusters colours by the 'cluster' column, so store the sklearn labels
# there; otherwise the plot would repeat the custom model's assignment.
sampled_data['cluster'] = km.labels_
plot_clusters(sampled_data, km)

# Create a line plot showing the inertia (SSE) for each n_clusters value
inertia = []
cluster_counts = list(range(1, 15))

# Use only the feature columns so the 'cluster' label column is not clustered on
features = sampled_data[['price', 'product_id']]

for i in cluster_counts:
    km = KMeans(n_clusters=i)
    clusters = km.fit(features)
    inertia.append(km.inertia_)

# Plot against the actual cluster counts so the x-axis reads k = 1..14
plt.plot(cluster_counts, inertia)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (SSE)')
plt.show()
