In [None]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

from matplotlib import pyplot as plt
import pytz

from sklearn_extra.cluster import KMedoids

from datetime import timedelta

from sklearn.preprocessing import MinMaxScaler

import networkx as nx
from scipy.sparse import csr_matrix, lil_matrix

from time import time



from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

from tqdm import tqdm

from scipy.spatial.distance import pdist, squareform


import random

In [None]:
!pip install scikit-learn-extra

Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downloading scikit_learn_extra-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.3.0


In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# under that path can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the CSV from the mounted Drive into a DataFrame.
# NOTE(review): the absolute path only resolves inside a Colab runtime with
# Drive mounted at /content/drive.
dataset = pd.read_csv('/content/drive/MyDrive/new_data_preprocessed.csv')

In [None]:
# Remove the Charge_Point_ID column before clustering.
# errors="ignore" keeps this cell idempotent: running it a second time (when
# the column is already gone) no longer raises KeyError.
dataset = dataset.drop(columns=["Charge_Point_ID"], errors="ignore")

In [None]:
def kmedoids_plus_plus(X, k, random_state=42):
    """Pick ``k`` distinct rows of ``X`` to use as initial medoids.

    Despite the name, no distance-weighted ("++") seeding is performed: the
    rows are drawn uniformly at random, without replacement.

    Parameters
    ----------
    X : np.ndarray
        Data array; candidates are its entries along axis 0.
    k : int
        Number of medoids to draw.
    random_state : int, optional
        Seed of the private random generator. The default (42) reproduces the
        selection obtained with the previously hard-coded global seed.

    Returns
    -------
    np.ndarray
        The ``k`` selected rows of ``X`` (integer-array indexing, so for an
        ndarray this is a copy rather than a view).

    Raises
    ------
    ValueError
        Propagated from ``choice`` when ``k`` exceeds ``X.shape[0]``.
    """
    n_samples = X.shape[0]
    # A private RandomState produces the same draw as seeding the global
    # generator, but does not reset numpy's global random stream for the
    # rest of the notebook.
    rng = np.random.RandomState(random_state)
    medoid_indices = rng.choice(n_samples, k, replace=False)
    return X[medoid_indices]

In [None]:
def kmedoids(X, initial_medoids, max_iters=300, tol=1e-4):
    """Alternating k-medoids clustering with Euclidean distance.

    Each iteration assigns every row of ``X`` to its nearest medoid, then
    replaces each medoid by the cluster member whose summed distance to the
    other members is smallest. Iteration stops once no medoid moves by
    ``tol`` or more, or after ``max_iters`` iterations.

    Parameters
    ----------
    X : np.ndarray
        2-D data array, one sample per row.
    initial_medoids : np.ndarray
        2-D array of starting medoids, one per row, with the same number of
        columns as ``X``. It is not modified.
    max_iters : int, optional
        Maximum number of assign/update iterations.
    tol : float, optional
        Convergence threshold on the Euclidean shift of each medoid.

    Returns
    -------
    medoids : np.ndarray
        Final medoids, one per row.
    cluster_assignments : np.ndarray
        Integer index of the nearest medoid for every row of ``X``.
    """
    k = initial_medoids.shape[0]
    # Work on a private copy: the update step writes rows in place and must
    # not overwrite the array owned by the caller.
    medoids = np.array(initial_medoids)
    prev_medoids = medoids.copy()

    for _ in range(max_iters):
        # Assign clusters: (n_samples, k) matrix of sample-to-medoid distances
        distances = np.linalg.norm(X[:, np.newaxis] - medoids, axis=2)
        cluster_assignments = np.argmin(distances, axis=1)

        # Update medoids
        for i in range(k):
            points_in_cluster = X[cluster_assignments == i]
            # An empty cluster keeps its previous medoid.
            if len(points_in_cluster) > 0:
                # The pairwise difference array is (m, m, n_features) for a
                # cluster of m points, so memory grows quadratically with m.
                pairwise = np.linalg.norm(
                    points_in_cluster[:, np.newaxis] - points_in_cluster, axis=2
                )
                medoid_idx = np.argmin(np.sum(pairwise, axis=1))
                medoids[i] = points_in_cluster[medoid_idx]

        # Check for convergence
        medoid_shifts = np.linalg.norm(medoids - prev_medoids, axis=1)
        if np.all(medoid_shifts < tol):
            break
        prev_medoids = medoids.copy()
    else:
        # Iteration budget exhausted (or max_iters == 0): the medoids may have
        # moved after the last assignment, so recompute the labels to keep
        # them consistent with the medoids that are returned.
        distances = np.linalg.norm(X[:, np.newaxis] - medoids, axis=2)
        cluster_assignments = np.argmin(distances, axis=1)

    return medoids, cluster_assignments

In [None]:
# Convert the first 30,000 rows of `dataset` into a NumPy array for clustering.
# NOTE(review): the 30,000-row cap is presumably there to bound the runtime and
# memory of the clustering/scoring below — confirm before changing it.
X = np.array(dataset[:30000])

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

def grid_search_kmedoids(X, k_range):
    """Fit PAM K-Medoids for each candidate ``k`` and compare silhouette scores.

    For every ``k`` the function fits ``KMedoids(method='pam',
    random_state=42)``, prints the size of each resulting cluster and the
    silhouette score, and finally plots score against ``k``.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to ``KMedoids.fit_predict`` and
        ``silhouette_score``.
    k_range : iterable of int
        Candidate numbers of clusters. It is materialised once, so one-shot
        iterables (e.g. generators) are supported.

    Returns
    -------
    list
        Silhouette scores, in the same order as ``k_range``.
    """
    # Materialise once: the values are needed for the loop and again for the
    # x-axis of the plot.
    k_values = list(k_range)
    silhouette_scores = []

    for k in k_values:
        kmedoid = KMedoids(n_clusters=k, method='pam', random_state=42)
        cluster_assignments = kmedoid.fit_predict(X)

        # Report the number of elements in each cluster, ordered by cluster id
        cluster_counts = Counter(cluster_assignments)
        for cluster_id, count in sorted(cluster_counts.items()):
            print(f"Cluster {cluster_id}: {count} elements")

        score = silhouette_score(X, cluster_assignments)
        silhouette_scores.append(score)
        print(f"Silhouette score for k={k}: {score}")

    plt.figure(figsize=(10, 6))
    plt.plot(k_values, silhouette_scores, marker='o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score for different values of k (K-Medoids)')
    plt.grid(True)
    plt.show()

    return silhouette_scores

# Candidate numbers of clusters: k = 2, 3, ..., 12
k_range = range(2, 13)

# X is the NumPy array built from `dataset` in an earlier cell;
# that cell must have been run before this one.

# Run the sweep: prints cluster sizes and scores per k, then plots score vs. k
silhouette_scores = grid_search_kmedoids(X, k_range)

Cluster 0: 20949 elements
Cluster 1: 9051 elements
Silhouette score for k=2: 0.3510269742339459
Cluster 0: 10378 elements
Cluster 1: 11460 elements
Cluster 2: 8162 elements
Silhouette score for k=3: 0.27266856179169885
Cluster 0: 7920 elements
Cluster 1: 9605 elements
Cluster 2: 9167 elements
Cluster 3: 3308 elements
Silhouette score for k=4: 0.29623948567305874
Cluster 0: 7797 elements
Cluster 1: 7149 elements
Cluster 2: 2924 elements
Cluster 3: 6240 elements
Cluster 4: 5890 elements
Silhouette score for k=5: 0.2007652861359552
Cluster 0: 4392 elements
Cluster 1: 7193 elements
Cluster 2: 5793 elements
Cluster 3: 2955 elements
Cluster 4: 5890 elements
Cluster 5: 3777 elements
Silhouette score for k=6: 0.21530760206457233
Cluster 0: 6429 elements
Cluster 1: 5792 elements
Cluster 2: 2927 elements
Cluster 3: 3283 elements
Cluster 4: 5695 elements
Cluster 5: 2725 elements
Cluster 6: 3149 elements
Silhouette score for k=7: 0.2373259128133865
Cluster 0: 944 elements
Cluster 1: 6251 elements
