In [22]:
import numpy as np 
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

In [23]:
%matplotlib notebook

In [24]:
from sklearn.datasets import make_classification
Xc_2,_= make_classification(n_samples=200,
                                    n_features=2,
                                    n_informative=2,
                                    n_redundant=0,
                                    random_state=0,
                                    n_clusters_per_class=1,
                                    class_sep = 0.8)
plt.figure()
plt.scatter  (Xc_2[:,0], Xc_2[:,1])

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1b349bac3d0>

In [25]:
def initialize_centroids(X, k):
    indices = np.random.choice(X.shape[0], k, replace=False)
    return X[indices]

In [26]:
def assign_clusters(X, centroids):
    distances = cdist(X, centroids, 'euclidean')
    return np.argmin(distances, axis=1)

In [27]:
def compute_centroids(X, clusters, k):
    centroids = np.zeros((k, X.shape[1]))
    for i in range(k):
        centroids[i] = np.mean(X[clusters == i], axis=0)
    return centroids

In [28]:
def compute_cost(X, clusters, centroids):
    cost = 0
    for i in range(len(X)):
        cost += np.sum((X[i] - centroids[clusters[i]])**2)
    return cost

In [29]:
def k_means(X, k, num_iterations=100):
    best_cost = np.inf
    best_centroids = None
    best_clusters = None
    
    for i in range(num_iterations):
        centroids = initialize_centroids(X, k)
        for i in range(300):
            clusters = assign_clusters(X, centroids)
            new_centroids = compute_centroids(X, clusters, k)
            if np.all(centroids == new_centroids):
                break
            centroids = new_centroids
        
        cost = compute_cost(X, clusters, centroids)
        if cost < best_cost:
            best_cost = cost
            best_centroids = centroids
            best_clusters = clusters

    return best_clusters, best_centroids


In [30]:
k = 4
clusters, centroids = k_means(Xc_2, k)

plt.figure(figsize=(10, 6))
plt.scatter(Xc_2[:, 0], Xc_2[:, 1], c=clusters, cmap='viridis', marker='o')
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x', s=100)
plt.title("K-means Clustering with k = 4")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

<IPython.core.display.Javascript object>