# Clustering using K-Means


In [None]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
warnings.filterwarnings('ignore', category=FutureWarning)

# Load the two CSV datasets used by the cells below.
points_df = pd.read_csv("datasets/points.csv")

# header=None: the first row of seeds.csv is treated as data, so the columns
# get integer names.
# NOTE(review): if seeds.csv carries a class-label column, it should probably
# be dropped before clustering -- confirm against the dataset.
seeds_df = pd.read_csv("datasets/Grains/seeds.csv", header=None)

# Quick sanity check: dimensions first, then a preview of the first ten rows.
print(seeds_df.shape)
seeds_df.head(n=10)


### Plot the data before clustering

In [None]:
# Scatter of the raw points, before any cluster assignment.
# xs / ys are kept as module-level names because the later plotting cells
# reuse them.
xs, ys = points_df['X'], points_df['Y']

plt.scatter(x=xs, y=ys)
plt.show()

### Build model

In [None]:
# Cluster the points into four groups.
# random_state pins the centroid initialisation so the labels (and every plot
# or table derived from them) are the same on each run; n_init is given
# explicitly because its default differs between scikit-learn releases.
model = KMeans(n_clusters=4, n_init=10, random_state=42)

# fit_predict fits the model and returns the cluster index of every sample,
# equivalent to fit(...) followed by predict(...) on the same data.
labels = model.fit_predict(points_df)
print(labels)


### Plot the data after clustering

In [None]:
# Same scatter as before, now coloured by cluster label and drawn
# semi-transparent.
cluster_style = {'c': labels, 'alpha': 0.5}
plt.scatter(xs, ys, **cluster_style)
plt.show()

### Plot the centroids

In [None]:
# Overlay the fitted cluster centres on the colour-coded scatter.
# NOTE: `model` must be the KMeans fitted on points_df; the inertia loop
# further down rebinds `model`, so re-run the model cell first if the cells
# are executed out of order.
centroids = model.cluster_centers_

# First and second coordinate of every centre.
c_x, c_y = centroids[:, 0], centroids[:, 1]

plt.scatter(xs, ys, alpha=0.5, c=labels)
plt.scatter(c_x, c_y, c='r', s=50, marker='D')  # red diamonds mark the centres
plt.show()

### Scaling >> Pipeline >> Cross Tabulation

In [None]:
# Standardise the features, then cluster them, as one pipeline.

# Create scaler: scaler
scaler = StandardScaler()

# Create KMeans instance: kmeans
# random_state makes the clustering repeatable between runs; n_init is given
# explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Create pipeline: pipeline
pipeline = make_pipeline(scaler, kmeans)

# Fit scaler + KMeans and get the cluster index of every sample.
labels2 = pipeline.fit_predict(points_df)

# Compare the scaled clustering (rows) with the unscaled one (columns).
# Cluster ids are arbitrary, so matching clusters need not share a number:
# agreement shows up as one dominant cell per row/column, not as a diagonal.
ct = pd.crosstab(labels2, labels, rownames=['scaled'], colnames=['unscaled'])
print(ct)


## Evaluate a clustering using `inertia` <br>
(Sum of squared distances between each point and the centroid of its cluster)<br><br>
**Best K is the K at which there is an elbow in the plot**<br><br>
Best k in this example is 3

In [None]:
# Elbow plot: fit KMeans for k = 1..5 on the seeds data and record the
# inertia of each fit.
ks = range(1, 6)
inertias = []

for k in ks:
    # Create a KMeans instance with k clusters: model
    # A fixed random_state and explicit n_init keep the curve identical from
    # run to run, so the position of the elbow does not depend on a lucky
    # initialisation.
    # NOTE: this rebinds the module-level name `model` on every iteration.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit model to samples
    model.fit(seeds_df)

    # Append the inertia (sum of squared distances of the samples to their
    # closest cluster centre) to the list of inertias
    inertias.append(model.inertia_)

# Plot ks vs inertias
plt.plot(ks, inertias, "-o")
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
