# Clustering with a Gaussian Mixture Model

In [None]:
import pandas as pd

First, we generate a synthetic dataset.

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# 300 samples scattered around 4 centres (std 0.60); random_state pins the draw.
# x holds the coordinates, y the index of the centre each sample came from.
x, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

In [None]:
# Peek at the first ten rows of the feature array.
x[:10]

In [None]:
# The generating-blob label of every sample (not used for fitting below).
y

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Ten-colour palette; position k is the colour used for cluster label k.
ourcolors = [
    'red',
    'blue',
    'black',
    'green',
    'yellow',
    'magenta',
    'orange',
    'brown',
    'grey',
    'aqua',
]

In [None]:
# Scatter the two feature columns, colouring each point by its label in y
# (label k -> ourcolors[k]).
plt.scatter(x[:,0],
            x[:,1],
            color=[ourcolors[i] for i in y])
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Same scatter without label colours: this is all a clustering algorithm
# gets to see, since only x is passed to fit().
plt.scatter(x[:,0],
            x[:,1])
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# from sklearn.cluster import KMeans
# from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

We create a model object by calling `GaussianMixture`, specifying `n_components` (the number of Gaussian distributions to fit).

In [None]:
# Alternative models, kept for comparison:
# kmeans = KMeans(n_clusters=4, n_init=10)
# dbscan = DBSCAN(eps=0.1, min_samples=5)
# Four components to match the four generated blobs; n_init=10 requests ten
# initialisations, and random_state makes the result repeatable.
gm = GaussianMixture(n_components=4, n_init=10, random_state=42)

We then call the `fit` method and pass in the data in which we want to search for clusters.

In [None]:
# kmeans.fit(x)
# dbscan.fit(x)
# Fit the mixture on the features only; the labels y are never passed in.
gm.fit(x)

In [None]:
# Index with a list ([0]) rather than a scalar so the result stays
# two-dimensional (one row), the same layout as the x passed to fit().
x[[0]]

In [None]:
# kmeans.predict(x[[0]])
# dbscan.predict(x[[0]])
# Component assignment for the first sample only.
gm.predict(x[[0]])

In [None]:
# The call below is expected to fail - why?
# It reads `gm.labels_`, the attribute the KMeans/DBSCAN variants in the
# comments relied on; the fitted GaussianMixture does not provide it.
plt.scatter(x[:,0],
            x[:,1],
            # color=[ourcolors[i] for i in kmeans.labels_])
            # color=[ourcolors[i] for i in dbscan.labels_])
            color=[ourcolors[i] for i in gm.labels_])

In [None]:
# kmeans.labels_
# dbscan.labels_
# gm.labels_

# Why the error: a Gaussian mixture learns distributions rather than storing
# per-sample labels, so there is no `labels_` attribute after fit().
# Ask the fitted model for assignments explicitly with predict().
gm.predict(x)

In [None]:
# Working version of the failed plot: colour each point by the component
# returned from gm.predict(x).
plt.scatter(x[:,0],
            x[:,1],
            color=[ourcolors[i] for i in gm.predict(x)])

In [None]:
import ipywidgets

In [None]:
# def plotblobs(n):
    # kmeans = KMeans(n_clusters=n, n_init=10)
    # kmeans.fit(x)
    # plt.scatter(x[:,0], x[:,1], color=[ourcolors[i] for i in kmeans.labels_])

# Note: we need to remember that "-1" is used for unclustered points
# so this color mapping is a little sloppy if number of clusters > 9
# def plotblobs(m,n):
#     dbscan = DBSCAN(eps=m, min_samples=n)
#     dbscan.fit(x)
#     plt.scatter(x[:,0], x[:,1], color=[ourcolors[i % 10] for i in dbscan.labels_])

def plotblobs(n):
    """Fit an ``n``-component Gaussian mixture to the global ``x`` and plot it.

    Each point is drawn in the palette colour of its predicted component
    (component index modulo 10).
    """
    mixture = GaussianMixture(n_components=n, n_init=10, random_state=42).fit(x)
    assignments = mixture.predict(x)
    point_colors = [ourcolors[k % 10] for k in assignments]
    plt.scatter(x[:,0], x[:,1], color=point_colors)

# Integer slider for n in 1..10; the model is re-fitted on every change.
# The trailing semicolon keeps the notebook from echoing the return value.
ipywidgets.interact(plotblobs,n=(1,10));

# Let's compare all of them on a different dataset

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

from sklearn.datasets import make_moons

In [None]:
# Replace x and y with a two-moons dataset: 1000 samples, noise level 0.1.
# The clusters here are not blob-shaped, which is the point of the comparison.
x, y = make_moons(n_samples=1000, noise=0.1, random_state=42)

In [None]:
# Reference plot: colour each point by its label in y.
plt.scatter(x[:,0], x[:,1], color=[ourcolors[i % 10] for i in y])

In [None]:
def plot_kmeans(n, m=0):
    """Cluster the global ``x`` with k-means and scatter-plot the result.

    Args:
        n: Number of clusters (``KMeans(n_clusters=n)``).
        m: Unused. Accepted only so every ``plot_*`` helper can be invoked
            uniformly as ``method(n, m)``.
    """
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=0)
    kmeans.fit(x)
    # Wrap around the palette, as plot_dbscan and plot_gm do, so that a
    # label beyond the last colour cannot raise IndexError.
    palette_size = len(ourcolors)
    plt.scatter(x[:,0], x[:,1],
                color=[ourcolors[i % palette_size] for i in kmeans.labels_])

# Note: DBSCAN labels unclustered (noise) points as -1, and `-1 % 10` wraps to
# the last palette colour, so the colour mapping is ambiguous once there are
# more than 9 clusters. Noise points are also over-plotted with red crosses.
def plot_dbscan(n, m):
    """Cluster the global ``x`` with DBSCAN and scatter-plot the result.

    Points are coloured by cluster label (modulo the palette size); points
    DBSCAN left unclustered (label -1) are additionally marked with a red
    cross on top of their coloured dot.

    Args:
        n: ``min_samples`` passed to DBSCAN.
        m: ``eps`` (neighbourhood radius) passed to DBSCAN.
    """
    dbscan = DBSCAN(eps=m, min_samples=n)
    dbscan.fit(x)
    labels = dbscan.labels_
    palette_size = len(ourcolors)
    plt.scatter(x[:,0], x[:,1],
                color=[ourcolors[i % palette_size] for i in labels])
    # Mark all noise points with a single scatter call instead of one call
    # (and one matplotlib artist) per point, which made redraws very slow.
    noise = labels == -1
    if noise.any():
        plt.scatter(x[noise,0], x[noise,1], marker='x', color='red')

def plot_gm(n, m=0):
    """Fit an ``n``-component Gaussian mixture to the global ``x`` and plot it.

    ``m`` is ignored; it exists so the helper shares the ``(n, m)`` call
    shape used by the other plotting functions.
    """
    mixture = GaussianMixture(n_components=n, n_init=10, random_state=42).fit(x)
    assignments = mixture.predict(x)
    point_colors = [ourcolors[k % 10] for k in assignments]
    plt.scatter(x[:,0], x[:,1], color=point_colors)

def makeplot(method=plot_kmeans, n=1, m=0):
    """Forward the widget values to the selected plotting helper.

    ``method`` is called as ``method(n, m)``; nothing is returned.
    """
    method(n, m)
    
# Interactive comparison of the three algorithms on the current x.
# The dropdown options are (label, value) pairs so the menu shows readable
# names rather than the repr of each function object.
# n: clusters / components / min_samples, depending on the method.
# m: eps for DBSCAN (ignored by the other two).
ipywidgets.interact(makeplot,
                    method=[('k-means', plot_kmeans),
                            ('DBSCAN', plot_dbscan),
                            ('Gaussian mixture', plot_gm)],
                    n=(1,10),
                    m=(0.1,1.0));

# Book's data example:

In [None]:
import numpy as np

In [None]:
# Two blobs centred at (4, -4) and (0, 0), then pushed through a linear
# transformation so they are no longer round.
X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)
X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))
# A third, smaller blob shifted by (6, -8).
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]
x = np.r_[X1, X2]
# y2 is all zeros (a single centre), which would collide with label 0 of the
# first pair; offset it so each of the three blobs has its own label.
y = np.r_[y1, y2 + y1.max() + 1]

In [None]:
# Reference plot: colour each point by its label in y.
plt.scatter(x[:,0], x[:,1], color=[ourcolors[i % 10] for i in y])

In [None]:
def plot_kmeans(n, m=0):
    """Cluster the global ``x`` with k-means and scatter-plot the result.

    Args:
        n: Number of clusters (``KMeans(n_clusters=n)``).
        m: Unused. Accepted only so every ``plot_*`` helper can be invoked
            uniformly as ``method(n, m)``.
    """
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=0)
    kmeans.fit(x)
    # Wrap around the palette, as plot_dbscan and plot_gm do, so that a
    # label beyond the last colour cannot raise IndexError.
    palette_size = len(ourcolors)
    plt.scatter(x[:,0], x[:,1],
                color=[ourcolors[i % palette_size] for i in kmeans.labels_])

# Note: DBSCAN labels unclustered (noise) points as -1, and `-1 % 10` wraps to
# the last palette colour, so the colour mapping is ambiguous once there are
# more than 9 clusters. Noise points are also over-plotted with red crosses.
def plot_dbscan(n, m):
    """Cluster the global ``x`` with DBSCAN and scatter-plot the result.

    Points are coloured by cluster label (modulo the palette size); points
    DBSCAN left unclustered (label -1) are additionally marked with a red
    cross on top of their coloured dot.

    Args:
        n: ``min_samples`` passed to DBSCAN.
        m: ``eps`` (neighbourhood radius) passed to DBSCAN.
    """
    dbscan = DBSCAN(eps=m, min_samples=n)
    dbscan.fit(x)
    labels = dbscan.labels_
    palette_size = len(ourcolors)
    plt.scatter(x[:,0], x[:,1],
                color=[ourcolors[i % palette_size] for i in labels])
    # Mark all noise points with a single scatter call instead of one call
    # (and one matplotlib artist) per point, which made redraws very slow.
    noise = labels == -1
    if noise.any():
        plt.scatter(x[noise,0], x[noise,1], marker='x', color='red')

def plot_gm(n, m=0):
    """Fit an ``n``-component Gaussian mixture to the global ``x`` and plot it.

    ``m`` is ignored; it exists so the helper shares the ``(n, m)`` call
    shape used by the other plotting functions.
    """
    mixture = GaussianMixture(n_components=n, n_init=10, random_state=42).fit(x)
    assignments = mixture.predict(x)
    point_colors = [ourcolors[k % 10] for k in assignments]
    plt.scatter(x[:,0], x[:,1], color=point_colors)

def makeplot(method=plot_kmeans, n=1, m=0):
    """Forward the widget values to the selected plotting helper.

    ``method`` is called as ``method(n, m)``; nothing is returned.
    """
    method(n, m)
    
# Interactive comparison of the three algorithms on the current x.
# The dropdown options are (label, value) pairs so the menu shows readable
# names rather than the repr of each function object.
# n: clusters / components / min_samples, depending on the method.
# m: eps for DBSCAN (ignored by the other two).
ipywidgets.interact(makeplot,
                    method=[('k-means', plot_kmeans),
                            ('DBSCAN', plot_dbscan),
                            ('Gaussian mixture', plot_gm)],
                    n=(1,10),
                    m=(0.1,1.0));