<img src="UnSupervised.svg" />

# Finding the nearest neighbors

In [1]:
# Imports for this cell. `np` is used throughout the notebook, so it has to be
# imported here, in the first executed cell, for a fresh-kernel run to work.
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Reference observations: 10 rows x 4 columns, one observation per row.
data = np.array([
  [5.1, 3.5, 1.4, 0.2],
  [4.9, 3. , 1.4, 0.2],
  [4.7, 3.2, 1.3, 0.2],
  [4.6, 3.1, 1.5, 0.2],
  [5. , 3.6, 1.4, 0.2],
  [5.4, 3.9, 1.7, 0.4],
  [4.6, 3.4, 1.4, 0.3],
  [5. , 3.4, 1.5, 0.2],
  [4.4, 2.9, 1.4, 0.2],
  [4.9, 3.1, 1.5, 0.1]])

# No n_neighbors is given, so the estimator's default k is used.
nbrs = NearestNeighbors()
nbrs.fit(data)

# A single query observation (still 2-D: one row, four columns).
new_obs = np.array([[5. , 3.5, 1.6, 0.3]])

# By default kneighbors returns a (distances, indexes) pair; the indexes
# refer to rows of `data`.
dists, knbrs = nbrs.kneighbors(new_obs)

# nearest neighbors indexes
print('{}\n'.format(repr(knbrs)))
# nearest neighbor distances
print('{}\n'.format(repr(dists)))

# With return_distance=False only the index array is returned.
only_nbrs = nbrs.kneighbors(new_obs,
                            return_distance=False)
print('{}\n'.format(repr(only_nbrs)))

array([[7, 4, 0, 6, 9]])

array([[0.17320508, 0.24494897, 0.24494897, 0.45825757, 0.46904158]])

array([[7, 4, 0, 6, 9]])



The default value for k when initializing the NearestNeighbors object is 5. We can specify a new value using the n_neighbors keyword argument.

In [5]:
from sklearn.neighbors import NearestNeighbors

# Reference observations: 10 rows x 4 columns, one observation per row.
data = np.array([
  [5.1, 3.5, 1.4, 0.2],
  [4.9, 3.0, 1.4, 0.2],
  [4.7, 3.2, 1.3, 0.2],
  [4.6, 3.1, 1.5, 0.2],
  [5.0, 3.6, 1.4, 0.2],
  [5.4, 3.9, 1.7, 0.4],
  [4.6, 3.4, 1.4, 0.3],
  [5.0, 3.4, 1.5, 0.2],
  [4.4, 2.9, 1.4, 0.2],
  [4.9, 3.1, 1.5, 0.1]])

# Two query observations this time, so each result array has two rows.
new_obs = np.array([
  [5.0, 3.5, 1.6, 0.3],
  [4.8, 3.2, 1.5, 0.1]])

# Ask for k = 2 neighbors; fit() returns the estimator, so the calls chain.
nbrs = NearestNeighbors(n_neighbors=2).fit(data)
dists, knbrs = nbrs.kneighbors(new_obs)

# Print the nearest neighbor indexes first, then the matching distances.
for result in (knbrs, dists):
    print('{}\n'.format(repr(result)))

array([[7, 0],
       [9, 2]])

array([[0.17320508, 0.24494897],
       [0.14142136, 0.24494897]])



# Cluster Means / Centroid

In [6]:
# Four 2-D points forming one cluster (one point per row).
cluster = np.array([
  [1.2, 0.6],
  [2.4, 0.8],
  [-1.6, 1.4],
  [0.0, 1.2]])

# The centroid is the column-wise mean: axis=0 averages over the rows,
# leaving one value per column.
centroid = np.mean(cluster, axis=0)

print('Cluster:\n{}\n'.format(repr(cluster)))
print('Centroid:\n{}\n'.format(repr(centroid)))

Cluster:
array([[ 1.2,  0.6],
       [ 2.4,  0.8],
       [-1.6,  1.4],
       [ 0. ,  1.2]])

Centroid:
array([0.5, 1. ])



# K-means clustering

In [48]:
from sklearn.cluster import KMeans

# Observations to cluster: 10 rows x 4 columns, one observation per row.
data = np.array([
  [5.1, 3.5, 1.4, 0.2],
  [4.9, 3. , 1.4, 0.2],
  [4.7, 3.2, 1.3, 0.2],
  [4.6, 3.1, 1.5, 0.2],
  [5. , 3.6, 1.4, 0.2],
  [5.4, 3.9, 1.7, 0.4],
  [4.6, 3.4, 1.4, 0.3],
  [5. , 3.4, 1.5, 0.2],
  [4.4, 2.9, 1.4, 0.2],
  [4.9, 3.1, 1.5, 0.1]])

# K-means starts from randomly chosen centroids, so without a fixed seed the
# labels and centroids can change from run to run. random_state makes the cell
# reproducible. n_init is spelled out because its default differs between
# scikit-learn releases.
# Cluster ids are arbitrary: the same grouping may be numbered differently
# from one seed (or one run) to another.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
# predefined data
kmeans.fit(data)

# cluster assignments
print('{}\n'.format(repr(kmeans.labels_)))

# centroids
print('{}\n'.format(repr(kmeans.cluster_centers_)))

# Two unseen observations to assign to the fitted clusters.
new_obs = np.array([
  [5.1, 3.2, 1.7, 1.9],
  [6.9, 3.2, 5.3, 2.2]])
# predict clusters
print('{}\n'.format(repr(kmeans.predict(new_obs))))

array([2, 1, 1, 1, 2, 0, 1, 2, 1, 1], dtype=int32)

array([[5.4       , 3.9       , 1.7       , 0.4       ],
       [4.68333333, 3.11666667, 1.41666667, 0.2       ],
       [5.03333333, 3.5       , 1.43333333, 0.2       ]])

array([0, 0], dtype=int32)



# Mini-Batch K-Means Clustering

In [55]:
from sklearn.cluster import MiniBatchKMeans

# Observations to cluster: 10 rows x 4 columns, one observation per row.
data = np.array([
  [5.1, 3.5, 1.4, 0.2],
  [4.9, 3. , 1.4, 0.2],
  [4.7, 3.2, 1.3, 0.2],
  [4.6, 3.1, 1.5, 0.2],
  [5. , 3.6, 1.4, 0.2],
  [5.4, 3.9, 1.7, 0.4],
  [4.6, 3.4, 1.4, 0.3],
  [5. , 3.4, 1.5, 0.2],
  [4.4, 2.9, 1.4, 0.2],
  [4.9, 3.1, 1.5, 0.1]])

# Mini-batch K-means is randomized (initial centroids and batch sampling), so
# a fixed random_state is needed for the cell to give the same result on every
# run. n_init is spelled out because its default differs between scikit-learn
# releases.
# Cluster ids are arbitrary: the same grouping may be numbered differently
# from one seed (or one run) to another.
kmeans = MiniBatchKMeans(n_clusters=3, batch_size=10, n_init=3,
                         random_state=0)
# predefined data
kmeans.fit(data)

# cluster assignments
print('{}\n'.format(repr(kmeans.labels_)))

# centroids
print('{}\n'.format(repr(kmeans.cluster_centers_)))

# Two unseen observations to assign to the fitted clusters.
new_obs = np.array([
  [5.1, 3.2, 1.7, 1.9],
  [6.9, 3.2, 5.3, 2.2]])
# predict clusters
print('{}\n'.format(repr(kmeans.predict(new_obs))))

array([1, 2, 0, 0, 1, 1, 0, 1, 0, 2], dtype=int32)

array([[4.58493151, 3.17534247, 1.40410959, 0.23013699],
       [5.1037037 , 3.59135802, 1.47283951, 0.23703704],
       [4.9       , 3.04444444, 1.44444444, 0.15555556]])

array([1, 1], dtype=int32)



# Agglomerative clustering

In [52]:
from sklearn.cluster import AgglomerativeClustering

# Observations to cluster: 10 rows x 4 columns, one observation per row.
data = np.array([
  [5.1, 3.5, 1.4, 0.2],
  [4.9, 3.0, 1.4, 0.2],
  [4.7, 3.2, 1.3, 0.2],
  [4.6, 3.1, 1.5, 0.2],
  [5.0, 3.6, 1.4, 0.2],
  [5.4, 3.9, 1.7, 0.4],
  [4.6, 3.4, 1.4, 0.3],
  [5.0, 3.4, 1.5, 0.2],
  [4.4, 2.9, 1.4, 0.2],
  [4.9, 3.1, 1.5, 0.1]])

# Request three clusters; fit() returns the fitted estimator, so construction
# and fitting are chained.
agg = AgglomerativeClustering(n_clusters=3).fit(data)

# cluster assignments (one label per row of `data`)
print('{}\n'.format(repr(agg.labels_)))

array([1, 0, 0, 0, 1, 2, 0, 1, 0, 0])



Since agglomerative clustering doesn't make use of centroids, there's no `cluster_centers_` attribute in the `AgglomerativeClustering` object. There's also no `predict` function for assigning new data to clusters: K-means can predict because it assigns each new observation to the nearest of its final centroids, and agglomerative clustering has no centroids to compare against.

# Mean Shift Clustering
Unlike K-means, mean shift clustering does not need the number of clusters to be specified in advance; it determines the number of clusters from the data.

In [31]:
from sklearn.cluster import MeanShift

# Observations to cluster: 10 rows x 4 columns, one observation per row.
data = np.array([
  [5.1, 3.5, 1.4, 0.2],
  [4.9, 3.0, 1.4, 0.2],
  [4.7, 3.2, 1.3, 0.2],
  [4.6, 3.1, 1.5, 0.2],
  [5.0, 3.6, 1.4, 0.2],
  [5.4, 3.9, 1.7, 0.4],
  [4.6, 3.4, 1.4, 0.3],
  [5.0, 3.4, 1.5, 0.2],
  [4.4, 2.9, 1.4, 0.2],
  [4.9, 3.1, 1.5, 0.1]])

# Two unseen observations to assign to the fitted clusters.
new_obs = np.array([
  [5.1, 3.2, 1.7, 1.9],
  [6.9, 3.2, 5.3, 2.2]])

# No cluster count is passed to MeanShift; fit() returns the fitted estimator.
mean_shift = MeanShift().fit(data)

# Print the cluster assignments first, then the centroids.
for fitted_attr in (mean_shift.labels_, mean_shift.cluster_centers_):
    print('{}\n'.format(repr(fitted_attr)))

# predict clusters
print('{}\n'.format(repr(mean_shift.predict(new_obs))))

array([1, 0, 0, 0, 1, 2, 0, 1, 0, 0])

array([[4.74      , 3.16      , 1.42      , 0.2       ],
       [5.03333333, 3.5       , 1.43333333, 0.2       ],
       [5.4       , 3.9       , 1.7       , 0.4       ]])

array([2, 2])



# DBSCAN Clustering  -  Density-based spatial clustering of applications with noise 

In [24]:
from sklearn.cluster import DBSCAN

# Observations to cluster: 10 rows x 4 columns, one observation per row.
data = np.array([
  [5.1, 3.5, 1.4, 0.2],
  [4.9, 3.0, 1.4, 0.2],
  [4.7, 3.2, 1.3, 0.2],
  [4.6, 3.1, 1.5, 0.2],
  [5.0, 3.6, 1.4, 0.2],
  [5.4, 3.9, 1.7, 0.4],
  [4.6, 3.4, 1.4, 0.3],
  [5.0, 3.4, 1.5, 0.2],
  [4.4, 2.9, 1.4, 0.2],
  [4.9, 3.1, 1.5, 0.1]])

# eps is the neighborhood radius and min_samples the neighbor count used by
# DBSCAN; fit() returns the fitted estimator, so the calls are chained.
dbscan = DBSCAN(eps=0.2, min_samples=2).fit(data)
core_idx = dbscan.core_sample_indices_

# Print the cluster assignments first (-1 in the output marks points not
# assigned to any cluster), then the row indexes of the core samples.
for fitted_attr in (dbscan.labels_, core_idx):
    print('{}\n'.format(repr(fitted_attr)))

# The index array is 1-D, so its size is the number of core samples.
num_core_samples = core_idx.size
print('Num core samples: {}\n'.format(num_core_samples))

array([ 0,  1, -1, -1,  0, -1, -1,  0, -1,  1])

array([0, 1, 4, 7, 9])

Num core samples: 5



# Cosine Similarities between pairs of observations

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

# Four 2-D observations, one per row.
data = np.array([
  [1.1, 0.3],
  [2.1, 0.6],
  [-1.1, -0.4],
  [0.0, -3.2]])

# With a single argument every row is compared with every row of the same
# array, so the result is a square (4 x 4) matrix with ones on the diagonal.
cos_sims = cosine_similarity(data)

print('{}\n'.format(repr(cos_sims)))

array([[ 1.        ,  0.99992743, -0.99659724, -0.26311741],
       [ 0.99992743,  1.        , -0.99751792, -0.27472113],
       [-0.99659724, -0.99751792,  1.        ,  0.34174306],
       [-0.26311741, -0.27472113,  0.34174306,  1.        ]])



# Cosine Similarities between two datasets

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# First dataset: four 2-D observations, one per row.
data = np.array([
  [1.1, 0.3],
  [2.1, 0.6],
  [-1.1, -0.4],
  [0.0, -3.2]])

# Second dataset: three 2-D observations, one per row.
data2 = np.array([
  [1.7, 0.4],
  [4.2, 1.25],
  [-8.1, 1.2]])

# Entry (i, j) compares row i of `data` with row j of `data2`, so the result
# has one row per observation in `data` and one column per observation in
# `data2` (4 x 3 here).
cos_sims = cosine_similarity(data, data2)

print('{}\n'.format(repr(cos_sims)))

array([[ 0.9993819 ,  0.99973508, -0.91578821],
       [ 0.99888586,  0.99993982, -0.9108828 ],
       [-0.99308366, -0.9982304 ,  0.87956492],
       [-0.22903933, -0.28525359, -0.14654866]])



# L1 Norm

In [32]:
# L1 norm of a vector: the sum of the absolute values of its components.
from numpy import array
from numpy.linalg import norm

# Vector whose norm is computed.
a = array([1, 2, -3])
print(a)

# ord=1 selects the L1 norm; the result is a float.
l1 = norm(a, ord=1)
print(l1)

[ 1  2 -3]
6.0


In [34]:
# Manual check of the L1 norm: add up the absolute value of each component.
sum(abs(component) for component in (1, 2, -3))

6

# L2 Norm

In [33]:
# L2 (Euclidean) norm of a vector: the square root of the sum of its squared
# components.
from numpy import array
from numpy.linalg import norm

# Vector whose norm is computed.
a = array([1, 2, -3])
print(a)

# With no `ord` argument, norm() returns the L2 norm of a 1-D vector.
l2 = norm(a)
print(l2)

[ 1  2 -3]
3.7416573867739413


In [56]:
import math

# Manual check of the L2 norm: square each component, add them up, and take
# the square root of the total.
math.sqrt(sum(component ** 2 for component in (1, 2, -3)))

3.7416573867739413