In [42]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
from skopt import gp_minimize

In [2]:
import numpy as np

# Make data

In [100]:
# Toy dataset with the label attached as the last column of every row.
# The first two columns are the features; a NaN in the last column is a
# missing label (such rows are filtered out by clean() before scoring).
data = np.array(
    [
        [1, 2, np.nan],
        [1, 1, 0],
        [2, 3, 1],
        [3, 2, 1],
        [3, 2, np.nan],
        [5, 6, 0],
    ]
    + [[1, 3, 1]] * 14  # fourteen identical rows labelled 1
)

In [101]:
# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
data_train, data_test, labels_train, labels_test = train_test_split(
    data[:, :-1],  # features: every column except the last
    data[:, -1],   # labels: the last column (may contain NaN)
    test_size=0.3,
    random_state=42,
)

In [102]:
# Inspect the training labels; NaN entries are rows whose label is missing.
labels_train

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., nan,  1.,  1.,  1.,  1.,
        1.])

In [103]:
# Inspect the held-out labels; NaN entries are rows whose label is missing.
labels_test

array([nan,  1.,  1.,  0.,  1.,  0.])

# K-means scratch

In [104]:
# Cluster the training features only (the label column was split off earlier).
# NOTE(review): the recorded output below contains only three distinct cluster
# ids although n_clusters=4 -- presumably the training split has fewer distinct
# points than requested clusters; confirm and lower n_clusters if so.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(data_train)
kmeans.labels_

  """Entry point for launching an IPython kernel.


array([0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int32)

In [105]:
# Assign each held-out row to the nearest centroid of the fitted model.
preds_test = kmeans.predict(data_test)
preds_test

array([0, 0, 0, 0, 0, 2], dtype=int32)

In [106]:
# Show the held-out labels again, next to the predicted cluster ids above.
labels_test

array([nan,  1.,  1.,  0.,  1.,  0.])

In [107]:
def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to reference labels.

    Purity is the fraction of samples that carry the most frequent true
    label of the cluster they were assigned to.

    Parameters
    ----------
    y_true : array-like
        Reference class labels.
    y_pred : array-like
        Cluster assignments, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Sum of the per-cluster majority counts divided by the total count.
    """
    # Rows index the true classes, columns index the predicted clusters.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Largest class count inside each cluster (one value per column).
    majority_counts = table.max(axis=0)
    return majority_counts.sum() / table.sum()

In [108]:
def clean(y_true, y_pred):
    """Drop every position whose ground-truth label is NaN.

    Parameters
    ----------
    y_true : array-like of numbers
        Ground-truth labels; a NaN marks a position to discard.
    y_pred : array-like
        Values aligned element-wise with ``y_true`` (e.g. cluster ids).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``y_true`` and ``y_pred`` restricted to the positions where
        ``y_true`` is not NaN. Order and dtype of both inputs are preserved,
        including when nothing is left after filtering.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    # A single boolean mask filters both arrays in one vectorised step and
    # keeps their dtypes, unlike rebuilding each array from a Python list.
    keep = ~np.isnan(y_true)
    return y_true[keep], y_pred[keep]

In [109]:
# Preview the test labels and predictions once NaN-labelled rows are removed.
clean(labels_test, preds_test)

(array([1., 1., 0., 1., 0.]), array([0, 0, 0, 0, 2], dtype=int32))

In [110]:
# Score the clustering on the labelled part of the test split only.
clean1, clean2 = clean(labels_test, preds_test)
purity_score(clean1, clean2)

0.8

In [118]:
def objective(params):
    """Fit K-means with ``params[0]`` clusters and return its test purity.

    Reads the notebook-level ``data_train``, ``data_test`` and
    ``labels_test``; prints the purity together with the cluster count.

    Parameters
    ----------
    params : sequence
        Only the first element is used: the number of clusters.

    Returns
    -------
    float
        Purity of the predicted test clusters against the test labels.

    NOTE(review): the returned value is a purity, where larger is better.
    ``gp_minimize`` is imported in this notebook but not called in the visible
    cells; a minimiser would need the negated score -- confirm intended use.
    """
    n_clusters = params[0]
    model = KMeans(n_clusters=n_clusters, random_state=0)
    model.fit(data_train)
    assignments = model.predict(data_test)
    # Positions with a NaN test label are removed before scoring.
    kept_labels, kept_assignments = clean(labels_test, assignments)
    purity = purity_score(kept_labels, kept_assignments)
    print('Purity is %s when k is %s' % (purity, n_clusters))
    return purity

# Test purity function

In [120]:
# Hand-made 3x3 table of counts used to sanity-check the purity formula.
ex = np.array(
    [
        [0, 20, 30],
        [0, 20, 5],
        [25, 0, 0],
    ]
)
ex

array([[ 0, 20, 30],
       [ 0, 20,  5],
       [25,  0,  0]])

In [121]:
# Same ratio purity_score computes: column maxima over the grand total,
# here (25 + 20 + 30) / 100.
ex.max(axis=0).sum() / ex.sum()

0.75

# Hyperparameter optimization (HPOpt)

In [119]:
# Report the test-split purity for a few candidate cluster counts.
for k in (2, 3, 4):
    objective([k])

Purity is 0.8 when k is 2
Purity is 0.8 when k is 3
Purity is 0.8 when k is 4


  This is separate from the ipykernel package so we can avoid doing imports until
