In [52]:
import pandas as pd #Uploading csv files etc.
import numpy as np #For matrix aritmethics
from sklearn.cluster import KMeans #For Kmeans algorithm
from sklearn.metrics import mean_squared_error #For calculating the accuracy

In [53]:
#Put the csv files in the same file with the code
breast_data = pd.read_csv('breast_data.csv')
breast_truth = pd.read_csv('breast_truth.csv')

In [54]:
#We checked number of rows and columns of breast_data
breast_data.shape

(568, 30)

In [55]:
#We check number of rows and columns of breast_truth
breast_truth.shape

(568, 1)

In [56]:
#K is a hyperparameter(we decide it's value)
#In our problem there are 2 clusters: benign and malign
kmean=KMeans(n_clusters=2)
kmean.fit(breast_data)

KMeans(n_clusters=2)

In [57]:
#We can see our centers
kmean.cluster_centers_

array([[1.25562991e+01, 1.85703653e+01, 8.11234703e+01, 4.96061872e+02,
        9.48844977e-02, 9.10998174e-02, 6.24377642e-02, 3.34325434e-02,
        1.78057991e-01, 6.34540183e-02, 3.04190868e-01, 1.21515320e+00,
        2.15288059e+00, 2.37852922e+01, 7.17326256e-03, 2.34746895e-02,
        2.87455128e-02, 1.06363242e-02, 2.06135799e-02, 3.74750297e-03,
        1.40439018e+01, 2.47095434e+01, 9.19375114e+01, 6.19647945e+02,
        1.29959110e-01, 2.23311758e-01, 2.19214947e-01, 9.13298425e-02,
        2.83553653e-01, 8.32819406e-02],
       [1.93906154e+01, 2.17816154e+01, 1.28273077e+02, 1.18735231e+03,
        1.01163000e-01, 1.47620769e-01, 1.75992077e-01, 1.00341846e-01,
        1.91152308e-01, 6.04636154e-02, 7.40094615e-01, 1.22497846e+00,
        5.22490000e+00, 9.52341538e+01, 6.60022308e-03, 3.20469769e-02,
        4.23327692e-02, 1.56724769e-02, 2.02291538e-02, 3.93616154e-03,
        2.36966154e+01, 2.90017692e+01, 1.58295385e+02, 1.75097692e+03,
        1.40257231e-01,

In [58]:
#We can check labels created
breast_predict=kmean.labels_
breast_predict

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [63]:
#Accuracy is not commonly used in unsupervised algorithms altough
#We can check the accuracy by using MSE(mean squared error)
#Result is very low as expected (%14)
#Kmeans gets better by iteration(also called lazy learner because of it's this feature)
print('%',100*mean_squared_error(breast_predict,breast_truth))

% 14.612676056338028


In [62]:
#Running our algorithm several times starting with different centers
#The result got better(%85) because kmeans learns slow to not overfit data
for i in range(1,21):
    #kmean=KMeans(n_clusters=2) 
    kmean.fit(breast_data)
    breast_predict=kmean.labels_
    print('Iteration',i,':   %',100*mean_squared_error(breast_predict,breast_truth))

Iteration 1 :   % 85.38732394366197
Iteration 2 :   % 14.612676056338028
Iteration 3 :   % 14.612676056338028
Iteration 4 :   % 85.38732394366197
Iteration 5 :   % 85.38732394366197
Iteration 6 :   % 85.38732394366197
Iteration 7 :   % 14.612676056338028
Iteration 8 :   % 14.612676056338028
Iteration 9 :   % 14.612676056338028
Iteration 10 :   % 14.612676056338028
Iteration 11 :   % 14.612676056338028
Iteration 12 :   % 14.612676056338028
Iteration 13 :   % 14.612676056338028
Iteration 14 :   % 14.612676056338028
Iteration 15 :   % 85.38732394366197
Iteration 16 :   % 14.612676056338028
Iteration 17 :   % 85.38732394366197
Iteration 18 :   % 14.612676056338028
Iteration 19 :   % 14.612676056338028
Iteration 20 :   % 14.612676056338028
