# K-means clustering implementation from scratch
# author: Alireza Meydani / Me, of course :D
# Std no: 4023082511


In [87]:
import numpy as np
import pandas as pd

In [88]:
def _nearest_centroid(X, centroids):
    '''
    Return, for every row of X, the index of the closest centroid
    (Euclidean distance).
    '''
    # Broadcasting (n, 1, d) - (k, d) -> (n, k, d); the norm over the last
    # axis yields an (n, k) matrix of point-to-centroid distances.
    distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)


def k_means(X, k, max_iters=100):
    '''
    Cluster the rows of X into k groups with Lloyd's k-means algorithm.

    Parameters:
        X: 2-D array-like of shape (n_samples, n_features), e.g. the
           ``.values`` of a numeric DataFrame.
        k: number of clusters / centroids (1 <= k <= n_samples).
        max_iters: upper bound on the number of centroid updates.

    Returns:
        (labels, centroids): labels[i] is the index of the centroid closest
        to X[i]; centroids is a (k, n_features) float array holding the
        final centroid positions.

    Raises:
        ValueError: if X is not 2-D or k is outside 1..n_samples.
    '''
    # Work in float: small integer dtypes (int8/uint8 category codes) would
    # otherwise wrap around in the first distance subtraction.
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError("k must be between 1 and the number of samples")

    # Start from k distinct samples chosen at random.
    centroids = X[np.random.choice(n_samples, k, replace=False)]
    labels = _nearest_centroid(X, centroids)

    for _ in range(max_iters):
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            # An empty cluster has no mean (it would be NaN); keep its
            # previous centroid instead.
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        # Converged: no centroid moved, so the labels cannot change either.
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids
        # Re-assign so the returned labels always match the returned centroids.
        labels = _nearest_centroid(X, centroids)

    return labels, centroids

In [89]:
def categorical_to_numerical(df):
    '''
    Replace every column of df, in place, with integer category codes.

    Each column is first cast to the pandas "category" dtype and then
    overwritten with the codes of its categories. The code dtype is
    chosen by pandas (int8 when there are few categories) and missing
    values are encoded as -1. Nothing is returned; df itself is modified.
    '''
    for column_name in df.columns:
        as_category = df[column_name].astype("category")
        df[column_name] = pd.Categorical(as_category).codes

def get_accuracy(y_predicted,y_true):
    '''
    Return the percentage (0-100) of positions where y_predicted
    equals y_true.

    Both inputs are converted to NumPy arrays and compared position by
    position, so plain lists and pandas Series (whatever their index)
    work as well as arrays.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    '''
    predicted = np.asarray(y_predicted)
    actual = np.asarray(y_true)
    if predicted.shape != actual.shape:
        raise ValueError("y_predicted and y_true must have the same shape")
    acc = np.sum(actual == predicted) / len(actual) * 100
    return acc
def get_confusion_matrix(y_test , y_pred):
    '''
    Count the four confusion-matrix cells for binary labels.

    A prediction counts as "positive" when it is non-zero. A sample is a
    true positive / true negative when the prediction equals the true
    label, and a false positive / false negative when it does not.

    Parameters:
        y_test: true labels (array-like).
        y_pred: predicted labels (array-like, same shape as y_test).

    Returns:
        float array ``(tp, fp, fn, tn)``; ``.reshape(2, 2)`` gives
        ``[[tp, fp], [fn, tn]]``.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    '''
    # Compare by position on plain arrays so pandas Series with a
    # non-default index (and plain lists) are handled correctly.
    actual = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError("y_test and y_pred must have the same shape")

    is_correct = actual == predicted
    is_positive = predicted.astype(bool)

    tp = np.count_nonzero(is_correct & is_positive)
    tn = np.count_nonzero(is_correct & ~is_positive)
    fp = np.count_nonzero(~is_correct & is_positive)
    fn = np.count_nonzero(~is_correct & ~is_positive)
    # Counts are returned as floats, as before.
    return np.array((tp, fp, fn, tn), dtype=float)
    


## Load dataset from CSV and data transformation
### and Cleanup process...

In [90]:

# Read the raw breast-cancer table and integer-encode every column in place.
df = pd.read_csv("./share/BCD/Breast_Cancer_dataset.csv")
categorical_to_numerical(df)

# Set the encoded "Class" column aside as ground truth and keep it out of
# the features that get clustered.
actual_results = df["Class"]
df = df.drop("Class", axis=1)

# Feature matrix as a plain NumPy array, the input format k_means works on.
X = df.to_numpy()


# Add cluster labels to the DataFrame



# Perform k-means clustering

In [91]:

# Number of clusters, i.e. how many centroids k_means maintains.
k = 2

# k_means draws its starting centroids at random, so which group ends up
# with id 0 or 1 can differ from one run to the next.
labels, centroids = k_means(X, k, max_iters=200)
df["Cluster"] = labels


### The Results:

In [92]:
# Show the final centroid coordinates returned by k_means (one row per cluster).
print("Centroids:\n", centroids)



Centroids:
 [[2.66842105 1.11052632 3.25263158 0.04210526 0.08421053 0.93684211
  0.45263158 1.75789474 0.13157895]
 [2.65625    1.         5.66666667 3.11458333 0.58333333 1.27083333
  0.5        1.86458333 0.44791667]]


In [93]:
# Print a heading, then leave df (which now carries the "Cluster" column)
# as the cell's last expression so the notebook renders it.
print("Updated DataFrame with Cluster labels:\n")
df

Updated DataFrame with Cluster labels:



Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Cluster
0,2,2,2,0,1,2,1,2,0,0
1,3,0,2,0,0,0,1,0,0,0
2,3,0,6,0,0,1,0,1,0,0
3,2,2,6,0,1,2,1,1,1,0
4,2,2,5,4,1,1,0,4,0,1
...,...,...,...,...,...,...,...,...,...,...
281,3,0,5,5,1,1,0,1,0,1
282,3,2,4,4,1,1,0,1,1,1
283,1,2,5,5,1,1,1,4,0,1
284,3,2,2,0,0,1,1,1,0,0


# confusion_matrix

In [94]:
# Compute the counts once and reuse them: unpack the four cells for the
# metrics, and show them as a 2x2 table [[tp, fp], [fn, tn]].
confusion = get_confusion_matrix(actual_results, df["Cluster"])
tp, fp, fn, tn = confusion
confusion.reshape(2, 2)

array([[ 43.,  53.],
       [ 42., 148.]])

## Error, Recall, Precision

In [95]:
# Percentage of samples whose cluster id equals the encoded class label,
# and its complement.
accuracy = get_accuracy(df["Cluster"], actual_results)
error = 100 - accuracy

# Recall (sensitivity): share of the actual positives that were found.
actual_positives = tp + fn
recall = tp / actual_positives * 100

# Precision: share of the positive predictions that were correct.
predicted_positives = tp + fp
precision = tp / predicted_positives * 100

print(f"Current Accuracy:\n\t%{accuracy}")
print(f"Error ratio:\n\t%{error}")
print(f"Recall:\n\t%{recall}")
print(f"Precision:\n\t%{precision}")

Current Accuracy:
	%66.78321678321679
Error ratio:
	%33.21678321678321
Recall:
	%50.588235294117645
Precision:
	%44.79166666666667
