<a href="https://colab.research.google.com/github/anu-shree-anil/Machine-learning-Algorithms/blob/main/k_medoids_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import math
from statistics import mean
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.metrics import silhouette_samples, silhouette_score


In [7]:
#loading the data
wine=datasets.load_wine()
print(wine.DESCR)
df=pd.DataFrame(data=np.c_[wine['data'],wine['target']],columns=wine['feature_names']+['target'])
X=pd.DataFrame(wine.data)
print("ORIGINAL DATA:")
print(df)

#normalize the dataset
for column in X.columns:
    X[column] = (X[column] - X[column].min()) / (X[column].max() - X[column].min()) 

df=X.copy()

print("Dataset without the labels: \n",df)
df=df.to_numpy()

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [8]:
def centroids(dataset,k):

  number_of_rows = dataset.shape[0]
  random_indices = np.random.choice(number_of_rows, size=k, replace=False)
  centroids = dataset[random_indices, :]

  return centroids 

def euclidean(x1,x2):
    square = np.square(x1 - x2)
    # Get the sum of the square
    sum_square = np.sum(square)
    distance = np.sqrt(sum_square)

    return distance

def sse_score(dataset,distance_data,cluster):
   sse=0
   for j in range(len(cluster)):
      error=0
      for i in range(len(distance_data)):
          if distance_data[i][-1]==j+1:
            error=error+euclidean(dataset[i],cluster[j])
      sse=sse+error      

   return sse

def distance_chart(dataset,cluster):
    distance_data=[]
    for i in range(len(dataset)):
      l=[]
      for j in range(len(cluster)):
        l.append(euclidean(dataset[i],cluster[j]))
    
      l.append(l.index(min(l))+1)
      distance_data.append(l)
    
    return distance_data

def labels(dataset,cluster):
    
    label=[]
    for i in range(len(dataset)):
      l=[]
      for j in range(len(cluster)):
        l.append(euclidean(dataset[i],cluster[j]))
    
      label.append(l.index(min(l))+1)
    
    return label

def k_medoid(dataset,k,epochs):
  epoch=0
  cluster=np.array(centroids(dataset,k))
  converged=True
  distance_data=[]
  while converged:
    
    distance_data=distance_chart(dataset,cluster)
    sse_old=sse_score(dataset,distance_data,cluster)
    
    #updating the clusters
    new_cluster=[]
    #changing the first cluster
    number_of_rows = dataset.shape[0]
    random_index = np.random.choice(number_of_rows, size=1, replace=False)
    medoid = dataset[random_index]
    while (medoid==cluster).all():
      random_index = np.random.choice(number_of_rows, size=1, replace=False)
      medoid = dataset[random_index]
    
    new_cluster=cluster.copy()
    #dist=[]
    #for j in range(k):
      #dist.append(euclidean(medoid,cluster[j]))

    new_cluster[random.randint(0,k-1)]=np.array(medoid)
    #calculating sse for new clusters
    distance_data_new=distance_chart(dataset,new_cluster)
    sse_new=sse_score(dataset,distance_data_new,new_cluster)
    
    if((new_cluster==cluster).all() or epoch>epochs):
        converged=False

    #calculating swap score
    swp=sse_new-sse_old
    if swp<0:
      for j in range(k):
        cluster[j]=new_cluster[j].copy()

      distance_data=distance_data_new.copy()
 
    cluster=np.array(cluster)
    new_cluster=np.array(new_cluster)
    epoch=epoch+1  
    
    
  return sse_score(dataset,distance_data,cluster), cluster

sse=k_medoid(df,3,300)
print("SSE: ",sse[0])
cluster=sse[1]
label=labels(df,cluster)
silhouette_avg = silhouette_score(df,label)
print("Silhouette Score: ",silhouette_avg)


SSE:  124.65483555361087
Silhouette Score:  0.2773953186092372


In [9]:
from sklearn.cluster import KMeans
Kmean = KMeans(n_clusters=3)
Kmean.fit(df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [10]:
print(Kmean.inertia_)
cluster_labels = Kmean.fit_predict(df)
silhouette_avg = silhouette_score(df, cluster_labels)
print("Silhouette Score: ",silhouette_avg)

48.97029115513917
Silhouette Score:  0.3008938518500134
