# Tugas Besar 2A - Pembelajaran Mesin


#### Anggota Kelompok
1. Ahmad Mutawalli - 13517026, K02
2. Harry Rahmadi Munly - 13517033, K03
3. Ardysatrio Fakhri Haroen - 13517062, K02
4. Dandi Agus Maulana - 13517077, K02

## Import Dependencies

In [152]:
import sys
import random
import copy
import time
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn import decomposition

## Implementasi modul K-means

In [396]:
class Kmeans():
    def __init__(self, cluster=4, max_iter=10):
        self.cluster = cluster
        self.max_iter = max_iter
        self.kmeans_centroids = []
    
    # Init centroids
    def init_centroid(self, data):
        num_data, num_feature = np.shape(data)
        centroids = np.zeros((self.cluster, num_feature))
        for i in range(self.cluster):
            centroid = data[np.random.choice(range(num_data))]
            centroids[i] = centroid
        return centroids

    #calculate euclidean distance
    def euclidean_distance (self, p1, p2):
        sum_square = 0
        for i in range(0, len(p1)):
            sum_square += math.pow(float(p1[i]) - float(p2[i]), 2)
        return float(math.sqrt(sum_square))

    # get closer centroid from a point
    def get_closest_centroid(self, point, centroids):
        index = -1
        min_dist = 999999.0
        for i, centroid in enumerate(centroids):
            distance = self.euclidean_distance(point, centroid)
            if distance < min_dist:
                index = i
                min_dist = distance
        return index

    # get clustering
    def assign_cluster(self, centroids, data):
        num_data = np.shape(data)[0]
        clusters = [[] for i in range(self.cluster)]
        for idx, value in enumerate(data):
            closest_centroid = self.get_closest_centroid(value, centroids)
            clusters[closest_centroid].append(idx)
        return clusters

    # make new centroids from the means of data in each cluster
    def make_new_centroids(self, clusters, data):
        num_feature = np.shape(data)[1]
        centroids = np.zeros((self.cluster, num_feature))
        for i, cluster in enumerate(clusters):
            centroid = np.mean(data[cluster], axis=0)
            centroids[i] = centroid
        return centroids

    # get label of data
    def get_cluster_labels(self, clusters, data):
        # One prediction for each sample
        label = np.zeros(np.shape(data)[0])
        for idx, cluster in enumerate(clusters):
            for sample_idx in cluster:
                label[sample_idx] = idx
        return label

    def check_change(self, data1, data2):
        change = False
        for i in range (len(data1)):
            if(not change):
                for j in range(len(data1[i])):
                    if (data1[i][j] != data2[i][j]):
                        change = True
                        break
        return change
    
    # run k-means clustering and return the centroids of the clusters
    def fit(self, data):
        # Initialize centroids
        centroids = self.init_centroid(data)

        # Iterate until convergence or for max iterations
        for i in range(self.max_iter):
            # Assign samples to closest centroids (create clusters)
            clusters = self.assign_cluster(centroids, data)

            prev_centroids = centroids
            # Calculate new centroids from the clusters
            centroids = self.make_new_centroids(clusters, data)

            # break if no change 
            if (not self.check_change(prev_centroids, centroids)):
                print("stop at iter :" + str(i))
                break

        self.kmeans_centroids = centroids
        return centroids

    # Predict the class of each sample
    def predict(self, data):
        
        clusters = self.assign_cluster(self.kmeans_centroids, data)

        predicted_labels = self.get_cluster_labels(clusters, data)

        return predicted_labels


## Clustering with K-means

In [397]:
#Load dataset iris
X_iris, y_iris = load_iris(return_X_y=True)
iris_data = pd.DataFrame(X_iris, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']); iris_data['label'] = y_iris
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [431]:
iris = load_iris()
train_data = np.array(iris.data)
train_labels = np.array(iris.target)
num_features = train_data.data.shape[1]

In [436]:
kmean = Kmeans(cluster=num_classes, max_iter=1000)

print("Centroid :")
centroids = kmean.fit(train_data)
print(centroids)

print("\n")
print("Predicted label :")
predicted_labels = kmean.predict(train_data)
print(predicted_labels)

Centroid :
stop at iter :13
[[5.006      3.428      1.462      0.246     ]
 [6.85384615 3.07692308 5.71538462 2.05384615]
 [5.88360656 2.74098361 4.38852459 1.43442623]]


Predicted label :
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 2. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 1. 2. 1. 1. 1. 1. 2. 1. 1. 1. 1. 1. 1. 2. 2. 1. 1. 1. 1. 2.
 1. 2. 1. 2. 1. 1. 2. 2. 1. 1. 1. 1. 1. 2. 1. 1. 1. 1. 2. 1. 1. 1. 2. 1.
 1. 1. 2. 1. 1. 2.]


In [437]:
# Calculate accuracy
Correct = 0
for index in range(len(train_labels)):
    current_label = train_labels[index]
    predicted_label = predicted_labels[index]
    if current_label == predicted_label:
        Correct += 1

Accuracy = Correct / len(train_labels)

print("K-Means Classification Accuracy = ", Accuracy)

K-Means Classification Accuracy =  0.44666666666666666
