In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from pandas import DataFrame
import operator
import matplotlib.cm as cm
import math

In [2]:
# Load the cleaned dataset and drop the `date` and `Address` columns.
x = pd.read_csv('data_cleaned_avg_cluster2.csv').drop(columns=['date', 'Address'])

# Two-stage split with a fixed seed:
#   1) 20% of all rows are held out as `test_daily`;
#   2) 12.5% of the remainder (about 10% of all rows) becomes `test`.
# Roughly 70% of the rows stay in `X_after`. Either held-out frame can be
# exported with `DataFrame.to_csv(..., index=False, header=True)` if needed.
X_after, test_daily = train_test_split(
    x,
    test_size=0.2,
    random_state=123,
)
X_after, test = train_test_split(
    X_after,
    test_size=0.125,
    random_state=123,
)

# Seven base colour names repeated ten times (70 entries).
colors = ['green', 'red', 'cyan', 'blue', 'black', 'yellow', 'magenta'] * 10

In [3]:
# Preview the first five rows of the training split.
X_after.head(5)

Unnamed: 0,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5
14832,37.584848,127.094023,0.00487,0.014913,0.02087,0.3,46.652174,13.26087
5935,37.504547,126.994458,0.003083,0.035292,0.007167,0.520833,51.666667,39.125
7379,37.658774,127.068505,0.005133,0.043467,0.0046,0.853333,31.066667,25.133333
11247,37.609823,126.934848,0.004083,0.029042,0.019375,0.745833,32.291667,18.458333
11104,37.609823,126.934848,0.002412,0.012471,0.045,0.335294,25.705882,11.647059


In [4]:
class K_means(object):
    """Small k-means (Lloyd's algorithm) implementation on a 2-D NumPy array.

    A completed ``train`` call stores the fitted state on the instance:
    ``self.centroids`` (array of shape ``(k, n_features)``) and
    ``self.labels`` (one cluster index per input row).
    """

    def train(self, data, k, verbose=0):
        """Cluster the rows of ``data`` into ``k`` groups.

        Initial centroids are drawn uniformly at random inside the
        per-feature [min, max] box of ``data`` (uses the global
        ``np.random`` state, so seed it beforehand for reproducible runs).
        Iteration stops when the centroids move by less than machine epsilon.

        Parameters
        ----------
        data : np.ndarray
            2-D array, one row per sample. The plots use columns 0 and 1,
            so at least two columns are required.
        k : int
            Number of clusters.
        verbose : int, optional
            ``1`` prints and plots intermediate state; any other value only
            prints "DONE!" and draws the final scatter plot.

        Returns
        -------
        int
            Always ``1``. Earlier revisions returned ``0`` when a cluster
            became empty; that case is now repaired in place (see below), so
            callers testing the return value for truthiness keep working.
        """
        n_samples, n_features = data.shape

        # Per-feature bounds of the data; the initial centroids are sampled
        # inside this box.
        lows = np.min(data, axis=0)
        highs = np.max(data, axis=0)

        if verbose == 1:
            print('Ranges: ')
            print(np.column_stack((lows, highs)))

        # One uniform draw per (cluster, feature) cell. Drawing the whole
        # (k, n_features) block at once avoids assigning 1-element arrays
        # into scalar slots, which recent NumPy versions deprecate.
        centroids = np.random.uniform(lows, highs, size=(k, n_features))

        if verbose == 1:
            print('Centroids: ')
            print(centroids)

            plt.scatter(data[:, 0], data[:, 1])
            plt.scatter(centroids[:, 0], centroids[:, 1], c='r')
            plt.show()

        count = 0
        while True:
            count += 1
            if verbose == 1:
                print('-----------------------------------------------')
                print('Iteration: ', count)

            # Euclidean distance from every sample to every centroid,
            # computed one centroid (column) at a time.
            distances = np.zeros((n_samples, k))
            for ic, c in enumerate(centroids):
                distances[:, ic] = np.linalg.norm(data - c, axis=1)

            # Assign each sample to its nearest centroid.
            labels = np.argmin(distances, axis=1)

            # Distance of each sample to the centroid it was assigned to.
            # Fancy indexing returns a copy, so it can be edited below.
            nearest = distances[np.arange(n_samples), labels]

            # Compute the new centroid positions.
            new_centroids = np.zeros((k, n_features))
            for centroid in range(k):
                members = data[labels == centroid]
                if len(members) > 0:
                    new_centroids[centroid] = members.mean(axis=0)
                    continue

                # Empty cluster: instead of aborting (which left the model
                # without usable centroids/labels), move this centroid onto
                # the sample that is currently farthest from its own centroid.
                far = np.argmax(nearest)
                if nearest[far] > 0:
                    new_centroids[centroid] = data[far]
                    # Do not hand the same sample to another empty cluster.
                    nearest[far] = 0.0
                else:
                    # Every sample already sits exactly on a centroid, so
                    # there is nothing to relocate to; keep the old position.
                    new_centroids[centroid] = centroids[centroid]

            if verbose == 1:
                plt.scatter(data[:, 0], data[:, 1], c=labels)
                plt.scatter(new_centroids[:, 0], new_centroids[:, 1], c='r')
                plt.show()

            # Stop once the centroids have moved by less than machine epsilon.
            if np.linalg.norm(new_centroids - centroids) < np.finfo(float).eps:
                print("DONE!")
                plt.scatter(data[:, 0], data[:, 1], c=labels)
                plt.scatter(new_centroids[:, 0], new_centroids[:, 1], c='r')
                plt.show()
                break

            # Move the centroids to their new positions for the next pass.
            centroids = new_centroids

        self.centroids = centroids
        self.labels = labels
        if verbose == 1:
            print(labels)
            print(centroids)
        return 1

    def getAverageDistance(self, data):
        """Return one distance summary per cluster for ``data``.

        For each centroid, the Euclidean distances of the rows of ``data``
        labelled with that cluster are summed and the square root of that
        sum is stored. ``data`` must be row-aligned with ``self.labels``.

        NOTE(review): despite the method name this is sqrt(sum of distances),
        not a mean distance and not a sum of squared distances (WCSS).
        The formula is kept as-is; confirm the intended metric.

        Returns
        -------
        np.ndarray
            1-D array with one value per centroid (0.0 for an empty cluster).
        """
        dists = np.zeros((len(self.centroids),))
        for ix, centroid in enumerate(self.centroids):
            members = data[self.labels == ix]
            total = np.linalg.norm(members - centroid, axis=1).sum()
            dists[ix] = math.sqrt(total)
        return dists

    def getLabels(self):
        """Return the cluster index assigned to each row by the last fit."""
        return self.labels

In [5]:
# Independent copy of the clustering features: the two coordinate columns
# followed by the O3, SO2, NO2 and CO columns.
x2 = X_after.loc[:, ['Longitude', 'Latitude', 'O3', 'SO2', 'NO2', 'CO']].copy()
x2.head(5)

Unnamed: 0,Longitude,Latitude,O3,SO2,NO2,CO
14832,127.094023,37.584848,0.02087,0.00487,0.014913,0.3
5935,126.994458,37.504547,0.007167,0.003083,0.035292,0.520833
7379,127.068505,37.658774,0.0046,0.005133,0.043467,0.853333
11247,126.934848,37.609823,0.019375,0.004083,0.029042,0.745833
11104,126.934848,37.609823,0.045,0.002412,0.012471,0.335294


In [6]:
# Copy the selected features into a plain NumPy array for the K_means class.
x_array = np.array(x2.values)
x_array

array([[1.27094023e+02, 3.75848485e+01, 2.08695652e-02, 4.86956522e-03,
        1.49130435e-02, 3.00000000e-01],
       [1.26994458e+02, 3.75045471e+01, 7.16666667e-03, 3.08333333e-03,
        3.52916667e-02, 5.20833333e-01],
       [1.27068505e+02, 3.76587743e+01, 4.60000000e-03, 5.13333333e-03,
        4.34666667e-02, 8.53333333e-01],
       ...,
       [1.27011952e+02, 3.76479299e+01, 3.43043478e-02, 3.47826087e-03,
        7.47826087e-03, 3.34782609e-01],
       [1.26949679e+02, 3.75937421e+01, 2.08947368e-02, 2.63157895e-03,
        2.03157895e-02, 6.89473684e-01],
       [1.27027279e+02, 3.76067189e+01, 2.52500000e-02, 4.91666667e-03,
        1.61666667e-02, 4.00000000e-01]])

In [None]:
# Fit three clusters on the raw (unscaled) feature array; the cell displays
# the status value returned by train().
clf = K_means()
clf.train(data=x_array, k=3)

In [None]:
# Standardize every feature column of the training array.
# `preprocessing` is already imported in the notebook's import cell, so it is
# not re-imported here.
x_scaled = preprocessing.scale(x_array)
x_scaled

In [None]:
# Elbow search on the standardized features: fit once per candidate cluster
# count (1..10) and keep the per-cluster distance summary of each fit.
list_distance = []
for n_clusters in range(1, 11):
    # Only record distances from a fit that reported success (truthy return);
    # otherwise the model may still hold state from an earlier fit. Retry with
    # a fresh random initialisation a bounded number of times.
    for _attempt in range(20):
        if clf.train(x_scaled, n_clusters):
            break
    else:
        raise RuntimeError(
            'K_means.train did not succeed for k=%d after 20 attempts' % n_clusters
        )
    list_distance.append(clf.getAverageDistance(x_scaled))

print(list_distance)

In [None]:
# Collapse each per-cluster distance array to a single value per k: the mean
# over clusters (a single-cluster entry is its own mean; an empty entry
# yields 0). The comprehension variable does not leak, so the DataFrame `x`
# is no longer overwritten by a loop index.
list_distance2 = [
    sum(cluster_dists) / len(cluster_dists) if len(cluster_dists) else 0
    for cluster_dists in list_distance
]
print(list_distance2)

In [None]:
# Elbow plot: one aggregated distance value per candidate cluster count.
number_of_clusters = range(1, 11)
plt.plot(number_of_clusters, list_distance2, marker='o')
plt.xlabel('NUMBER OF CLUSTERS')
plt.ylabel('WCSS value')
# Call show(); the bare attribute `plt.show` only echoed the function object.
plt.show()

Analisis:
1. Warna ungu menunjukkan daerah dengan polusi lebih tinggi.
<br>
2. Warna kuning menunjukkan daerah dengan polusi lebih rendah.

In [None]:
# Independent copy of the same six feature columns, taken from the
# `test_daily` hold-out split.
x3 = test_daily.loc[:, ['Longitude', 'Latitude', 'O3', 'SO2', 'NO2', 'CO']].copy()
x3.head(5)

In [None]:
# Copy the hold-out features into a plain NumPy array.
x_test = np.array(x3.values)
x_test

In [None]:
# Standardize the hold-out features column by column.
# `preprocessing` is already imported in the notebook's import cell, so it is
# not re-imported here.
# NOTE(review): this rebinds `x_scaled`, replacing the standardized training
# array, and scales the hold-out set with its own mean/std rather than the
# training statistics. Confirm that both are intended.
x_scaled = preprocessing.scale(x_test)
x_scaled

In [None]:
# Fresh model: fit two clusters on the standardized hold-out features; the
# cell displays the status value returned by train().
clf = K_means()
clf.train(data=x_scaled, k=2)