In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the housing dataset on the mounted Google Drive.
# `path` is a plain string; importing os.path under the same name would be
# shadowed immediately by this assignment, so no such import is made.
path = '/content/drive/MyDrive/Colab Notebooks/csv/housing.csv'
df = pd.read_csv(path)

# Summary statistics of the loaded frame, shown as the cell output.
df.describe()

In [None]:
# Slice out the two coordinate columns that will be analysed.
# .copy() makes df_new an independent frame, so the columns added to it later
# do not raise pandas' SettingWithCopyWarning.
df_new = df[["longitude", "latitude"]].copy()
df_new.head()  # must be called: a bare `df_new.head` only displays the bound-method repr

In [None]:
# Summary statistics of df_new, shown as the cell output.
df_new.describe()

In [None]:
# Check whether any values are null: counts the nulls in each column.
df_new.isnull().sum()

In [None]:
# Scatter plot of every record's position (longitude against latitude).
ax = df.plot.scatter(x='longitude', y='latitude')
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_title('Housing')
plt.show()

In [None]:
# Read the 2 features: keep only the coordinate columns in df.
feature_columns = ["longitude", "latitude"]
df = df.loc[:, feature_columns]
df.head()

In [None]:
# Assumed value of k: 3 clusters.
# random_state pins the centroid initialisation; when it is left unset,
# scikit-learn draws from NumPy's global random generator, so labels,
# centroids and inertia can differ from one run to the next.
kmeans = KMeans(n_clusters=3, random_state=42)  # create the k-means object
kmeans.fit(df)  # fit the model; the fitted estimator is the cell output

In [None]:
# Generate the cluster label of every point with the fitted model.
labels = kmeans.predict(df)
labels

In [None]:
# Show the centroid of each cluster found by the fitted model.
centroids = kmeans.cluster_centers_
centroids

In [None]:
# Inspect the inertia of the fitted k-means model.
kmeans.inertia_

In [None]:
# Fit one K-Means model for each candidate number of clusters (1..20) and
# record its inertia; the values feed the elbow curve.
# - The sweep uses its own name (`elbow_model`) so that the `kmeans` estimator
#   fitted earlier is not silently replaced by the 20-cluster model.
# - random_state makes the recorded inertia values reproducible.
inertia_list = []
for num_clusters in np.arange(1, 21):
    elbow_model = KMeans(n_clusters=num_clusters, random_state=42)
    elbow_model.fit(df)
    inertia_list.append(elbow_model.inertia_)

In [None]:
# Display the inertia recorded for each number of clusters.
inertia_list

In [None]:
# Plot the elbow curve: inertia as a function of the number of clusters.
cluster_counts = np.arange(1, 21)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(cluster_counts, inertia_list, color='blue')    # connecting line
ax.scatter(cluster_counts, inertia_list, color='red')  # one marker per k
ax.grid(True)
ax.set_xlabel('Jumlah Cluster')
ax.set_ylabel('Nilai Inertia')
ax.set_title('Elbow Curve')
plt.show()

In [None]:
# Step 1: pick the starting centroids at random.
k = 3  # assume we are looking for 3 clusters
np.random.seed(40)

# Draw k random [longitude, latitude] points with integer coordinates.
# Each centroid is kept as a list so that it can be modified in place later.
random_centroids = [
    [np.random.randint(-124, -114), np.random.randint(32, 42)]
    for _ in range(k)
]
print('Randomly selected point as random_centroids:', random_centroids)

In [None]:
# Plot the data together with the random starting centroid of each segment.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(df_new.iloc[:, 0], df_new.iloc[:, 1], color='cyan')

# `length` and `colors1` are notebook-level names that later cells read as well.
length = len(random_centroids)
colors1 = ['r', 'g', 'b', 'cyan', 'yellow', 'black']
for idx, centroid in enumerate(random_centroids):
    ax.scatter(centroid[0], centroid[1], color=colors1[idx])

ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
plt.show()

In [None]:
# Second step

def assignment(df_new, random_centroids):
    """Assign every data point to its closest centroid.

    The Euclidean distance between each point and each centroid is computed,
    and every point is labelled with the index of the nearest centroid.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Frame whose first two columns (by position) hold the point
        coordinates. It is not modified; the result is built on a copy.
    random_centroids : sequence of [x, y]
        Centroid coordinates. Any number of centroids (>= 1) is accepted; the
        count is taken from this argument, not from a notebook-level variable.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_new`` with these columns added (or overwritten):
        ``'Distance from Centroid <i>'`` for every centroid ``i``,
        ``'Closest_Centroid'`` (index of the nearest centroid; the lowest
        index wins a tie) and ``'Color'`` (plot colour of that centroid).

    Raises
    ------
    ValueError
        If ``random_centroids`` is empty.
    """
    # Plot colour per centroid index; reused cyclically beyond six centroids.
    palette = ['r', 'g', 'b', 'cyan', 'yellow', 'black']

    if len(random_centroids) == 0:
        raise ValueError('random_centroids must contain at least one centroid')

    # Work on a copy so the caller's frame is never changed behind its back.
    result = df_new.copy()
    xs = result.iloc[:, 0]
    ys = result.iloc[:, 1]

    distance_columns = []
    for i, centroid in enumerate(random_centroids):
        column = 'Distance from Centroid {}'.format(i)
        result[column] = np.sqrt((xs - centroid[0]) ** 2 + (ys - centroid[1]) ** 2)
        distance_columns.append(column)

    # argmin returns the first minimum, i.e. the lowest centroid index on ties.
    closest = result[distance_columns].to_numpy().argmin(axis=1)
    result['Closest_Centroid'] = closest
    result['Color'] = [palette[idx % len(palette)] for idx in closest]
    return result

In [None]:
# Assign every point to its nearest random centroid and preview the result.
df_new = assignment(df_new, random_centroids)
df_new.head()

In [None]:
# Cluster placement: points coloured by their nearest centroid, with the
# centroids drawn on top.
plt.figure(figsize=(5, 5))
plt.scatter(df_new.iloc[:, 0], df_new.iloc[:, 1], color=df_new['Color'], alpha=0.5, edgecolor='k')
for i in range(length):
    plt.scatter(*random_centroids[i], color=colors1[i])
# Axis labels and an explicit show(), matching the other plotting cells.
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.show()

In [None]:
# Step 3: update stage - move every centroid to the mean of its assigned points.
def update(parameter, data=None):
    """Move each centroid to the mean position of the points assigned to it.

    Parameters
    ----------
    parameter : list of [longitude, latitude] lists
        The centroids to update. They are modified in place.
    data : pandas.DataFrame, optional
        Frame with 'longitude', 'latitude' and 'Closest_Centroid' columns.
        Defaults to the notebook-level ``df_new``.

    Returns
    -------
    list
        The same ``parameter`` object, with updated coordinates.

    Notes
    -----
    A centroid with no assigned points keeps its current position; taking the
    mean of an empty selection would turn it into NaN.
    """
    if data is None:
        data = df_new  # notebook-level frame produced by assignment()

    for i, centroid in enumerate(parameter):
        members = data[data['Closest_Centroid'] == i]
        if members.empty:
            continue
        centroid[0] = members['longitude'].mean()
        centroid[1] = members['latitude'].mean()
    return parameter

In [None]:
# Last step: find the best centroid positions.
# Repeat "update the centroids -> reassign the points" until no point changes
# cluster. Without the update() call the centroids never leave their random
# starting positions and this plot merely repeats the previous one.
MAX_ITERATIONS = 100  # safety cap so the cell always terminates
for _ in range(MAX_ITERATIONS):
    previous_assignment = df_new['Closest_Centroid'].copy()
    random_centroids = update(random_centroids)
    df_new = assignment(df_new, random_centroids)
    if df_new['Closest_Centroid'].equals(previous_assignment):
        break  # converged: the assignment is stable

plt.figure(figsize=(5, 5))
plt.scatter(df_new.iloc[:, 0], df_new.iloc[:, 1], color=df_new['Color'], alpha=0.5, edgecolor='k')
for i in range(length):
    plt.scatter(*random_centroids[i], color=colors1[i])
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.show()

In [None]:
# Read the 2 features. This is the same column selection that an earlier cell
# applies to df, so df is unchanged here if that cell has already run.
df = df[["longitude","latitude"]]
df.head()

In [None]:
# Assumed value of k: 3 clusters.
# random_state pins the centroid initialisation; when it is left unset,
# scikit-learn draws from NumPy's global random generator, so the result
# depends on whatever was executed before this cell.
kmeans = KMeans(n_clusters=3, random_state=42)  # create the k-means object
kmeans.fit(df)  # fit the model; the fitted estimator is the cell output

In [None]:
# Generate the cluster label of every point with the fitted model.
labels = kmeans.predict(df)
labels

In [None]:
# Show the centroid of each cluster found by the fitted model.
centroids = kmeans.cluster_centers_
centroids

In [None]:
# Inspect the inertia of the fitted k-means model.
kmeans.inertia_

In [None]:
# Plot the K-Means result: every point coloured by its cluster label, with the
# cluster centroids drawn on top.
plt.figure(figsize=(10, 5))
colmap = {1: 'y', 2: 'g', 3: 'b', 4: 'r', 5: 'c'}  # plot colour per cluster, keyed by label + 1

# The per-point colours get their own name: `colors1` is the centroid palette
# read by the earlier plotting cells and must not be overwritten here.
point_colors = [colmap[label + 1] for label in labels]
plt.scatter(df['longitude'], df['latitude'], color=point_colors, alpha=0.5)

for idx, centroid in enumerate(centroids):
    # Black edge and a larger marker so a centroid stands out from the
    # same-coloured points of its own cluster.
    plt.scatter(*centroid, color=colmap[idx + 1], edgecolor='k', s=100)

# The plotted columns are longitude and latitude, so the axes are labelled so.
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.title('Plot ter-clusterisasi K-Means')
plt.show()