In [1]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
import plotly.graph_objects as go
from ipywidgets import interact, interact_manual
import ipywidgets as widgets
from sqlalchemy import create_engine
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd

### My KMeans algorithm

In [7]:
import math as m
def myKMeans(data,num_clusters, centroids):
    """Run ONE k-means iteration: assign points to centroids, then move the centroids.

    Call this repeatedly (feeding the returned centroids back in) to iterate
    towards convergence.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Points to cluster. Any number of feature columns is supported.
    num_clusters : int
        Number of clusters; the first ``num_clusters`` rows of ``centroids``
        are used.
    centroids : numpy.ndarray, shape (>= num_clusters, n_features)
        Current centroid positions. This array is updated IN PLACE and is also
        returned. It should have a float dtype: NumPy casts assigned means to
        the array's dtype, so an integer array would truncate them.

    Returns
    -------
    centroids : numpy.ndarray
        The same array object that was passed in, holding the updated centroids.
    clusters : numpy.ndarray of int, shape (n_samples,)
        Index of the nearest centroid for every point (ties go to the lowest
        index, as with ``argmin``).
    """
    points = np.asarray(data)

    # Nothing to assign: leave the centroids untouched and return no labels.
    if len(points) == 0:
        return centroids, np.zeros(0, dtype=np.intp)

    # Euclidean distance from every point to every centroid, shape (n_samples, num_clusters).
    diffs = points[:, np.newaxis, :] - centroids[np.newaxis, :num_clusters, :]
    distances = np.sqrt((diffs ** 2).sum(axis=2))

    # Assign each point to the centroid at minimum distance.
    clusters = distances.argmin(axis=1)

    # Move each centroid to the mean of its assigned points. Only clusters that
    # received at least one point are visited, so an empty cluster keeps its
    # previous centroid instead of becoming NaN.
    for cluster in np.unique(clusters):
        centroids[cluster, :] = points[clusters == cluster].mean(axis=0)

    return centroids, clusters

Plotting clusters using kmeans

In [8]:
def plot_clusters(data,y_res, plt_cluster_centers = False):
    """Scatter-plot the points of each cluster and, optionally, the cluster means.

    data: 2-D array; column 0 is drawn on the x axis and column 1 on the y axis.
    y_res: per-point cluster labels, compared against each label to select rows of data.
    plt_cluster_centers: when true, also draw the mean of each cluster as a red star.
    """
    centers_x, centers_y = [], []

    for cluster in set(y_res):
        # Boolean mask selecting the rows that belong to this cluster.
        members = y_res == cluster
        xs, ys = data[members, 0], data[members, 1]

        # The cluster mean is recorded for the optional centroid overlay below.
        centers_x.append(np.mean(xs))
        centers_y.append(np.mean(ys))

        plt.scatter(xs, ys, s=50, marker='s', label=f'cluster {cluster}')

    if plt_cluster_centers:
        plt.scatter(centers_x, centers_y, marker='*', c='red', s=250, label='centroids')

    plt.legend()
    plt.grid()
    plt.show()

Loading data:

In [13]:
# Read the dataset from a CSV file located next to the notebook.
scaled_csv_path = 'dataScaled.csv'
df = pd.read_csv(scaled_csv_path)

In [14]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,income,years_with_bank,nbr_children,marital_status,age,genderNum
0,-0.019996,0.473572,0.188672,-0.222860,0.117323,0.608120
1,-0.707273,-1.296386,-0.650356,0.312453,1.847837,0.967200
2,-1.115241,1.229831,-0.650356,-1.421611,-1.024885,-0.760130
3,1.490828,-0.179294,-0.650356,-1.031505,0.737388,-0.760130
4,-0.105140,0.817319,0.417239,0.332810,-0.971767,-0.760130
...,...,...,...,...,...,...
742,-1.045247,0.662244,0.104547,1.407876,-0.856232,-0.760130
743,-0.193385,0.682778,0.110611,-0.412636,-0.161207,0.480821
744,2.273419,-1.894226,1.221187,0.012333,0.462510,0.765884
745,-1.045247,1.088322,-0.650356,-0.427377,-0.856232,-0.760130


In [31]:
# Fit a PCA that keeps every component and plot the cumulative share of
# variance explained as components are added.
# The model gets its own name so that running this cell never replaces the
# 2-component `pca` that the projection and loadings cells use.
pca_full = PCA()
pca_full.fit(df)
exp_var_cumul = np.cumsum(pca_full.explained_variance_ratio_)
px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "Num Components", "y": "Explained Variance"},
    title="Cumulative explained variance by number of principal components",
)

Based on where the cumulative explained variance levels off, 4–5 components capture most of the variance, so we take 4–5 as the number of clusters to try. To view the results in 2D, however, we are going to use only the first 2 components.

In [28]:
# Project every column of df onto the first two principal components.
features = df.columns.tolist()
n_components = 2
pca = PCA(n_components=n_components)
components = pca.fit_transform(df.loc[:, features])

In [29]:
# Loadings: each component's weight vector scaled by the square root of that
# component's explained variance. Rows are features, columns are components.
component_scale = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_scale

In [30]:
# Show the loadings matrix as this cell's output.
loadings

array([[ 0.56185372, -0.06361293],
       [-0.95495648,  0.11777231],
       [ 0.36647262,  0.84330076],
       [ 0.75665066,  0.34480135],
       [ 0.6189644 , -0.5770508 ],
       [ 0.24284766, -0.26583875]])

Now, with k-means, we'll try 4 and 5 clusters to see how well the 2 PCs separate them (even though we already know that 2 PCs are not enough to capture most of the variance in the data).

In [33]:
@interact(x=(1, 30))
def getIterations(x):
    """Run ``x`` k-means iterations with 4 clusters on the PCA projection and plot the result.

    x: number of myKMeans iterations to run, chosen with the slider (1 to 30).
    """
    num_clusters = 4

    # Fixed seed: every slider position starts from the same initial centroids,
    # so moving the slider shows the progress of one run rather than a fresh
    # random run each time, and the figure is reproducible.
    rng = np.random.default_rng(0)
    start_rows = rng.choice(components.shape[0], num_clusters, replace=False)
    # Indexing with an integer array returns a copy, so `components` is not modified.
    centroids = components[start_rows, :]

    for _ in range(x):
        # Feed the returned centroids back in explicitly instead of relying on
        # myKMeans updating its argument in place.
        centroids, clusters = myKMeans(components, num_clusters, centroids)

    plot_clusters(components, clusters, plt_cluster_centers=True)

interactive(children=(IntSlider(value=15, description='x', max=30, min=1), Output()), _dom_classes=('widget-in…

In [32]:
@interact(x=(1, 30))
def getIterations(x):
    """Run ``x`` k-means iterations with 5 clusters on the PCA projection and plot the result.

    x: number of myKMeans iterations to run, chosen with the slider (1 to 30).
    """
    num_clusters = 5

    # Fixed seed: every slider position starts from the same initial centroids,
    # so moving the slider shows the progress of one run rather than a fresh
    # random run each time, and the figure is reproducible.
    rng = np.random.default_rng(0)
    start_rows = rng.choice(components.shape[0], num_clusters, replace=False)
    # Indexing with an integer array returns a copy, so `components` is not modified.
    centroids = components[start_rows, :]

    for _ in range(x):
        # Feed the returned centroids back in explicitly instead of relying on
        # myKMeans updating its argument in place.
        centroids, clusters = myKMeans(components, num_clusters, centroids)

    plot_clusters(components, clusters, plt_cluster_centers=True)

interactive(children=(IntSlider(value=15, description='x', max=30, min=1), Output()), _dom_classes=('widget-in…