In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [2]:
customersdata = pd.read_csv("customers-data.csv")

In [3]:
customersdata.head()

Unnamed: 0,customer_id,products_purchased,complains,money_spent
0,1000661,1,0,260.0
1,1001914,1,0,790.2
2,1002167,3,0,234.2
3,1002387,1,0,890.0
4,1002419,2,0,103.0


In [4]:
customersdata.describe()

Unnamed: 0,customer_id,products_purchased,complains,money_spent
count,24000.0,24000.0,24000.0,24000.0
mean,1001908.0,1.6,0.01,456.83
std,648.9202,0.800017,0.117901,320.704042
min,1000660.0,1.0,0.0,102.0
25%,1001913.0,1.0,0.0,233.7
50%,1002166.0,1.0,0.0,260.5
75%,1002387.0,2.0,0.0,793.2
max,1002419.0,3.0,2.0,894.0


In [5]:
# Define K-means model
kmeans_model = KMeans(init='k-means++',  max_iter=400, random_state=42)

In [6]:
# Train the model
kmeans_model.fit(customersdata[['products_purchased','complains',
'money_spent']])



In [7]:
# Create the K means model for different values of K
def try_different_clusters(K, data):

    cluster_values = list(range(1, K+1))
    inertias=[]

    for c in cluster_values:
        model = KMeans(n_clusters = c,init='k-means++',max_iter=400,random_state=42)
        model.fit(data)
        inertias.append(model.inertia_)

    return inertias

In [8]:
# Find output for k values between 1 to 12
outputs = try_different_clusters(12, customersdata[['products_purchased','complains','money_spent']])
distances = pd.DataFrame({"clusters": list(range(1, 13)),"sum of squared distances": outputs})



In [9]:
# Finding optimal number of clusters k
figure = go.Figure()
figure.add_trace(go.Scatter(x=distances["clusters"], y=distances["sum of squared distances"]))

figure.update_layout(xaxis = dict(tick0 = 1,dtick = 1,tickmode = 'linear'),
                  xaxis_title="Number of clusters",
                  yaxis_title="Sum of squared distances",
                  title_text="Finding optimal number of clusters using elbow method")
figure.show()

In [12]:
# Re-Train K means model with k=5
kmeans_model_new = KMeans(n_clusters = 5,init='k-means++',max_iter=400,random_state=42)

kmeans_model_new.fit_predict(customersdata[['products_purchased','complains','money_spent']])





array([3, 1, 4, ..., 4, 2, 0], dtype=int32)

In [13]:
# Create data arrays
cluster_centers = kmeans_model_new.cluster_centers_
data = np.expm1(cluster_centers)
points = np.append(data, cluster_centers, axis=1)
points


overflow encountered in expm1



array([[ 6.38905610e+000,  5.01252086e-003,  1.14301649e+045,
         2.00000000e+000,  5.00000000e-003,  1.03750000e+002],
       [ 1.71828183e+000,  1.00501671e-002,              inf,
         1.00000000e+000,  1.00000000e-002,  7.92700000e+002],
       [ 1.71828183e+000,  2.02013400e-002,              inf,
         1.00000000e+000,  2.00000000e-002,  8.91750000e+002],
       [ 1.71828183e+000,  1.51130646e-002,  2.24315755e+113,
         1.00000000e+000,  1.50000000e-002,  2.61000000e+002],
       [ 1.90855369e+001, -3.97251676e-016,  1.09015568e+102,
         3.00000000e+000, -3.97251676e-016,  2.34950000e+002]])

In [14]:
# Add "clusters" to customers data
points = np.append(points, [[0], [1], [2], [3], [4]], axis=1)
customersdata["clusters"] = kmeans_model_new.labels_

In [15]:
customersdata.head()

Unnamed: 0,customer_id,products_purchased,complains,money_spent,clusters
0,1000661,1,0,260.0,3
1,1001914,1,0,790.2,1
2,1002167,3,0,234.2,4
3,1002387,1,0,890.0,2
4,1002419,2,0,103.0,0


In [17]:
# visualize clusters
figure = px.scatter_3d(customersdata,
                    color='clusters',
                    x="products_purchased",
                    y="complains",
                    z="money_spent",
                    category_orders = {"clusters": ["0", "1", "2", "3", "4"]}
                    )
figure.update_layout()
figure.show()