Clustering for mixed data types - source tutorial: https://www.youtube.com/watch?v=8eATPLDJ0NQ

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import sklearn

from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import scale

import kmodes
from kmodes.kprototypes import KPrototypes

import sklearn.metrics as sm
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report

In [39]:
# Download the marketing dataset used by the tutorial straight from GitHub.
marketing_df = pd.read_csv(
    'https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/marketing_cva_f.csv'
)

# Keep a local snapshot of the raw download.
# NOTE(review): with pandas' default index=True the row index is written as an
# extra leading column; pass index=False if the file should round-trip cleanly.
marketing_df.to_csv(path_or_buf='clustering tutorial_data.csv')

In [10]:
# Discard the identifier and the features that are not used for clustering.
# The result is bound back to the same name, so running this cell a second
# time on the already-trimmed frame raises KeyError (columns no longer exist).
marketing_df = marketing_df.drop(
    columns=[
        'Customer',
        'Vehicle_Class',
        'avg_vehicle_age',
        'months_last_claim',
        'Total_Claim_Amount',
    ]
)

In [11]:
# Inspect the frame that remains after the column drop.
marketing_df

Unnamed: 0,State,CLV,Coverage,Income,loc_type,monthly_premium,Months_Since_Policy_Inception
0,Washington,2763.519279,Basic,56274,Suburban,69,5
1,Nevada,12887.431650,Premium,48767,Suburban,108,38
2,Washington,2813.692575,Basic,43836,Rural,73,44
3,Oregon,8256.297800,Basic,62902,Rural,69,94
4,Oregon,5380.898636,Basic,55350,Suburban,67,13
...,...,...,...,...,...,...,...
6812,California,5032.165498,Basic,66367,Suburban,64,48
6813,California,4100.398533,Premium,47761,Suburban,104,58
6814,California,23405.987980,Basic,71941,Urban,73,89
6815,California,3096.511217,Extended,21604,Suburban,79,28


In [12]:
# Convert the mixed-type frame to a NumPy array for K-Prototypes; because the
# columns mix strings and numbers the resulting array has dtype=object.
mark_array = marketing_df.to_numpy()

In [13]:
# Peek at the object array before the numeric columns are cast to float.
mark_array

array([['Washington', 2763.519279, 'Basic', ..., 'Suburban', 69, 5],
       ['Nevada', 12887.43165, 'Premium', ..., 'Suburban', 108, 38],
       ['Washington', 2813.692575, 'Basic', ..., 'Rural', 73, 44],
       ...,
       ['California', 23405.98798, 'Basic', ..., 'Urban', 73, 89],
       ['California', 3096.511217, 'Extended', ..., 'Suburban', 79, 28],
       ['California', 7524.442436, 'Extended', ..., 'Suburban', 96, 3]],
      dtype=object)

In [14]:
# Cast the numeric columns (positions 1, 3, 5 and 6) to float in one
# fancy-indexed assignment. The string columns at positions 0, 2 and 4 are
# left untouched, and the array itself stays dtype=object.
mark_array[:, [1, 3, 5, 6]] = mark_array[:, [1, 3, 5, 6]].astype(float)

In [15]:
# Re-display the array to confirm that columns 1, 3, 5 and 6 now hold floats.
mark_array

array([['Washington', 2763.519279, 'Basic', ..., 'Suburban', 69.0, 5.0],
       ['Nevada', 12887.43165, 'Premium', ..., 'Suburban', 108.0, 38.0],
       ['Washington', 2813.692575, 'Basic', ..., 'Rural', 73.0, 44.0],
       ...,
       ['California', 23405.98798, 'Basic', ..., 'Urban', 73.0, 89.0],
       ['California', 3096.511217, 'Extended', ..., 'Suburban', 79.0,
        28.0],
       ['California', 7524.442436, 'Extended', ..., 'Suburban', 96.0,
        3.0]], dtype=object)

In [17]:
# Fit a 3-cluster K-Prototypes model on the mixed-type array.
# - random_state pins the random centroid initialisation so that cluster
#   labels and centroids are reproducible across "Restart & Run All".
# - verbose=2 prints the per-iteration moves/cost log for every init run.
# - max_iter caps each run at 20 iterations.
kproto = KPrototypes(n_clusters = 3, verbose = 2, max_iter = 20, random_state = 42)

# Positions 0, 2 and 4 of mark_array are the string-valued (categorical)
# columns; every other column is treated as numeric.
# NOTE(review): the numeric columns are passed unscaled, so the column with the
# largest magnitude probably dominates the distance -- consider standardising
# the numeric features before fitting; confirm against the resulting centroids.
clusters = kproto.fit_predict(mark_array, categorical = [0, 2, 4])

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/20, moves: 294, ncost: 786191417664.713
Run: 1, iteration: 2/20, moves: 58, ncost: 785662723324.5215
Run: 1, iteration: 3/20, moves: 27, ncost: 785566620155.1418
Run: 1, iteration: 4/20, moves: 10, ncost: 785551468522.3026
Run: 1, iteration: 5/20, moves: 6, ncost: 785546422486.777
Run: 1, iteration: 6/20, moves: 2, ncost: 785545634415.1552
Run: 1, iteration: 7/20, moves: 0, ncost: 785545634415.1552
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/20, moves: 373, ncost: 792947402881.1442
Run: 2, iteration: 2/20, moves: 191, ncost: 787645043356.4779
Run: 2, iteration: 3/20, moves: 86, ncost: 786447784903.568
Run: 2, iteration: 4/20, moves: 59, ncost: 785945968414.2338
Run: 2, iteration: 5/20, moves: 45, ncost: 785708436066.9501
Run: 2, iteration: 6/20, moves: 20, ncost: 785655117685.776
Run: 2, iteration: 7/20, moves: 15, ncost: 78561396

In [18]:
# Show the fitted cluster prototypes (one row per cluster).
print(kproto.cluster_centroids_)

[array([[8.04338174e+03, 8.29341174e+04, 9.09228225e+01, 4.76527012e+01],
       [8.33216831e+03, 5.46449257e+04, 9.51337668e+01, 4.91597771e+01],
       [8.03295458e+03, 2.66208754e+04, 9.26217544e+01, 4.69445614e+01]]), array([['California', 'Basic', 'Rural'],
       ['California', 'Basic', 'Suburban'],
       ['California', 'Basic', 'Suburban']], dtype='<U10')]


In [20]:
# Number of cluster labels returned by fit_predict.
len(clusters)

6817

In [21]:
# Number of rows in the frame; it should equal the number of cluster labels so
# that the labels can be attached as a new column.
len(marketing_df)

6817

In [23]:
# Materialise the cluster labels as a plain Python list, one entry per row,
# keeping each element exactly as it comes out of `clusters`.
clusters_list = [label for label in clusters]

In [25]:
# Attach the cluster label of every row as a new `cluster` column.
marketing_df = marketing_df.assign(cluster=clusters_list)

In [36]:
# Rows assigned to cluster 0.
marketing_df.loc[marketing_df['cluster'].eq(0)]

Unnamed: 0,State,CLV,Coverage,Income,loc_type,monthly_premium,Months_Since_Policy_Inception,cluster
7,California,8798.797003,Premium,77026,Urban,110,82,0
8,Arizona,8819.018934,Basic,99845,Suburban,110,25,0
9,California,5384.431665,Basic,83689,Urban,70,10,0
19,Oregon,5802.065978,Basic,97541,Suburban,72,1,0
21,Arizona,12902.560140,Premium,86584,Suburban,111,54,0
...,...,...,...,...,...,...,...,...
6790,California,5926.385440,Basic,92949,Urban,74,84,0
6792,California,7083.642205,Premium,97024,Urban,177,68,0
6797,California,2619.337376,Basic,78618,Urban,66,56,0
6802,California,2845.520933,Basic,86631,Suburban,73,44,0


In [37]:
# Rows assigned to cluster 1.
marketing_df.loc[marketing_df['cluster'].eq(1)]

Unnamed: 0,State,CLV,Coverage,Income,loc_type,monthly_premium,Months_Since_Policy_Inception,cluster
0,Washington,2763.519279,Basic,56274,Suburban,69,5,1
1,Nevada,12887.431650,Premium,48767,Suburban,108,38,1
2,Washington,2813.692575,Basic,43836,Rural,73,44,1
3,Oregon,8256.297800,Basic,62902,Rural,69,94,1
4,Oregon,5380.898636,Basic,55350,Suburban,67,13,1
...,...,...,...,...,...,...,...,...
6808,California,9424.256842,Basic,46897,Urban,118,11,1
6809,California,5479.555081,Basic,56005,Suburban,68,30,1
6811,California,16261.585500,Extended,60646,Suburban,134,42,1
6812,California,5032.165498,Basic,66367,Suburban,64,48,1


In [38]:
# Rows assigned to cluster 2.
marketing_df.loc[marketing_df['cluster'].eq(2)]

Unnamed: 0,State,CLV,Coverage,Income,loc_type,monthly_premium,Months_Since_Policy_Inception,cluster
5,Oregon,24127.504020,Basic,14072,Suburban,71,3,2
6,Oregon,7388.178085,Extended,28812,Urban,93,7,2
10,Oregon,7463.139377,Basic,24599,Rural,64,50,2
11,Nevada,2566.867823,Basic,25049,Suburban,67,7,2
12,California,3945.241604,Basic,28855,Suburban,101,59,2
...,...,...,...,...,...,...,...,...
6805,California,7501.661322,Extended,38874,Urban,94,86,2
6806,California,5133.397765,Basic,28647,Suburban,69,59,2
6810,California,25464.820590,Extended,13663,Suburban,97,66,2
6815,California,3096.511217,Extended,21604,Suburban,79,28,2
