# <font color='Blue'>Clustering case study data with mixed types using kmeans GOWER</font>

# <font color='Blue'>pyclustering</font>

Before you execute this code, please install pyclustering. It can be done through the following steps.

    First Run << conda update -n base -c defaults conda >> from Anaconda Shell
    Next Run << conda install -c conda-forge pyclustering >> from Anaconda Shell

    For more information you may visit
        https://anaconda.org/conda-forge/pyclustering
        https://pypi.org/project/pyclustering/

    Restart Kernel

# <font color='Blue'>Loading Libraries</font>

In [1]:
from pyclustering.cluster.kmeans import kmeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from pyclustering.utils.metric import distance_metric
from pyclustering.utils.metric import type_metric
from pyclustering.utils import calculate_distance_matrix

from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.samples.definitions import FCPS_SAMPLES
from pyclustering.utils import read_sample

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

## <font color='Blue'>1.0 Loading Data</font>

In [3]:
mktres = pd.read_csv( "mkt_res.csv" )
# Parking data in another dataframe
data = mktres
data.head(10)

Unnamed: 0,ID,Gender,Marital_Status,Work_Status,Education,Annual_Income,Age,Location,Purchasing_Decision_Maker,Purchasing_Location,Monthly_Electronics_Spend,Monthly_Household_Spend,Purchasing_Frequency,Technology_Adoption,Viewing_hours_day,Favorite_feature
0,1,male,married,professional,none,49,30,Florida,family,mass-consumer electronics,35,150,13,late,2,saving favorite shows to watch as a family
1,2,male,single,none,none,46,36,Alabama,single,mass-consumer electronics,35,163,26,late,10,saving favorite shows to watch as a family
2,3,male,married,professional,BA,58,66,Massachusetts,family,specialty stores,64,103,13,early,0,time shifting
3,4,male,married,none,PhD,51,78,New York,family,mass-consumer electronics,33,154,22,late,5,saving favorite shows to watch as a family
4,5,female,single,none,none,46,52,Montana,single,mass-consumer electronics,45,161,47,late,2,saving favorite shows to watch as a family
5,6,female,married,none,BA,31,72,New Jersey,single,retail,14,21,32,early,1,time shifting
6,7,male,married,professional,none,33,62,California,single,discount,18,40,41,early,0,cool gadget
7,8,male,married,none,none,29,30,New Hampshire,single,retail,23,75,9,early,1,schedule control
8,9,male,married,professional,none,57,60,Massachusetts,family,specialty stores,74,358,1,early,0,schedule control
9,10,female,married,professional,none,30,59,Idaho,family,discount,16,78,25,early,0,schedule control


## <font color='Blue'>1.1 Get the column names</font>

In [4]:
columns = list(data.columns) 
print(columns)
print("")

['ID', 'Gender', 'Marital_Status', 'Work_Status', 'Education', 'Annual_Income', 'Age', 'Location', 'Purchasing_Decision_Maker', 'Purchasing_Location', 'Monthly_Electronics_Spend', 'Monthly_Household_Spend', 'Purchasing_Frequency', 'Technology_Adoption', 'Viewing_hours_day', 'Favorite_feature']



## <font color='Blue'>1.2 Adding derived data</font>

In [5]:
data['Annual_Electronics_Spend'] = data['Monthly_Electronics_Spend']*12
data['Annual_Household_Spend']   = data['Monthly_Household_Spend']*12 
data['Electronic_Spend_Perc'] = (data['Annual_Electronics_Spend']/data['Annual_Household_Spend'])*100
data['Electronic_Spend_Perc'] = data['Electronic_Spend_Perc'].round(2)

# Number of raws and columns
print("#Rows and #Columns",data.shape)
print("")

#Rows and #Columns (1000, 19)



## <font color='Blue'>1.3 Drop columns not need for clustering</font>

In [6]:
data = data.drop(['ID','Location','Annual_Household_Spend','Monthly_Electronics_Spend','Monthly_Household_Spend'],axis=1)
print("#Rows and #Columns",data.shape)
print("")
columns = list(data.columns) 
print(columns)

#Rows and #Columns (1000, 14)

['Gender', 'Marital_Status', 'Work_Status', 'Education', 'Annual_Income', 'Age', 'Purchasing_Decision_Maker', 'Purchasing_Location', 'Purchasing_Frequency', 'Technology_Adoption', 'Viewing_hours_day', 'Favorite_feature', 'Annual_Electronics_Spend', 'Electronic_Spend_Perc']


## <font color='Blue'>1.4 Dummy Coding Variables</font>

In [7]:
dummy      = ['Gender', 'Marital_Status', 'Work_Status', 'Education','Purchasing_Decision_Maker','Purchasing_Location','Technology_Adoption','Favorite_feature']
dummydata  = pd.get_dummies(data, columns=dummy)
dummydata.head()

#Columns
print("#Rows and #Columns",dummydata.shape)
print("")
columns = list(dummydata.columns) 
print(columns)

#Rows and #Columns (1000, 30)

['Annual_Income', 'Age', 'Purchasing_Frequency', 'Viewing_hours_day', 'Annual_Electronics_Spend', 'Electronic_Spend_Perc', 'Gender_female', 'Gender_male', 'Marital_Status_married', 'Marital_Status_single', 'Work_Status_none', 'Work_Status_professional', 'Education_BA', 'Education_MA', 'Education_PhD', 'Education_none', 'Purchasing_Decision_Maker_family', 'Purchasing_Decision_Maker_single', 'Purchasing_Location_discount', 'Purchasing_Location_mass-consumer electronics', 'Purchasing_Location_retail', 'Purchasing_Location_specialty stores', 'Purchasing_Location_web (ebay)', 'Technology_Adoption_early', 'Technology_Adoption_late', 'Favorite_feature_cool gadget', 'Favorite_feature_programming/interactive features', 'Favorite_feature_saving favorite shows to watch as a family', 'Favorite_feature_schedule control', 'Favorite_feature_time shifting']


## <font color='Blue'>1.5 Examining Data</font>

In [8]:
dummydata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 30 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Annual_Income                                                1000 non-null   int64  
 1   Age                                                          1000 non-null   int64  
 2   Purchasing_Frequency                                         1000 non-null   int64  
 3   Viewing_hours_day                                            1000 non-null   int64  
 4   Annual_Electronics_Spend                                     1000 non-null   int64  
 5   Electronic_Spend_Perc                                        1000 non-null   float64
 6   Gender_female                                                1000 non-null   uint8  
 7   Gender_male                                                  1000 non-null   ui

In [9]:
dummydata.head()

Unnamed: 0,Annual_Income,Age,Purchasing_Frequency,Viewing_hours_day,Annual_Electronics_Spend,Electronic_Spend_Perc,Gender_female,Gender_male,Marital_Status_married,Marital_Status_single,...,Purchasing_Location_retail,Purchasing_Location_specialty stores,Purchasing_Location_web (ebay),Technology_Adoption_early,Technology_Adoption_late,Favorite_feature_cool gadget,Favorite_feature_programming/interactive features,Favorite_feature_saving favorite shows to watch as a family,Favorite_feature_schedule control,Favorite_feature_time shifting
0,49,30,13,2,420,23.33,0,1,1,0,...,0,0,0,0,1,0,0,1,0,0
1,46,36,26,10,420,21.47,0,1,0,1,...,0,0,0,0,1,0,0,1,0,0
2,58,66,13,0,768,62.14,0,1,1,0,...,0,1,0,1,0,0,0,0,0,1
3,51,78,22,5,396,21.43,0,1,1,0,...,0,0,0,0,1,0,0,1,0,0
4,46,52,47,2,540,27.95,1,0,0,1,...,0,0,0,0,1,0,0,1,0,0


## <font color='Blue'>1.6 Normalizing Non-Categorical Variables</font>

In [10]:
scaler = StandardScaler()
num_var = ["Annual_Income","Age","Purchasing_Frequency","Viewing_hours_day","Annual_Electronics_Spend","Electronic_Spend_Perc"]
sample_n = scaler.fit_transform(dummydata[num_var])
sample_n = sample_n.round(2)
sample_n = sample_n.tolist()
print(type(sample_n))
print(sample_n)

<class 'list'>
[[0.32, -1.03, -0.72, -0.13, 0.24, -0.92], [0.23, -0.69, 0.22, 2.62, 0.24, -1.03], [0.61, 0.99, -0.72, -0.81, 1.94, 1.39], [0.39, 1.66, -0.07, 0.9, 0.12, -1.03], [0.23, 0.2, 1.73, -0.13, 0.82, -0.64], [-0.26, 1.32, 0.65, -0.47, -1.0, 1.65], [-0.19, 0.76, 1.3, -0.81, -0.76, 0.37], [-0.32, -1.03, -1.01, -0.47, -0.47, -0.48], [0.58, 0.65, -1.58, -0.81, 2.53, -1.07], [-0.29, 0.6, 0.15, -0.81, -0.88, -1.08], [-0.32, -1.31, -1.51, 1.25, 0.41, 1.12], [0.32, 0.48, -0.22, 0.9, 0.88, -1.09], [0.81, 1.77, -0.94, -0.47, 1.29, -1.08], [-0.39, -0.69, -1.08, -0.13, -0.88, -0.32], [-0.39, 0.71, 1.15, -0.47, -1.0, -0.92], [-0.19, 0.32, -0.5, -0.81, -1.0, 1.16], [-0.26, 0.54, -0.65, -0.47, -0.88, -0.14], [-0.36, -1.03, -1.08, -0.13, -1.0, -1.04], [-0.32, -1.31, -0.86, 0.9, -0.41, -0.89], [-0.36, 0.54, -0.5, -0.81, -0.82, -1.08], [0.35, 1.49, -0.43, 1.94, 0.94, -0.68], [-0.26, -0.08, -0.58, -0.47, -0.88, 2.44], [0.32, 0.2, 1.01, 0.9, 0.41, -0.96], [0.29, -0.75, 1.44, 1.94, 1.18, -1.05], [-

## <font color='Blue'>1.7 Retaining Categorical Variables As Such</font>

In [11]:
sample_c = dummydata
sample_c = sample_c.drop(num_var,axis=1)
print("#Rows and #Columns",sample_c.shape)
sample_c = sample_c.values.tolist()
print(type(sample_c))
sample_c

#Rows and #Columns (1000, 24)
<class 'list'>


[[0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
 [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
 [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
 [0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
 [1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
 [1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1],
 [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
 [0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
 [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
 [1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
 [1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
 [1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
 [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
 [1, 0, 1, 0, 1, 0, 0, 0,

## <font color='Blue'>1.8 Combining Variables</font>

In [12]:
sample = np.concatenate((sample_n,sample_c),axis=1)
sample = sample.tolist()
print(type(sample))
print(len(sample))
print(sample)

<class 'list'>
1000
[[0.32, -1.03, -0.72, -0.13, 0.24, -0.92, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.23, -0.69, 0.22, 2.62, 0.24, -1.03, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.61, 0.99, -0.72, -0.81, 1.94, 1.39, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], [0.39, 1.66, -0.07, 0.9, 0.12, -1.03, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.23, 0.2, 1.73, -0.13, 0.82, -0.64, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [-0.26, 1.32, 0.65, -0.47, -1.0, 1.65, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], [-0.19, 0.76, 1.3, 

## <font color='Blue'>2.0 Executing Clustering</font>

## <font color='Blue'>2.1 Prepare initial centers using K-Means++ method</font>

In [13]:
initial_centers = kmeans_plusplus_initializer(sample, 4).initialize()

## <font color='Blue'>2.2 create metric that will be used for clustering</font>

In [14]:
gower_metric = distance_metric(type_metric.GOWER,data=sample)

## <font color='Blue'>2.3 create instance of K-Means using specific distance metric</font>

In [15]:
kmeans_instance = kmeans(sample, initial_centers, metric=gower_metric)

## <font color='Blue'>2.4 Run cluster analysis and obtain results</font>

In [16]:
kmeans_instance.process()
clusters = kmeans_instance.get_clusters()

## <font color='Blue'>2.5 Show Allocated Clusters</font>

In [17]:
print(clusters)

[[10, 18, 21, 24, 30, 38, 40, 48, 53, 55, 56, 64, 65, 66, 76, 79, 88, 93, 97, 100, 111, 122, 131, 145, 148, 155, 156, 160, 173, 179, 184, 185, 190, 192, 193, 210, 218, 219, 227, 230, 237, 244, 250, 253, 255, 260, 261, 268, 293, 294, 295, 296, 299, 304, 308, 309, 318, 319, 322, 330, 334, 335, 336, 341, 346, 348, 349, 352, 362, 364, 375, 376, 378, 379, 382, 384, 390, 394, 401, 405, 406, 409, 410, 416, 425, 426, 428, 432, 447, 448, 452, 456, 461, 467, 470, 477, 488, 495, 500, 506, 510, 514, 517, 523, 524, 525, 528, 530, 531, 544, 550, 551, 555, 556, 558, 564, 566, 570, 576, 591, 594, 602, 608, 621, 628, 629, 631, 635, 640, 642, 654, 657, 678, 690, 695, 701, 707, 712, 717, 719, 728, 736, 737, 743, 745, 760, 761, 772, 784, 789, 790, 794, 796, 797, 799, 801, 805, 806, 807, 809, 813, 832, 833, 834, 836, 840, 841, 845, 857, 865, 866, 873, 874, 881, 884, 887, 889, 892, 896, 899, 901, 902, 905, 908, 911, 917, 921, 926, 931, 945, 950, 955, 958, 960, 966, 968, 978, 987, 992], [7, 8, 9, 13, 25, 27,

## <font color='Blue'>2.6 Adding the cluster lables to dataframe df for analysis</font>

In [18]:
df=data
df['clusterid'] = ''
for x in df.index.values:
    if x in clusters[0]:
       df['clusterid'][x] = 0
    elif x in clusters[1]:
       df['clusterid'][x] = 1
    elif x in clusters[2]:
       df['clusterid'][x] = 2
    else: 
       df['clusterid'][x] = 3

cluster_size = df.groupby(['clusterid']).size() 
print(cluster_size)

clusterid
0    199
1    207
2    394
3    200
dtype: int64


## <font color='Blue'>2.7 Performance Measure: Silhouette Score</font>

In [19]:
print("Silhouette Coefficient: %0.3f"% metrics.silhouette_score(sample, df['clusterid']))
# Silhouette score between -1 and 1

Silhouette Coefficient: 0.113


## <font color='Blue'>2.8 Performance Measure: Calinski-Harabasz</font>

In [20]:
print("Calinski-Harabasz index: %0.3f"% metrics.calinski_harabasz_score(sample, df['clusterid']))

Calinski-Harabasz index: 101.974


## <font color='Blue'>3.0 Examining Chararcteristics</font>

In [21]:
values=['Annual_Income','Age','Purchasing_Frequency','Viewing_hours_day','Annual_Electronics_Spend','Electronic_Spend_Perc']
index =['clusterid']
aggfunc={'Annual_Income': np.mean,
         'Age': np.mean,
         'Purchasing_Frequency':np.mean,
         'Viewing_hours_day':np.mean,
         'Annual_Electronics_Spend':np.mean,
         'Electronic_Spend_Perc':np.mean}
result = pd.pivot_table(data,values=values,
                             index =index,
                             aggfunc=aggfunc,
                             fill_value=0)
result['cluster_size'] = cluster_size
result = result.round(2)
result

Unnamed: 0_level_0,Age,Annual_Electronics_Spend,Annual_Income,Electronic_Spend_Perc,Purchasing_Frequency,Viewing_hours_day,cluster_size
clusterid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,33.78,434.83,34.9,40.37,21.14,2.4,199
1,52.63,307.19,38.84,38.31,20.25,1.06,207
2,50.83,309.78,36.56,38.46,21.83,1.04,394
3,53.5,496.86,48.1,38.29,29.94,6.32,200


## <font color='Blue'>3.1 Examining Chararcteristics - Cont'd </font>

In [22]:
dummydata['clusterid'] = df['clusterid']
values=['Gender_female','Gender_male','Marital_Status_married','Marital_Status_single','Work_Status_none',
        'Work_Status_professional']
index =['clusterid']
aggfunc={'Gender_female': np.mean,
         'Gender_male': np.mean,
         'Marital_Status_married':np.mean,
         'Marital_Status_single':np.mean,
         'Work_Status_none':np.mean,
         'Work_Status_professional':np.mean}
result = pd.pivot_table(dummydata,values=values,
                             index =index,
                             aggfunc=aggfunc,
                             fill_value=0)
result['cluster_size'] = cluster_size
result = result.round(2)
result

Unnamed: 0_level_0,Gender_female,Gender_male,Marital_Status_married,Marital_Status_single,Work_Status_none,Work_Status_professional,cluster_size
clusterid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.53,0.47,0.05,0.95,0.06,0.94,199
1,0.44,0.56,0.83,0.17,0.39,0.61,207
2,0.41,0.59,0.91,0.09,0.4,0.6,394
3,0.53,0.47,0.9,0.1,0.5,0.5,200


## <font color='Blue'>3.2 Examining Chararcteristics - Cont'd </font>

In [23]:
dummydata['clusterid'] = df['clusterid']
values=['Education_BA','Education_MA','Education_PhD','Education_none','Purchasing_Decision_Maker_family',
        'Purchasing_Decision_Maker_single']
index =['clusterid']
aggfunc={'Education_BA': np.mean,
         'Education_MA': np.mean,
         'Education_PhD':np.mean,
         'Education_none':np.mean,
         'Purchasing_Decision_Maker_family':np.mean,
         'Purchasing_Decision_Maker_single':np.mean}
result = pd.pivot_table(dummydata,values=values,
                             index =index,
                             aggfunc=aggfunc,
                             fill_value=0)
result['cluster_size'] = cluster_size
result = result.round(2)
result

Unnamed: 0_level_0,Education_BA,Education_MA,Education_PhD,Education_none,Purchasing_Decision_Maker_family,Purchasing_Decision_Maker_single,cluster_size
clusterid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.31,0.21,0.19,0.3,0.02,0.98,199
1,0.24,0.05,0.02,0.69,0.63,0.37,207
2,0.25,0.06,0.05,0.65,0.63,0.37,394
3,0.25,0.25,0.25,0.25,0.9,0.1,200


## <font color='Blue'>3.3 Examining Chararcteristics - Cont'd </font>

In [24]:
dummydata['clusterid'] = df['clusterid']
values=['Technology_Adoption_early','Technology_Adoption_late']
index =['clusterid']
aggfunc={'Technology_Adoption_early': np.mean,
         'Technology_Adoption_late': np.mean}
result = pd.pivot_table(dummydata,values=values,
                             index =index,
                             aggfunc=aggfunc,
                             fill_value=0)
result['cluster_size'] = cluster_size
result = result.round(2)
result

Unnamed: 0_level_0,Technology_Adoption_early,Technology_Adoption_late,cluster_size
clusterid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,199
1,1,0,207
2,1,0,394
3,0,1,200


## <font color='Blue'>3.4 Examining Chararcteristics - Cont'd </font>

In [25]:
dummydata['clusterid'] = df['clusterid']
values=['Favorite_feature_cool gadget','Favorite_feature_programming/interactive features','Favorite_feature_saving favorite shows to watch as a family',
        'Favorite_feature_schedule control','Favorite_feature_time shifting']
index =['clusterid']
aggfunc={'Favorite_feature_cool gadget': np.mean,
         'Favorite_feature_programming/interactive features': np.mean,
         'Favorite_feature_saving favorite shows to watch as a family':np.mean,
         'Favorite_feature_schedule control':np.mean,
         'Favorite_feature_time shifting':np.mean}
result = pd.pivot_table(dummydata,values=values,
                             index =index,
                             aggfunc=aggfunc,
                             fill_value=0)
result['cluster_size'] = cluster_size
result = result.round(2)
result

Unnamed: 0_level_0,Favorite_feature_cool gadget,Favorite_feature_programming/interactive features,Favorite_feature_saving favorite shows to watch as a family,Favorite_feature_schedule control,Favorite_feature_time shifting,cluster_size
clusterid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.12,0.65,0,0.07,0.16,199
1,0.0,0.0,0,1.0,0.0,207
2,0.52,0.0,0,0.0,0.48,394
3,0.0,0.0,1,0.0,0.0,200
