In [136]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

In [147]:
test_df = pd.read_pickle('test_df.pkl')

In [148]:
test_df.head()

Unnamed: 0,is_customizable,materials,num_favorers,occasion,processing_max,processing_min,quantity,tags,taxonomy_path,views,when_made,who_made,price_usd
151842,0,"bezels, nickel free bezels, Americana cabochon...",8.0,,3.0,1.0,4.0,"Fourth of July studs, patriotic accessorie, pa...","Jewelry, Earrings, Stud Earrings",514.0,standard,i_did,8.0
261700,0,"turquoise, sterling silver",11.0,,1.0,1.0,1.0,,"Jewelry, Rings, Solitaire Rings",34.0,vintage,someone_else,49.75
264540,0,Stainless steel,8.0,,1.0,1.0,1.0,"Over Lap Tear Drop, Stainless Tear Drops, VTG ...","Jewelry, Rings, Statement Rings",49.0,vintage,someone_else,24.99
144297,1,"Silver, Polyester, Brass",5.0,,2.0,1.0,20.0,"bunny brooch, discobunny, rabbit brooch, disco...","Jewelry, Brooches, Pins & Clips, Brooches",25.0,standard,i_did,6.027
252974,1,"sterling silver, silk",46.0,,10.0,5.0,2.0,"necklace, pendant, sterling silver, silver, bu...","Jewelry, Necklaces, Pendants",359.0,made_to_order,i_did,67.32


### One Hot Encode for Materials

In [149]:
#Silver - silver, sterling silver, 
#Gold
#Glass
#Diamond
#Stones - stone, gemstone, turquoise,quartz,amethyst,emrald,ruby
#pearl


test_df['contains_silver'] = test_df.materials.str.contains('silver', case = False, regex=False).apply(lambda x:1 if x else 0)
test_df['contains_gold'] = test_df.materials.str.contains('gold', case = False, regex=False).apply(lambda x:1 if x else 0)
test_df['contains_glass'] = test_df.materials.str.contains('glass', case = False, regex=False).apply(lambda x:1 if x else 0)
test_df['contains_diamond'] = test_df.materials.str.contains('diamond', case = False, regex=False).apply(lambda x:1 if x else 0)
test_df['contains_pearl'] = test_df.materials.str.contains('pearl', case = False, regex=False).apply(lambda x:1 if x else 0)
test_df['contains_stone'] = \
(test_df.materials.str.contains('quartz', case = False, regex=False) | \
 test_df.materials.str.contains('ruby', case = False, regex=False) | \
 test_df.materials.str.contains('emrald', case = False, regex=False) | \
 test_df.materials.str.contains('turquoise', case = False, regex=False) | \
 test_df.materials.str.contains('amethyst', case = False, regex=False) | \
 test_df.materials.str.contains('stone', case = False, regex=False) | \
 test_df.materials.str.contains('gemstone', case = False, regex=False)
).apply(lambda x:1 if x else 0)


In [151]:
#Can drop materials column now

test_df = test_df.drop(columns=['materials'])

In [156]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62209 entries, 151842 to 15653
Data columns (total 18 columns):
is_customizable     62209 non-null int64
num_favorers        62209 non-null float64
occasion            62209 non-null object
processing_max      62209 non-null float64
processing_min      62209 non-null float64
quantity            62209 non-null float64
tags                62209 non-null object
taxonomy_path       62209 non-null object
views               62209 non-null float64
when_made           62209 non-null object
who_made            62209 non-null object
price_usd           62209 non-null float64
contains_silver     62209 non-null int64
contains_gold       62209 non-null int64
contains_glass      62209 non-null int64
contains_diamond    62209 non-null int64
contains_pearl      62209 non-null int64
contains_stone      62209 non-null int64
dtypes: float64(6), int64(7), object(5)
memory usage: 9.0+ MB


### One Hot Encode - Occasion

In [153]:
test_df.occasion.value_counts()

None                  59819
wedding                 509
birthday                452
engagement              402
anniversary             208
christmas               208
valentines              124
mothers_day             122
get_well                 61
sympathy                 49
new_baby                 35
graduation               28
halloween                27
easter                   20
new_years                19
fathers_day              18
sweet_16                 18
chinese_new_year         15
day_of_the_dead          15
july_4th                  9
prom                      7
baptism                   7
st_patricks_day           5
confirmation              5
canada_day                4
eid                       4
thanksgiving              4
hanukkah                  3
cinco_de_mayo             3
quinceanera               3
housewarming              2
kwanzaa                   2
retirement                1
bar_or_bat_mitzvah        1
Name: occasion, dtype: int64

In [157]:
occasion_df = pd.get_dummies(test_df.occasion)

In [158]:
occasion_df = occasion_df.drop(columns=['None', 'baptism', 'bar_or_bat_mitzvah',
       'canada_day', 'chinese_new_year', 'cinco_de_mayo','confirmation', 'day_of_the_dead', 
        'easter', 'eid', 'fathers_day', 'get_well', 'graduation', 'halloween', 'hanukkah',
       'housewarming', 'july_4th', 'kwanzaa', 'new_baby',
       'new_years', 'prom', 'quinceanera', 'retirement', 'st_patricks_day',
       'sweet_16', 'sympathy', 'thanksgiving'])

In [159]:
# occasion_df.info()
# test_df_copy = test_df.copy()

test_df = pd.merge(test_df, occasion_df, left_index = True, right_index=True)


In [160]:
#can drop 'occasion' column

test_df = test_df.drop(columns=['occasion'])

### One Hot Encode - Tags Column ??? (DROP for now)

In [162]:
# test_df.tags.sample(20)
test_df = test_df.drop(columns=['tags'])

In [163]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62209 entries, 151842 to 15653
Data columns (total 23 columns):
is_customizable     62209 non-null int64
num_favorers        62209 non-null float64
processing_max      62209 non-null float64
processing_min      62209 non-null float64
quantity            62209 non-null float64
taxonomy_path       62209 non-null object
views               62209 non-null float64
when_made           62209 non-null object
who_made            62209 non-null object
price_usd           62209 non-null float64
contains_silver     62209 non-null int64
contains_gold       62209 non-null int64
contains_glass      62209 non-null int64
contains_diamond    62209 non-null int64
contains_pearl      62209 non-null int64
contains_stone      62209 non-null int64
anniversary         62209 non-null uint8
birthday            62209 non-null uint8
christmas           62209 non-null uint8
engagement          62209 non-null uint8
mothers_day         62209 non-null uint8
valentines

### One Hot Encode - Taxonomy

In [165]:
# test_df.taxonomy_path.value_counts()

taxonomy_df = pd.DataFrame(test_df.taxonomy_path.copy())

In [166]:
taxonomy_df = taxonomy_df.taxonomy_path.str.replace(', ',',').str.get_dummies(sep=',')

In [170]:
taxonomy_df = taxonomy_df.drop(columns=['Art & Collectibles', 'Bands', 'Barbells',
       'Beaded Necklaces', 'Beads', 'Belly Rings','Bib Necklaces','Cabochons', 'Cameo Necklaces',
       'Chain & Link Bracelets', 'Chains', 'Charms', 'Charms & Pendants', 'Claddagh Rings',
       'Cluster Earrings', 'Coins & Money', 'Collectible Glass',
       'Craft Supplies & Tools','Ear Climbers', 'Ear Jackets', 'Ear Jackets & Climbers', 'Ear Weights',
       'Engagement Rings', 'Findings', 'Fraternal & Class Rings',
       'Friendship Bracelets', 'Gauge & Plug Earrings', 'Gems & Cabochons',
       'Hoop Earrings', 'ID & Medical Bracelets', 'Jewelry Sets', 'Keychains',
       'Keychains & Lanyards', 'Lanyards & Badge Holders',
       'Lariat & Y Necklaces', 'Lockets', 'Midi Rings','Collectibles','Clasps & Clips','Cuff & Wrap Earrings',
       'Monogram & Name Necklaces', 'Multi Strand Necklaces','Pendants',
       'Multistone Rings', 'Nipple Jewelry', 'Nose Rings & Studs',
       'Paper & Party Supplies', 'Party Favors', 'Party Favors & Games',
       'Party Supplies', 'Patterns & Blueprints', 'Patterns & How To',
       'Pet Collars & Leashes', 'Pet ID Tags', 'Pet Supplies', 'Pins & Clips',
       'Promise Rings', 'Ring Guards & Spacers', 'Screw Back Earrings',
       'Signet Rings', 'Stackable Rings', 'Statement Rings',
       'Tassel Necklaces', 'Threader Earrings',
       'Triplet & Double Rings', 'Tutorials', 'Wedding & Engagement','Accessories','Anniversary Rings',
        'Jewelry','Body Jewelry',
       'Wedding Bands', 'Woven & Braided Bracelets'])

In [171]:
taxonomy_df.sum().sort_values()

Bridal Sets                 154
Chandelier Earrings         201
Clip On Earrings            389
Chokers                     421
Crystal Necklaces           428
Solitaire Rings             626
Charm Necklaces            1087
Bangles                    1151
Cuff Bracelets             1163
Charm Bracelets            1373
Stud Earrings              1491
Beaded Bracelets           3784
Dangle & Drop Earrings     5828
Anklets                    9170
Brooches                   9877
Rings                      9976
Bracelets                  9993
Earrings                   9995
Necklaces                 12944
dtype: int64

In [172]:
test_df = pd.merge(test_df, taxonomy_df, left_index = True, right_index=True)

In [173]:
#can drop taxonomy path

test_df = test_df.drop(columns=['taxonomy_path'])

In [174]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62209 entries, 151842 to 15653
Data columns (total 41 columns):
is_customizable           62209 non-null int64
num_favorers              62209 non-null float64
processing_max            62209 non-null float64
processing_min            62209 non-null float64
quantity                  62209 non-null float64
views                     62209 non-null float64
when_made                 62209 non-null object
who_made                  62209 non-null object
price_usd                 62209 non-null float64
contains_silver           62209 non-null int64
contains_gold             62209 non-null int64
contains_glass            62209 non-null int64
contains_diamond          62209 non-null int64
contains_pearl            62209 non-null int64
contains_stone            62209 non-null int64
anniversary               62209 non-null uint8
birthday                  62209 non-null uint8
christmas                 62209 non-null uint8
engagement                

In [132]:
test_df.when_made.value_counts()

standard         35206
made_to_order    14045
vintage          12958
Name: when_made, dtype: int64

In [178]:
kp = KPrototypes(n_clusters=5, init='Huang', n_init=1, verbose=True, n_jobs = -1)

kp.fit_predict(test_df, categorical = [0,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34])

print(kp.cluster_centroids_)
# print(kp.labels_)

[array([[4.23596059e+01, 5.05418719e+00, 2.91625616e+00, 2.10837438e+00,
        4.59847291e+02, 9.12215483e+03, 1.97044335e-02, 5.91133005e-02,
        4.92610837e-02, 7.33990148e-01, 1.47783251e-02, 9.85221675e-03],
       [2.77061111e+03, 6.87037037e+00, 3.83333333e+00, 5.69444444e+01,
        2.79427407e+04, 2.27419787e+02, 3.70370370e-02, 3.70370370e-02,
        1.11111111e-01, 3.14814815e-01, 0.00000000e+00, 0.00000000e+00],
       [5.38554302e+02, 6.38645980e+00, 3.63610719e+00, 3.59435825e+01,
        5.23533145e+03, 4.06209600e+02, 1.83356841e-02, 5.92383639e-02,
        1.52327221e-01, 3.99153738e-01, 1.12834979e-02, 1.41043724e-02],
       [4.02983333e+03, 6.33333333e+00, 4.33333333e+00, 4.20000000e+01,
        1.26330500e+05, 2.47720000e+02, 0.00000000e+00, 0.00000000e+00,
        1.66666667e-01, 3.33333333e-01, 0.00000000e+00, 0.00000000e+00],
       [2.17849339e+01, 3.67919722e+00, 1.93381452e+00, 1.25847445e+01,
        1.34329752e+02, 1.12358396e+02, 9.48609501e-02, 1.6

In [140]:
test_df_labeled = test_df.copy()

In [141]:
test_df_labeled['cluster_label'] = kp.labels_

In [144]:
test_df_labeled.cluster_label.value_counts()

4    61882
1      212
3      106
0        6
2        3
Name: cluster_label, dtype: int64

In [145]:
pd.to_pickle(test_df_labeled,'test_labeled_df.pkl')

In [146]:
test_df.head()

Unnamed: 0,is_customizable,num_favorers,processing_max,processing_min,quantity,views,when_made,who_made,price_usd,contains_silver,...,Chandelier Earrings,Charm Bracelets,Charm Necklaces,Chokers,Clip On Earrings,Crystal Necklaces,Cuff Bracelets,Dangle & Drop Earrings,Solitaire Rings,Stud Earrings
151842,0,8.0,3.0,1.0,4.0,514.0,standard,i_did,8.0,0,...,0,0,0,0,0,0,0,0,0,1
261700,0,11.0,1.0,1.0,1.0,34.0,vintage,someone_else,49.75,1,...,0,0,0,0,0,0,0,0,1,0
264540,0,8.0,1.0,1.0,1.0,49.0,vintage,someone_else,24.99,0,...,0,0,0,0,0,0,0,0,0,0
144297,1,5.0,2.0,1.0,20.0,25.0,standard,i_did,6.027,1,...,0,0,0,0,0,0,0,0,0,0
252974,1,46.0,10.0,5.0,2.0,359.0,made_to_order,i_did,67.32,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
plt.figure(figsize=(7,7))
plt.scatter(data_cat.x,data_cat.y, c = np.array(clusters), s = data_cat['size'], cmap= 'jet')
plt.title('clustered sensors - colors represent clusters', fontsize=14)
plt.xlim(-1,12), plt.ylim(-1,12)
plt.xlabel('x-coordinate', fontsize=14), plt.ylabel('y-coordinate', fontsize=14)
plt.show()