## Simple demo of Kmeans Clustering for Top-level descriptors

### Some imported modules

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import csv
import os
import string
import numpy as np

In [12]:
feature_space_dict = {
'RUSH-XS1405766541':
  'DiscountCurve_USD,Funding_USD,Time_0,Time_1,Time_2,BondDefaultCurve_ISIN_XS1405766541,BondRecovery_ISIN_XS1405766541',
 'RUSH-XS0274618247':
  'DiscountCurve_USD,Funding_USD,Time_0,Time_1,Time_2,BondReferenceAsset_ISIN_XS0274618247,BondDefaultCurve_ISIN_XS0274618247,BondRecovery_ISIN,XS0274618247',
 'RUSH-XS1396282177':
  'DiscountCurve_USD,Funding_USD,Time_0,Time_1,Time_2,BondReferenceAsset_ISIN_XS1396282177,BondDefaultCurve_ISIN,XS1396282177_BondRecovery_ISIN,XS1396282177',
 '501899428':
 'DiscountCurve_USD,Funding_USD,Time_0,Time_1,Time_2',
 '501899864':
 'DiscountCurve_USD,Funding_USD,Time_0,Time_1,Time_2',
 '501901588':
 'DiscountCurve_USD,Funding_USD,Time_0,Time_1,Time_2'
}

def replaceCommaWithSpace(data):
    data2 = {} 
    for key in data.keys():
        data2[key] = string.replace(data[key], ",", " ")
    return data2
feature_space_dict = replaceCommaWithSpace(feature_space_dict)

tcns = feature_space_dict.keys()
descriptors = feature_space_dict.values()

In [13]:
print("TCNs: \n{}".format(tcns))
print("\n\ndescriptors: \n{}".format(descriptors))


TCNs: 
['RUSH-XS0274618247', 'RUSH-XS1405766541', '501899428', '501901588', 'RUSH-XS1396282177', '501899864']


descriptors: 
['DiscountCurve_USD Funding_USD Time_0 Time_1 Time_2 BondReferenceAsset_ISIN_XS0274618247 BondDefaultCurve_ISIN_XS0274618247 BondRecovery_ISIN XS0274618247', 'DiscountCurve_USD Funding_USD Time_0 Time_1 Time_2 BondDefaultCurve_ISIN_XS1405766541 BondRecovery_ISIN_XS1405766541', 'DiscountCurve_USD Funding_USD Time_0 Time_1 Time_2', 'DiscountCurve_USD Funding_USD Time_0 Time_1 Time_2', 'DiscountCurve_USD Funding_USD Time_0 Time_1 Time_2 BondReferenceAsset_ISIN_XS1396282177 BondDefaultCurve_ISIN XS1396282177_BondRecovery_ISIN XS1396282177', 'DiscountCurve_USD Funding_USD Time_0 Time_1 Time_2']


In [14]:
vectorizer = TfidfVectorizer(stop_words='english',lowercase=False)
X = vectorizer.fit_transform(descriptors)


In [19]:
true_k = 3
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=3, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [20]:
clusters = model.labels_
tcn_clusters = zip(tcns, clusters)
print("TCNs aaignment to clusters are:\n {}".format(tcn_clusters))

TCNs aaignment to clusters are:
 [('RUSH-XS0274618247', 2), ('RUSH-XS1405766541', 1), ('501899428', 1), ('501901588', 1), ('RUSH-XS1396282177', 0), ('501899864', 1)]
