## Clustering graph using document based clustering approach

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import csv
import os

### Source Corpus data from disc

In [2]:
path_folder = r"C:\Titan\Data\PartitionServiceData"
data_file = r"\RUSH_05Oct2017_TopLevelTradeDependency_16Nov2017.csv"
#file_path = os.path.join(path_folder, data_file)
file_path = r"C:\Titan\Data\PartitionServiceData\RUSH_05Oct2017_TopLevelTradeDependency_16Nov2017.csv"
def createCorpus(file_path):
    corpus = []
    with open(file_path, 'rb') as csv_file:
        data = csv.reader(csv_file, delimiter='|')
        for row in data:
            row1 = [string.replace(s, ',', '_') for s in row]
            row2 = [string.replace(s, ' ', '_') for s in row1]
            document = ' '.join(row2) 
            corpus.append(document)
            #print(row2)
        print('corpus: {}'.format(corpus))


In [4]:
documents = ["discountCurve_USD, Funding, Time_1, Time_2",
            "discountCurve_EUR, Funding, Time_1, Time_2, bondDefaultCurve",
             "discountCurve_GBP, Funding, defaultCurve, Time_1, Time_2",
             "discountCurve_EUR, Funding, Time_1, Time_2, bondDefaultCurve, recovery",
             "discountCurve_EUR, Funding, Time_1, Time_2, loanDefaultCurve",
             "discountCurve_USD, Funding, Time_1, Time_2",
            "discountCurve_EUR, Funding, Time_1, Time_2, bondDefaultCurve",
             "discountCurve_GBP, Funding, defaultCurve, Time_1, Time_2",
             "discountCurve_EUR, Funding, Time_1, Time_2, bondDefaultCurve, recovery",
             "discountCurve_EUR, Funding, Time_1, Time_2, loanDefaultCurve",
            ]
#documents = createCorpus(file_path)

### Vectorize the corpus data to convert the data from text to numeric format

In [5]:
vectorizer = TfidfVectorizer(stop_words='english',lowercase=False)
X = vectorizer.fit_transform(documents)

### Display the Document Term Matrix

In [6]:
X.toarray()

array([[ 0.34738346,  0.34738346,  0.34738346,  0.        ,  0.        ,
         0.        ,  0.        ,  0.79873287,  0.        ,  0.        ],
       [ 0.34696212,  0.34696212,  0.34696212,  0.62052696,  0.        ,
         0.50378384,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.2714286 ,  0.2714286 ,  0.2714286 ,  0.        ,  0.62409116,
         0.        ,  0.62409116,  0.        ,  0.        ,  0.        ],
       [ 0.27122747,  0.27122747,  0.27122747,  0.48507876,  0.        ,
         0.39381825,  0.        ,  0.        ,  0.        ,  0.6236287 ],
       [ 0.31016195,  0.31016195,  0.31016195,  0.        ,  0.        ,
         0.45035054,  0.        ,  0.        ,  0.71315011,  0.        ],
       [ 0.34738346,  0.34738346,  0.34738346,  0.        ,  0.        ,
         0.        ,  0.        ,  0.79873287,  0.        ,  0.        ],
       [ 0.34696212,  0.34696212,  0.34696212,  0.62052696,  0.        ,
         0.50378384,  0.        ,  0.        

### Now cluster the data - using kmeans

In [7]:
true_k = 4
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

### Display the computed clusters

In [8]:
model.cluster_centers_.argsort()

array([[4, 6, 7, 8, 0, 1, 2, 9, 5, 3],
       [3, 5, 7, 8, 9, 0, 1, 2, 4, 6],
       [3, 4, 5, 6, 8, 9, 0, 1, 2, 7],
       [3, 4, 6, 7, 9, 0, 1, 2, 5, 8]])

In [21]:
model.cluster_centers_.argsort()[:, ::-1]

array([[8, 5, 2, 1, 0, 9, 7, 6, 4, 3],
       [3, 5, 9, 2, 1, 0, 8, 7, 6, 4],
       [6, 4, 2, 1, 0, 9, 8, 7, 5, 3],
       [7, 2, 1, 0, 9, 8, 6, 5, 4, 3]], dtype=int64)

In [9]:
model.labels_

array([2, 0, 1, 0, 3, 2, 0, 1, 0, 3], dtype=int32)

In [13]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print "Cluster %d:" % i,
    for ind in order_centroids[i, :10]:
        print ' %s' % terms[ind],
    print

Top terms per cluster:
Cluster 0:  bondDefaultCurve  discountCurve_EUR  Time_2  Time_1  Funding  recovery  loanDefaultCurve  discountCurve_USD  discountCurve_GBP  defaultCurve
Cluster 1:  discountCurve_USD  Time_2  Time_1  Funding  recovery  loanDefaultCurve  discountCurve_GBP  discountCurve_EUR  defaultCurve  bondDefaultCurve
Cluster 2:  discountCurve_GBP  defaultCurve  Time_2  Time_1  Funding  recovery  loanDefaultCurve  discountCurve_USD  discountCurve_EUR  bondDefaultCurve
Cluster 3:  loanDefaultCurve  discountCurve_EUR  Time_2  Time_1  Funding  recovery  discountCurve_USD  discountCurve_GBP  defaultCurve  bondDefaultCurve


In [None]:
import csv
def createCorpus():
    corpus = []
    with open("myRawData.dat", 'rb') as csv_file:
        data = csv.reader(csv_file, delimiter='|')
        for row in data:
            row1 = [string.replace(s, ',', '_') for s in row]
            row2 = [string.replace(s, ' ', '_') for s in row1]
            document = ' '.join(row2) 
            corpus.append(document)
            #print(row2)
        print('corpus: {}'.format(corpus))
createCorpus()