In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Toy document-term matrix (DTM) used throughout this notebook:
# 10 documents (rows) x 2 terms (columns) that remain after cleaning.
X = np.array(
    [
        (5, 3),
        (10, 15),
        (15, 12),
        (24, 10),
        (30, 30),
        (85, 70),
        (71, 80),
        (60, 78),
        (70, 55),
        (80, 91),
    ]
)

In [3]:
# Wrap the raw count matrix in a DataFrame with one named column per term,
# then display it as the cell output.
DTM = pd.DataFrame(data=X, columns=["t1", "t2"])
DTM

Unnamed: 0,t1,t2
0,5,3
1,10,15
2,15,12
3,24,10
4,30,30
5,85,70
6,71,80
7,60,78
8,70,55
9,80,91


In [4]:
from sklearn.cluster import KMeans

# Partition the documents in X into 2 clusters based on their term counts.
# random_state fixes the centroid initialisation. n_init is passed explicitly
# because its default differs between scikit-learn releases (10 before 1.4,
# 'auto' from 1.4 on, with a FutureWarning in between), so relying on the
# default makes the run depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
# fit_predict fits the model on X and returns one cluster label per row of X.
y_kmeans = kmeans.fit_predict(X)

In [5]:
# Attach each document's cluster label as a new 'cluster' column and display
# the result. NOTE: this modifies DTM in place, so from here on DTM is no
# longer the two-column frame that was displayed when it was created.
DTM['cluster'] = y_kmeans
DTM

Unnamed: 0,t1,t2,cluster
0,5,3,1
1,10,15,1
2,15,12,1
3,24,10,1
4,30,30,1
5,85,70,0
6,71,80,0
7,60,78,0
8,70,55,0
9,80,91,0


In [6]:
# Cluster labels returned by fit_predict, one per row of X.
print(y_kmeans)

[1 1 1 1 1 0 0 0 0 0]


In [7]:
# Centroid coordinates of the fitted model: one row per cluster, one column
# per term (same column order as X).
kmeans.cluster_centers_

array([[73.2, 74.8],
       [16.8, 14. ]])

# inference
- From the cluster centers we can infer the dominating term in each cluster.
  Here we have 2 clusters:
    1. the centroid of the first cluster (label 0) is at co-ordinate (73.2, 74.8)
    2. the centroid of the second cluster (label 1) is at co-ordinate (16.8, 14.0)
- The co-ordinates correspond to our features, i.e. the terms in our corpus; their frequency counts are the values being clustered
- Each centroid value is the centroid (mean) of the frequencies of the respective term over the documents in that cluster
Eg: in the first cluster
    - 73.2 is the centroid of all the term-1 frequencies
    - 74.8 is the centroid of all the term-2 frequencies
Since the centroid of term-1 < the centroid of term-2 in the first cluster (label 0) -> t2 is dominating in terms of its frequency count,
and similarly in the second cluster (label 1), where 16.8 > 14.0 -> t1 is dominating

In [8]:
# For every cluster (row), rank the term (column) indices by centroid value,
# largest first: ascending argsort along each row, then reverse the columns.
np.argsort(kmeans.cluster_centers_, axis=1)[:, ::-1]

array([[1, 0],
       [0, 1]], dtype=int64)

In [9]:
# Vocabulary ("bag of words"): BOW[j] is the name of the term in column j of X.
BOW = ["t1", "t2"]
# Number of clusters to report on; matches n_clusters=2 used when fitting KMeans.
num_clusters = 2

In [10]:
print("Top  term per cluster:")
# Step 1: for every cluster (row), order the term (column) indices from the
# largest centroid value to the smallest.
order_centroids_arg = np.argsort(kmeans.cluster_centers_, axis=1)[:, ::-1]
# Step 2: walk over the clusters and report the leading term of each one.
for cluster_id in range(num_clusters):
    print("Cluster :", cluster_id)
    # Only the first ranked index is kept, i.e. the dominating term.
    for term_idx in order_centroids_arg[cluster_id][:1]:
        print(BOW[term_idx])

Top  term per cluster:
Cluster : 0
t2
Cluster : 1
t1
