In [2]:
import numpy as np
import matplotlib.pyplot as plt


%matplotlib inline

In [3]:
import scipy as sp

$A$ is tf-idf weighed term-document matrix 

Terms:

- $t_1$: Information
- $t_2$: Singular
- $t_3$: Value
- $t_4$: Computation
- $t_5$: Retrieval

Documents:

- $d_1$: Large Scale **Singular Value Computations**
- $d_2$: Software for the Sparse **Singular Value** Decomposition
- $d_3$: Introduction to Moderm **Information Retrieval**
- $d_4$: Linear Algebra for Intelligent **Information Retrieval**
- $d_5$: Matrix **Computations**
- $d_6$: **Singular Value** Analysis of Cryptograms
- $d_7$: Automatic **Information** Organization


In [3]:
A = np.matrix([[0.00, 0.00, 0.56, 0.56, 0.00, 0.00, 1.00],
               [0.49, 0.71, 0.00, 0.00, 0.00, 0.71, 0.00],
               [0.49, 0.71, 0.00, 0.00, 0.00, 0.71, 0.00],
               [0.72, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00],
               [0.00, 0.00, 0.83, 0.83, 0.00, 0.00, 0.00]])

In [4]:
S, U = np.linalg.eigh(A * A.T)

order = S.argsort()[::-1]
S = S[order]
print np.round(S, decimals=2)

U = U[:, order]

[ 2.71  2.44  1.31  0.56  0.  ]


In [5]:
print np.round(U, decimals=2)

[[ 0.   -0.75  0.   -0.66  0.  ]
 [-0.65  0.   -0.27  0.   -0.71]
 [-0.65  0.   -0.27  0.    0.71]
 [-0.39  0.    0.92  0.    0.  ]
 [ 0.   -0.66  0.    0.75  0.  ]]


The first two columns are negative, but we want them be positive. from the orthogonality point of view there's no difference if we change the direction $180^\text{o}$, so let's do this for first two eigenvectors (and double check that it remains orthonormal)

In [6]:
U[:, 0:2]  = - U[:, 0:2] 
print np.round(U, decimals=2)
print np.round(U.T.dot(U), decimals=2)

[[ 0.    0.75  0.   -0.66  0.  ]
 [ 0.65  0.   -0.27  0.   -0.71]
 [ 0.65  0.   -0.27  0.    0.71]
 [ 0.39  0.    0.92  0.    0.  ]
 [ 0.    0.66  0.    0.75  0.  ]]
[[ 1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  1.]]


In [7]:
Sr = S[S > 1e-6]
r = len(Sr)
Ur = U[:, 0:r]

In [8]:
Vr = A.T.dot(Ur).dot(np.diag(1 / Sr))

In [9]:
np.round(Ur.dot(np.diag(Sr)).dot(Vr.T), decimals=2)

array([[ 0.  ,  0.  ,  0.56,  0.56,  0.  ,  0.  ,  1.  ],
       [ 0.49,  0.71,  0.  ,  0.  ,  0.  ,  0.71,  0.  ],
       [ 0.49,  0.71,  0.  ,  0.  ,  0.  ,  0.71,  0.  ],
       [ 0.72,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.83,  0.83,  0.  ,  0.  ,  0.  ]])

In [10]:
k = np.where(S.cumsum() / S.sum() > 0.9)[0][0]
k

2

In [11]:
Uk = U[:, 0:k]

Now suppose we have the following phrases:

- $p_1$ Singular Value 
- $p_2$ Information Retrieval

We treat these phrases as pseudo-documents. 


I.e. since we have terms:

- $t_1$: Information
- $t_2$: Singular
- $t_3$: Value
- $t_4$: Computation
- $t_5$: Retrieval

We create term-phrase martix $P$, apply tf-idf weights and normalize

In [12]:
P = np.matrix([[0.00, 0.56],
               [0.71, 0.00],
               [0.71, 0.00],
               [0.00, 0.00],
               [0.00, 0.83]])

Also, apart from phrases, we use the original terms as well:

In [13]:
P = np.hstack([P, np.eye(5)])
print P

[[ 0.    0.56  1.    0.    0.    0.    0.  ]
 [ 0.71  0.    0.    1.    0.    0.    0.  ]
 [ 0.71  0.    0.    0.    1.    0.    0.  ]
 [ 0.    0.    0.    0.    0.    1.    0.  ]
 [ 0.    0.83  0.    0.    0.    0.    1.  ]]


To find descriptions of the clusters (i.e. columns of $U$), calculate $M = U_k^T P$

Rows of $M$ represent clusters, columns - their descriptions

In [14]:
M = Uk.T * P
print np.round(M, decimals=2)

[[ 0.93  0.    0.    0.65  0.65  0.39  0.  ]
 [ 0.    0.97  0.75  0.    0.    0.    0.66]]


For each row need to select the column with the highest value:

In [15]:
desc = np.array(M.argmax(axis=1)).flatten()
desc

array([0, 1], dtype=int64)

So column 0 and 1 of $P$ give the descriptions: "Singular Value" (score 0.93) and "Information Retrieval" (score 0.97)

Matrix $Q$ contains columns of $P$ that correspond to selected labels 

In [16]:
Q = P[:, desc]
Q

matrix([[ 0.  ,  0.56],
        [ 0.71,  0.  ],
        [ 0.71,  0.  ],
        [ 0.  ,  0.  ],
        [ 0.  ,  0.83]])

Then the cluster assignment matrix $C$ is $C = Q^T A$

In [17]:
C = Q.T * A
C

matrix([[ 0.6958,  1.0082,  0.    ,  0.    ,  0.    ,  1.0082,  0.    ],
        [ 0.    ,  0.    ,  1.0025,  1.0025,  0.    ,  0.    ,  0.56  ]])

In $C$ for each column we select row with largest score for the cluster assignment. If the score is low (below some threshold), then we don't assing anything

In [18]:
t = 0.6
C = np.vstack([np.zeros(7), C])
C[C < t] = 0
C

matrix([[ 0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ],
        [ 0.6958,  1.0082,  0.    ,  0.    ,  0.    ,  1.0082,  0.    ],
        [ 0.    ,  0.    ,  1.0025,  1.0025,  0.    ,  0.    ,  0.    ]])

In [19]:
assignemnt = np.array(C.argmax(axis=0)).flatten()

Documetns assingned to default cluster are with 0

In [20]:
for i in np.unique(assignemnt):
    print "cluster %d:" % i, np.where(assignemnt == i)[0]

cluster 0: [4 6]
cluster 1: [0 1 5]
cluster 2: [2 3]


## 20 newsgroups

In [101]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD, randomized_svd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.metrics import pairwise

In [5]:
dataset = fetch_20newsgroups(subset='all', categories=None, shuffle=True, random_state=42)

In [69]:
vectorizer = TfidfVectorizer(min_df=2, max_features=10000, stop_words='english')
# result of vectorizer is normalized

In [70]:
tfidf = vectorizer.fit_transform(dataset.data)

In [71]:
for i in range(10):
    print (tfidf[i] * tfidf[i].T).sum(), 

1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0


In [118]:
normalizer = Normalizer(copy=False)
# but result of svd is not

In [141]:
k = len(np.unique(dataset.target))
k

20

In [None]:
k=40

In [178]:
U, S, Vt = randomized_svd(tfidf.T, n_components=k)

In [179]:
print U.shape, S.shape, Vt.shape

(10000L, 20L) (20L,) (20L, 18846L)


In [180]:
np.allclose(U.T.dot(U), np.eye(k))

True

In [181]:
U_neg = U.sum(axis=0) < 0
U[:, U_neg] = -U[:, U_neg]

In [182]:
np.allclose(U.T.dot(U), np.eye(k))

True

Terms contributing to topics:

In [187]:
terms = vectorizer.get_feature_names()
# U here is term-term matrix because we did svd on tfidf.T - i.e. on term-doc matrix, not doc-term

for topic_id in xrange(k):
    print 'topic #%d' % topic_id
    topic = U[:, topic_id]
    indices = np.abs(topic).argsort()[::-1][:10]
    contribution = topic[indices] * 100
    for idx, contrib in zip(np.nditer(indices), np.nditer(contribution)):
        print '%s: %0.2f,' % (terms[idx], contrib),
    print
    print

topic #0
like: 28.83, posting: 25.04, people: 13.01, organization: 12.91, lines: 12.28, subject: 12.27, writes: 12.16, article: 10.93, com: 10.72, edu: 10.59,

topic #1
mac: 24.59, scsi: -18.04, 00: -16.30, file: 15.18, state: -11.60, disk: 10.40, people: -9.55, drive: -9.42, windows: -9.41, god: -9.13,

topic #2
uiuc: 42.63, stanford: -30.29, stratus: 22.24, hp: -17.09, key: 11.01, clipper: 10.19, university: 9.54, netcom: 8.87, edu: -8.75, com: -8.60,

topic #3
key: -18.54, com: -17.89, access: 16.83, gov: -16.59, uk: 16.43, car: 15.09, cwru: -13.87, god: -13.47, cleveland: -13.06, nasa: 12.47,

topic #4
gov: -20.32, apple: 17.99, hp: 15.77, uk: 15.42, space: 14.34, dos: 14.15, ac: 13.27, windows: -12.63, nasa: -12.49, edu: 12.45,

topic #5
vs: 21.01, hockey: 15.65, sgi: -13.38, com: 12.11, apple: -10.94, au: -10.65, team: -9.87, sandvik: -9.47, ca: 9.12, game: 8.96,

topic #6
msg: 17.53, key: 15.05, drive: -14.27, god: -13.92, digex: -13.25, cc: -12.19, access: 12.18, uk: 11.67, ca:

## K-Means clustering

In [196]:
components = 200
lsa = TruncatedSVD(components)

In [197]:
X = lsa.fit_transform(tfidf)
X = normalizer.fit_transform(X)

In [192]:
from sklearn.cluster import KMeans, MiniBatchKMeans

In [198]:
km = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000)

In [199]:
km.fit(X)

MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=20,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [200]:
labels = dataset.target
cluster_labels = km.labels_

print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, cluster_labels)
print "Completeness: %0.3f" % metrics.completeness_score(labels, cluster_labels)
print "V-measure: %0.3f" % metrics.v_measure_score(labels, cluster_labels)
print "Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, cluster_labels)
print "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, cluster_labels, sample_size=1000)
print

Homogeneity: 0.368
Completeness: 0.399
V-measure: 0.383
Adjusted Rand-Index: 0.194
Silhouette Coefficient: 0.042



In [202]:
order_centroids = km.cluster_centers_
order_centroids = lsa.inverse_transform(order_centroids)

In [204]:
print "Top terms per cluster:"

order_centroids = order_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(k):
    print "Cluster %d:" % i,
    for ind in order_centroids[i, :10]:
        print ' %s' % terms[ind],
    print

Top terms per cluster:
Cluster 0:  space  shuttle  alaska  edu  nasa  moon  launch  orbit  henry  sci
Cluster 1:  edu  game  team  games  year  ca  university  players  hockey  baseball
Cluster 2:  sale  00  edu  10  offer  new  distribution  subject  lines  shipping
Cluster 3:  israel  israeli  jews  arab  jewish  arabs  edu  jake  peace  israelis
Cluster 4:  cmu  andrew  org  com  stratus  edu  mellon  carnegie  pittsburgh  pa
Cluster 5:  god  jesus  christian  bible  church  christ  christians  people  edu  believe
Cluster 6:  drive  scsi  card  edu  mac  disk  ide  bus  pc  apple
Cluster 7:  com  ca  hp  subject  edu  lines  organization  writes  article  like
Cluster 8:  car  cars  com  edu  engine  ford  new  dealer  just  oil
Cluster 9:  sun  monitor  com  video  edu  vga  east  card  monitors  microsystems
Cluster 10:  nasa  gov  jpl  larc  gsfc  jsc  center  fnal  article  writes
Cluster 11:  windows  dos  file  edu  ms  files  program  os  com  use
Cluster 12:  netcom  com  e