In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups(categories=['comp.sys.mac.hardware', 'rec.sport.hockey', 'sci.space', 'rec.motorcycles'])

In [3]:
tfidf = TfidfVectorizer(stop_words='english')
corpus = tfidf.fit_transform(data.data)
svd = TruncatedSVD(n_components=10)
X_svd = svd.fit_transform(corpus)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_svd, data.target, test_size = 0.3, random_state = 1) 

knn = KNeighborsClassifier().fit(X_train, y_train)

y_train_predict = knn.predict(X_train)
y_test_predict = knn.predict(X_test)

err_train = np.mean(y_train != y_train_predict)
err_test  = np.mean(y_test  != y_test_predict)

print (err_train, err_test)

(0.031966224366706875, 0.04360056258790436)


In [5]:
for com_num, topic in enumerate(svd.components_):
    print('===============================================================================')
    print("Hidden category (component {}), twelve popular words:".format(com_num+1))
    print(np.asarray(tfidf.get_feature_names())[np.argsort(topic)[-13:-1]][:])

Hidden category (component 1), twelve popular words:
[u'university' u'just' u'don' u'like' u'lines' u'organization' u'subject'
 u'writes' u'article' u'space' u'ca' u'com']
Hidden category (component 2), twelve popular words:
[u'aurora' u'launch' u'shuttle' u'orbit' u'digex' u'moon' u'com' u'access'
 u'alaska' u'gov' u'henry' u'nasa']
Hidden category (component 3), twelve popular words:
[u'zoology' u'aurora' u'jpl' u'spencer' u'team' u'zoo' u'moon' u'gov'
 u'alaska' u'toronto' u'henry' u'nasa']
Hidden category (component 4), twelve popular words:
[u'zoo' u'keenan' u'toronto' u'espn' u'alaska' u'henry' u'dare' u'gary'
 u'cc' u'cunixb' u'columbia' u'gld']
Hidden category (component 5), twelve popular words:
[u'helmet' u'sun' u'nec' u'behanna' u'dod' u'gary' u'dare' u'bike' u'cc'
 u'cunixb' u'com' u'columbia']
Hidden category (component 6), twelve popular words:
[u'man' u'edu' u'kipling' u'eng' u'utzoo' u'umd' u'work' u'alaska'
 u'zoology' u'spencer' u'zoo' u'toronto']
Hidden category (com