### По корпусу 20newsgroups

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.cluster
import sklearn.decomposition

In [3]:
corpus = fetch_20newsgroups()
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus.data) # Transform corpus into tf-idf matrix.

In [4]:
SVD = sklearn.decomposition.TruncatedSVD(20)
SVD.fit(tfidf_matrix)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=5,
       random_state=None, tol=0.0)

In [5]:
documents = SVD.transform(tfidf_matrix)
words = SVD.components_

### Классификация тем с использованием 2-ой матрицы SVD-разложения (20 скрытых тем)

In [6]:
from sklearn import cross_validation
from sklearn import ensemble

In [7]:
forest = ensemble.RandomForestClassifier(n_estimators=500, n_jobs=2, warm_start=True)
forest.fit(documents, corpus.target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2,
            oob_score=False, random_state=None, verbose=0, warm_start=True)

In [8]:
tolerance = cross_validation.cross_val_score(forest, documents, corpus.target)

In [9]:
print tolerance

[ 0.6410596   0.66949377  0.66038237]


### Наиболее популярные слова, соответствующие каждой из 20-ти скрытых тем

In [10]:
most_popular_words = []
for theme in range(words.shape[0]):
    most_popular_words.append([tfidf.get_feature_names()[words[theme, :].argsort()[i]] for i in range(-1, -6, -1)])
    print(theme, most_popular_words[theme])

(0, [u'the', u'to', u'of', u'and', u'in'])
(1, [u'edu', u'windows', u'com', u'for', u'host'])
(2, [u'the', u'of', u'was', u'armenian', u'on'])
(3, [u'edu', u'he', u'was', u'his', u'in'])
(4, [u'edu', u'of', u'cs', u'pitt', u'geb'])
(5, [u'com', u'you', u'they', u'key', u'clipper'])
(6, [u'pitt', u'geb', u'banks', u'gordon', u'cs'])
(7, [u'he', u'god', u'key', u'clipper', u'encryption'])
(8, [u'com', u'sandvik', u'geb', u'gordon', u'pitt'])
(9, [u'scsi', u'drive', u'the', u'ide', u'keith'])
(10, [u'nasa', u'keith', u'is', u'space', u'caltech'])
(11, [u'scsi', u'nasa', u'gov', u'drive', u'space'])
(12, [u'scsi', u'ca', u'of', u'you', u'are'])
(13, [u'keith', u'caltech', u'and', u'we', u'sgi'])
(14, [u'israel', u'israeli', u'he', u'windows', u'cleveland'])
(15, [u'ohio', u'state', u'magnus', u'acs', u'is'])
(16, [u'uk', u'ac', u'___', u'ohio', u'ca'])
(17, [u'columbia', u'cc', u'gld', u'cunixb', u'dare'])
(18, [u'ohio', u'magnus', u'state', u'acs', u'to'])
(19, [u'sandvik', u'apple', u'ke