In [1]:
__author__ = 'Ksenia Voronaya'

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import numpy

In [2]:
# it is difficult to handle all documents & categories after, so download only several
corpus_data = fetch_20newsgroups(categories=['rec.motorcycles', 'rec.autos', 
                                             'rec.sport.hockey', 'soc.religion.christian', 
                                             'alt.atheism', 'sci.electronics'])

In [3]:
print("List of topics ({} categories):".format(len(list(corpus_data.target_names))))
print(list(corpus_data.target_names))
print("Size of corpus is {} documents".format(len(corpus_data.data)))

List of topics (6 categories):
['alt.atheism', 'rec.autos', 'rec.motorcycles', 'rec.sport.hockey', 'sci.electronics', 'soc.religion.christian']
Size of corpus is 3462 documents


In [4]:
# TF-IDF matrix for corpus 
corpus_tfidf = TfidfVectorizer(stop_words='english')
corpus_representation = corpus_tfidf.fit_transform(corpus_data.data)

# Dimensionality reduction using truncated SVD
# try to use 15 components
svd = TruncatedSVD(n_components=15)
data_svd = svd.fit_transform(corpus_representation)
print(svd.components_)

[[  1.84944570e-02   1.67375701e-02   2.97805979e-04 ...,   4.39221452e-04
    5.29853162e-04   5.29853162e-04]
 [ -2.84715071e-03  -1.59567474e-02  -3.19524066e-04 ...,  -7.21174191e-04
   -4.69511893e-04  -4.69511893e-04]
 [ -1.11635734e-02   2.45604563e-03   1.28018353e-05 ...,  -3.24480562e-04
   -5.43650908e-05  -5.43650908e-05]
 ..., 
 [ -1.03325826e-02   1.24346316e-02  -6.53706211e-05 ...,   2.60973153e-04
   -1.88161994e-03  -1.88161994e-03]
 [ -6.44741698e-03   9.22520428e-03   1.13662500e-05 ...,  -6.39615557e-04
    2.27248282e-04   2.27248282e-04]
 [ -1.39659015e-02  -8.07024361e-03  -5.19119109e-04 ...,   5.58527262e-04
   -2.33685226e-03  -2.33685226e-03]]


In [5]:
# split data on train(75%) & test(25%) datasets
X_train, X_test, y_train, y_test = train_test_split(data_svd, corpus_data.target, test_size = 0.25, random_state = 1)

print("Size of train data is {} documents".format(len(X_train)))
print("Size of test data is {} documents".format(len(X_test)))

Size of train data is 2596 documents
Size of test data is 866 documents


In [6]:
# classification with kNN
knn_classifier = KNeighborsClassifier().fit(X_train, y_train)

y_train_predict = knn_classifier.predict(X_train)
y_test_predict = knn_classifier.predict(X_test)

err_train = numpy.mean(y_train != y_train_predict)
err_test  = numpy.mean(y_test  != y_test_predict)

print("The average error on train data is {}".format(err_train))
print("The average error on test data is {}".format(err_test))

The average error on train data is 0.0681818181818
The average error on test data is 0.108545034642


In [7]:
for component_num, topic in enumerate(svd.components_):
    w_len = numpy.argsort(topic)
    print('*******************************************************************************')
    print("Hidden category (component {}), twelve popular words:".format(component_num+1))
    print(numpy.asarray(corpus_tfidf.get_feature_names())[w_len[-13:-1]][:])

*******************************************************************************
Hidden category (component 1), twelve popular words:
[u'organization' u'ca' u'like' u'lines' u'think' u'subject' u'article'
 u'don' u'people' u'writes' u'com' u'god']
*******************************************************************************
Hidden category (component 2), twelve popular words:
[u'hell' u'church' u'atheists' u'christ' u'truth' u'christian'
 u'christians' u'people' u'faith' u'bible' u'believe' u'jesus']
*******************************************************************************
Hidden category (component 3), twelve popular words:
[u'moral' u'allan' u'com' u'cco' u'schneider' u'jon' u'morality' u'wpd'
 u'solntze' u'sgi' u'livesey' u'caltech']
*******************************************************************************
Hidden category (component 4), twelve popular words:
[u'insurance' u'sun' u'article' u'oil' u'engine' u'nec' u'behanna' u'hp'
 u'dod' u'cars' u'bike' u'car']
********