## Fetch Data

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load the 'train' and 'test' subsets of the 20 Newsgroups corpus in one pass.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# Sanity check: document count and number of categories for each subset.
print(f'Train: {len(newsgroups_train.data)} docs in {len(newsgroups_train.target_names)} categories')
print(f'Test: {len(newsgroups_test.data)} docs in {len(newsgroups_test.target_names)} categories')


  from collections import Mapping, defaultdict


Train: 11314 docs in 20 categories
Test: 7532 docs in 20 categories



## Convert to vectors

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer for the test documents so both matrices share the same columns.
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)

# Show (n_documents, n_features) for each split, train first.
print(train_vectors.shape)
print(test_vectors.shape)

(11314, 130107)
(7532, 130107)


In [3]:
# The extracted TF-IDF vectors are very sparse. In the run recorded in this
# notebook, each training document has about 158 non-zero components on
# average, out of the 130107 feature dimensions reported by train_vectors.shape
# (roughly 0.12% non-zero features).

print(train_vectors.nnz)  # total number of stored entries in the sparse matrix
print(float(train_vectors.shape[0]))  # number of training documents, shown as a float
print(train_vectors.nnz / float(train_vectors.shape[0]))  # average stored entries per document

1787565
11314.0
157.9958458546933


## Train a classifier

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial naive Bayes on the TF-IDF features; alpha is the smoothing parameter.
clf = MultinomialNB(alpha=0.01)

# Keep fit() as the final expression so the notebook displays the fitted estimator.
clf.fit(X=train_vectors, y=newsgroups_train.target)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [5]:
# Predict a category for every test document, then report the macro-averaged
# F1 score against the true test labels.
pred = clf.predict(X=test_vectors)
metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='macro')

0.8290659644474043

In [6]:
# Display the training matrix dimensions again: one row per training document,
# one column per feature produced by the vectorizer.
train_vectors.shape

(11314, 130107)

In [36]:
# numpy is imported here because no other cell in this notebook imports it;
# without this line the cell raises NameError on a fresh kernel.
import numpy as np

# Print the mean TF-IDF weight of each of the first 10 training documents.
# Each sparse row is densified first, so the mean is taken over every feature
# column (zeros included), which is why the values are so small.
for y in range(10):
    row = train_vectors[y].toarray()[0]  # 1 x n_features sparse row -> 1-D dense array
    print(np.mean(row))


5.9165265766042644e-05
5.8067837292879214e-05
9.009060506818681e-05
5.6287690577184214e-05
7.60018730309358e-05
7.928935690197734e-05
5.480592404833413e-05
6.263754136314254e-05
4.108598617094371e-05
7.016866390261056e-05
