In [1]:
# Restrict the experiment to four of the twenty newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [2]:
# Load the list of files matching those categories as follows:

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Training split, limited to the selected categories.
# Shuffling with a fixed random_state keeps the document order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [4]:
# Did we get what we wanted?
# Note: the names come back in alphabetical order, not in the order of
# `categories`; integer targets index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# Size: number of training documents that were loaded.
len(twenty_train.data)

2257

In [6]:
# Let's see what we have: the first two lines of the first training document.
print(*twenty_train.data[0].split("\n")[:2], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?


In [7]:
# target[0] is the integer label of the first document; map it to its newsgroup name.
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [8]:
# First two lines of another training document (index 1000).
print(*twenty_train.data[1000].split("\n")[:2], sep="\n")

From: bobs@thnext.mit.edu (Robert Singleton)
Subject: Re: Americans and Evolution


In [9]:
# Newsgroup name for the label of document 1000.
print(twenty_train.target_names[twenty_train.target[1000]])

alt.atheism


In [10]:
# scikit-learn provides basic tools for processing text with the bag-of-words representation.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# CountVectorizer's default tokenizer keeps only tokens of two or more alphanumeric
# characters, so punctuation and single-letter words are removed automatically.

In [13]:
# stop_words='english' additionally filters common English words
# using scikit-learn's built-in stop-word list.
count_vect = CountVectorizer(stop_words='english')

In [14]:
# Learn the vocabulary from the training documents and build the
# document-term count matrix in one step.
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [15]:
 X_train_counts.shape

(2257, 35482)

In [16]:
#Occurrence count is a good start but there is an issue: 
#longer documents will have higher average count values than shorter documents, 
#even though they might talk about the same topics.

#To avoid these potential discrepancies it suffices to divide the number 
#of occurrences of each word in a document by the total number of words in the document: 
#these new features are called “tf” for Term Frequencies.

#Another refinement on top of tf is to downscale weights for words 
#that occur in many documents in the corpus and are therefore less 
#informative than those that occur only in a smaller portion of the corpus.

#This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

In [18]:
# Transformer that turns raw term counts into tf-idf weights (default settings).
tfidf_transformer = TfidfTransformer()

In [19]:
# Fit the idf weights on the training counts and convert those counts to tf-idf features.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [20]:
# Import the scikit-learn model we are using.
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron: three hidden layers of 10 units each,
# trained with the L-BFGS solver; random_state fixes the weight initialisation.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(10, 10, 10),
    random_state=1,
)

# Kept as the last expression so the cell displays the fitted estimator.
clf.fit(X_train_tfidf, twenty_train.target)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(10, 10, 10), random_state=1,
              solver='lbfgs')

In [37]:
# Two hand-written sentences used as a quick sanity check of the classifier.
docs_new = [
    'Doctors are bad',
    'OpenGL on the GPU is fast',
]

In [38]:
# Use transform (not fit_transform) on the already-fitted objects so the new
# documents are mapped onto the training vocabulary and idf weights.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [39]:
# Predicted labels are integer indices into twenty_train.target_names.
predicted = clf.predict(X_new_tfidf)

In [40]:
# Show each new document next to the newsgroup name predicted for it.
for doc, category in zip(docs_new, predicted):
    print(f'{doc!r} => {twenty_train.target_names[category]}')

'Doctors are bad' => sci.med
'OpenGL on the GPU is fast' => comp.graphics


In [41]:
# Held-out test split for the same categories, shuffled with the same fixed seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [42]:
from sklearn.pipeline import Pipeline

# Bundle vectorizing, tf-idf weighting and the classifier into one estimator
# so raw text can be passed straight to fit/predict.
# NOTE(review): this CountVectorizer uses default settings, so unlike
# `count_vect` it does not filter English stop words - confirm this is intended.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        hidden_layer_sizes=(10, 10, 10),
        random_state=1,
    )),
])

In [43]:
# Fit every pipeline step on the raw training text and its labels.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 MLPClassifier(alpha=1e-05, hidden_layer_sizes=(10, 10, 10),
                               random_state=1, solver='lbfgs'))])

In [44]:
# Predict labels for the held-out test documents; the pipeline vectorizes the raw text itself.
# Note: this rebinds `predicted`, replacing the predictions made for docs_new.
predicted = text_clf.predict(twenty_test.data)

In [45]:
import numpy as np

# Accuracy: fraction of test documents whose predicted label equals the true label.
np.equal(predicted, twenty_test.target).mean()

0.8242343541944075