In [65]:
# Newsgroups to load; the same list filters both the train and test subsets.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [66]:
# Load the list of files matching those categories as follows:

In [67]:
from sklearn.datasets import fetch_20newsgroups

# Training split, restricted to `categories`. Shuffling with a fixed
# random_state keeps the document order reproducible between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [68]:
# Sanity check: the label names should be exactly the requested categories.
# The output lists them alphabetically rather than in the order we passed
# them; the integer targets used later index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [69]:
# Number of training documents loaded for the selected categories.
len(twenty_train.data)

2257

In [70]:
# Let's see what we have: the first two lines of the first training post.
print(*twenty_train.data[0].split("\n")[:2], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?


In [71]:
# `target` holds one integer label per document; look up its newsgroup name.
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [72]:
# First two lines of the training post at index 1000.
print(*twenty_train.data[1000].split("\n")[:2], sep="\n")

From: bobs@thnext.mit.edu (Robert Singleton)
Subject: Re: Americans and Evolution


In [73]:
# Newsgroup name for the post at index 1000 (integer label -> name lookup).
print(twenty_train.target_names[twenty_train.target[1000]])

alt.atheism


In [74]:
#scikit-learn provides basic tools to process text using the Bag of Words representation.

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
# With its default tokenizer, CountVectorizer ignores punctuation and drops single-character tokens.

In [77]:
# Bag-of-words vectorizer; stop_words='english' also filters out common
# English words using scikit-learn's built-in stop-word list.
count_vect = CountVectorizer(stop_words='english')

In [78]:
# Learn the vocabulary from the training posts and, in the same call, build
# the document-term matrix of raw token counts.
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [79]:
 # (number of documents, number of distinct vocabulary terms)
 X_train_counts.shape

(2257, 35482)

In [80]:
#Occurrence count is a good start but there is an issue: 
#longer documents will have higher average count values than shorter documents, 
#even though they might talk about the same topics.

#To avoid these potential discrepancies it suffices to divide the number 
#of occurrences of each word in a document by the total number of words in the document: 
#these new features are called “tf” for Term Frequencies.

#Another refinement on top of tf is to downscale weights for words 
#that occur in many documents in the corpus and are therefore less 
#informative than those that occur only in a smaller portion of the corpus.

#This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.

In [81]:
from sklearn.feature_extraction.text import TfidfTransformer

In [82]:
# Transformer that rescales raw counts into tf-idf weights (library defaults).
tfidf_transformer = TfidfTransformer()

In [83]:
# Fit the idf statistics on the training counts and return the re-weighted
# matrix in one step.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [56]:
# Multi-layer perceptron classifier from scikit-learn.
from sklearn.neural_network import MLPClassifier

# Three hidden layers of 10 units each, trained with the L-BFGS solver.
# `alpha` is the L2 regularisation strength; `random_state` pins the weight
# initialisation so the fit is repeatable.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(10, 10, 10),
    random_state=1,
)
clf.fit(X_train_tfidf, twenty_train.target)

In [57]:
# Two hand-written sentences used to spot-check the trained classifier.
docs_new = [
    'Doctors are bad',
    'OpenGL on the GPU is fast',
]

In [58]:
# Use transform (not fit_transform) so the new sentences are mapped onto the
# vocabulary and idf weights already learned from the training data.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [59]:
# Integer class labels for the new sentences; indices into target_names.
predicted = clf.predict(X_new_tfidf)

In [60]:
# Show each sentence (as its repr) next to the newsgroup name predicted for it.
for doc, category in zip(docs_new, predicted):
    print('{!r} => {}'.format(doc, twenty_train.target_names[category]))

'Doctors are bad' => sci.med
'OpenGL on the GPU is fast' => comp.graphics


In [61]:
# Held-out test split for the same categories, shuffled with the same seed
# as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [62]:
from sklearn.pipeline import Pipeline

# Chain vectorising, tf-idf weighting and the classifier into one estimator
# so raw text can be passed straight to fit/predict.
# NOTE(review): this CountVectorizer uses default settings, whereas the
# standalone `count_vect` built earlier passes stop_words='english' —
# confirm whether the difference is intentional.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        hidden_layer_sizes=(10, 10, 10),
        random_state=1,
    )),
])

In [63]:
# Fit every pipeline stage (vectorizer, tf-idf, classifier) on the raw
# training text and its labels.
text_clf.fit(twenty_train.data, twenty_train.target)

In [64]:
# Predict labels for the raw test documents with the fitted pipeline.
# NOTE: this rebinds `predicted`, which earlier held the labels for docs_new.
predicted = text_clf.predict(twenty_test.data)

In [36]:
import numpy as np

# Test accuracy: the fraction of test documents whose predicted label equals
# the true label (mean of the element-wise match mask).
np.equal(predicted, twenty_test.target).mean()

0.8202396804260985