
<br>
==========================================================<br>
Sample pipeline for text feature extraction and evaluation<br>
==========================================================<br>
The dataset used in this example is the 20 newsgroups dataset which will be<br>
automatically downloaded and then cached and reused for the document<br>
classification example.<br>
You can adjust the number of categories by giving their names to the dataset<br>
loader or setting them to None to get all 20 of them.<br>
Here is a sample output of a run on a quad-core machine::<br>
  Loading 20 newsgroups dataset for categories:<br>
  ['alt.atheism', 'talk.religion.misc']<br>
  1427 documents<br>
  2 categories<br>
  Performing grid search...<br>
  pipeline: ['vect', 'tfidf', 'clf']<br>
  parameters:<br>
  {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),<br>
   'clf__max_iter': (10, 50, 80),<br>
   'clf__penalty': ('l2', 'elasticnet'),<br>
   'tfidf__use_idf': (True, False),<br>
   'vect__max_n': (1, 2),<br>
   'vect__max_df': (0.5, 0.75, 1.0),<br>
   'vect__max_features': (None, 5000, 10000, 50000)}<br>
  done in 1737.030s<br>
  Best score: 0.940<br>
  Best parameters set:<br>
      clf__alpha: 9.9999999999999995e-07<br>
      clf__max_iter: 50<br>
      clf__penalty: 'elasticnet'<br>
      tfidf__use_idf: True<br>
      vect__max_n: 2<br>
      vect__max_df: 0.75<br>
      vect__max_features: 50000<br>


Author: Olivier Grisel <olivier.grisel@ensta.org><br>
        Peter Prettenhofer <peter.prettenhofer@gmail.com><br>
        Mathieu Blondel <mathieu@mblondel.org><br>
License: BSD 3 clause

In [None]:
from pprint import pprint
from time import time
import logging

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Echo this module's docstring before the example starts running.
print(__doc__)

Display progress logs on the console (`logging.basicConfig` writes to stderr by default)

In [None]:
# Configure the root logger: show INFO-and-above records, each one prefixed
# with its timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

#############################################################################<br>
Load some categories from the training set

In [None]:
# Restrict the dataset to these two newsgroups.
# To run the analysis on all the categories instead, use:
# categories = None
categories = ['alt.atheism', 'talk.religion.misc']

In [None]:
# Announce which newsgroups are about to be loaded (header line, then the
# category list on its own line).
print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

In [None]:
# Fetch the training split limited to the selected newsgroups, then report
# how many documents and categories were loaded.
data = fetch_20newsgroups(categories=categories, subset='train')
n_documents = len(data.filenames)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)
print()

#############################################################################<br>
Define a pipeline combining a text feature extractor with a simple<br>
classifier

In [None]:
# Three chained stages: token counts -> tf-idf weighting -> SGD classifier.
# The step names below are the prefixes of the grid-search parameter keys
# ('vect__...', 'tfidf__...', 'clf__...').
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

Uncommenting more parameters will give better exploring power but will<br>
increase processing time in a combinatorial way.

In [None]:
# Search grid: each key is '<pipeline step>__<parameter>' and each value is
# the tuple of candidate settings to try for that parameter.
# Entries that are commented out are optional extras; enabling them widens
# the search at the cost of a combinatorially longer run.
parameters = {
    # --- CountVectorizer ('vect') ---
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': (
        (1, 1),  # unigrams only
        (1, 2),  # unigrams and bigrams
    ),
    # --- TfidfTransformer ('tfidf') ---
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    # --- SGDClassifier ('clf') ---
    'clf__max_iter': (20,),
    # 'clf__max_iter': (10, 50, 80),  # alternative to the single value above
    'clf__alpha': (1e-5, 1e-6),
    'clf__penalty': ('l2', 'elasticnet'),
}

In [None]:
if __name__ == "__main__":
    # The search runs its fits in parallel (n_jobs=-1); multiprocessing
    # requires that to be launched from inside a __main__-protected block.

    # Jointly tune the feature-extraction and classifier settings by
    # searching over every combination in `parameters`.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        verbose=1,
        n_jobs=-1,
    )

    # Describe the run before starting it.
    print("Performing grid search...")
    print("pipeline:", [step[0] for step in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the complete search.
    fit_started = time()
    grid_search.fit(data.data, data.target)
    elapsed = time() - fit_started
    print("done in %0.3fs" % elapsed)
    print()

    # Report the best score, then the winning value of each parameter that
    # was part of the grid, in alphabetical key order.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for key in sorted(parameters):
        value = best_parameters[key]
        print("\t%s: %r" % (key, value))