## Sample pipeline for text feature extraction
Source: http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html

In [1]:
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s %(levelname)s %(message)s')

In [4]:
categories = ['alt.atheism', 'talk.religion.misc']
data = fetch_20newsgroups(subset='train', categories=categories)
print('%d documents' % len(data.filenames))
print('%d categories' % len(data.target_names))
print()

857 documents
2 categories



In [10]:
pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier())
    ])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    #'clf__alpha': (0.00001, 0.00002),
    #'clf__penalty' : ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80)
}

In [12]:
grid_search = GridSearchCV(pipeline, parameters, verbose=1, scoring='recall')
print("Performing grid search...")
print("pipeline", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   49.2s
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  1.2min finished


Performing grid search...
pipeline ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
done in 72.515s

Best score: 0.942
Best parameters set:


In [13]:
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf: SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
	clf__alpha: 0.0001
	clf__average: False
	clf__class_weight: None
	clf__epsilon: 0.1
	clf__eta0: 0.0
	clf__fit_intercept: True
	clf__l1_ratio: 0.15
	clf__learning_rate: 'optimal'
	clf__loss: 'hinge'
	clf__n_iter: 5
	clf__n_jobs: 1
	clf__penalty: 'l2'
	clf__power_t: 0.5
	clf__random_state: None
	clf__shuffle: True
	clf__verbose: 0
	clf__warm_start: False
	steps: [('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\