# In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV



# Restrict the corpus to two newsgroups, which makes this a two-class problem.
categories = ['comp.sys.mac.hardware', 'rec.sport.hockey']

# Both splits are fetched with identical settings: the same categories, with
# headers, footers and quoted replies removed from every post.
_fetch_kwargs = {
    'categories': categories,
    'remove': ('headers', 'footers', 'quotes'),
}
newsgroups_train = fetch_20newsgroups(subset='train', **_fetch_kwargs)
newsgroups_test = fetch_20newsgroups(subset='test', **_fetch_kwargs)


from sklearn.pipeline import Pipeline

# Text-classification pipeline: raw documents -> token counts (English stop
# words dropped) -> TF-IDF weighting -> k-nearest-neighbours classifier.
# The step names ('vect', 'tfidf', 'knn') are the prefixes of the
# '<step>__<param>' keys used when tuning the pipeline.
_pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('knn', KNeighborsClassifier()),
]
classifier = Pipeline(_pipeline_steps)


# Hyper-parameter grid, keyed as '<pipeline step>__<parameter>':
#   - unigrams only vs. unigrams + bigrams,
#   - TF-IDF weighting with and without the IDF term,
#   - every neighbour count from 100 to 299 inclusive.
# That is 2 * 2 * 200 = 800 candidate settings in total.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    knn__n_neighbors=[*range(100, 300)],
)

# Cross-validated exhaustive search over `parameters`, run in parallel on all
# available cores (n_jobs=-1). `fit` returns the fitted search object itself,
# so construction and fitting can be chained.
gs_classifier = GridSearchCV(classifier, parameters, n_jobs=-1).fit(
    newsgroups_train.data,
    newsgroups_train.target,
)

# Report the best mean cross-validation score, then the parameter combination
# that achieved it, one per line.
print(gs_classifier.best_score_, gs_classifier.best_params_, sep='\n')

# Recorded output from earlier runs (best cross-validation score, then best parameters):
# 0.9567062818336163
# {'knn__n_neighbors': 107, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
# 0.9558573853989814
# {'knn__n_neighbors': 205, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

