In [30]:
from sklearn.datasets import fetch_20newsgroups
# Load the "train" and "test" subsets of the 20 Newsgroups corpus, in that order.
train_news, test_news = (
    fetch_20newsgroups(subset=split) for split in ("train", "test")
)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Naive Bayes text classifier, assembled as a three-stage pipeline:
# token counts -> TF-IDF transform -> multinomial naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [14]:
import numpy as np

# Fit the naive Bayes pipeline on the training documents and their labels.
text_clf = text_clf.fit(train_news.data, train_news.target)

# Predict labels for the test documents.
predict = text_clf.predict(test_news.data)

# Accuracy: fraction of test documents whose predicted label equals the true label.
nb_accuracy = np.mean(predict == test_news.target)
print(nb_accuracy)

0.77389803505


In [31]:
# Linear SVM (SGDClassifier with its default loss), built through the same
# pipeline stages as the naive Bayes model above.
from sklearn.linear_model import SGDClassifier
text_clf2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # SGD training is stochastic; a fixed random_state makes the reported
    # accuracy reproducible across kernel restarts.
    ('clf', SGDClassifier(random_state=42)),
])
text_clf2 = text_clf2.fit(train_news.data, train_news.target)
predict = text_clf2.predict(test_news.data)
# Accuracy: fraction of test documents whose predicted label equals the true label.
print(np.mean(predict == test_news.target))



0.852894317578


In [42]:
# Tune the SVM pipeline's hyperparameters with an exhaustive grid search.
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.grid_search module was removed in 0.20, so fall back to it only on
# installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Keys use the "<pipeline step>__<parameter>" form to address a step's parameter.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without inverse-document-frequency weighting
    'clf__alpha': (1e-2, 1e-3),              # SGDClassifier's alpha parameter
}
# n_jobs=-1 is passed through unchanged to run the search in parallel.
gs_clf = GridSearchCV(text_clf2, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_news.data, train_news.target)

In [43]:
# Report the outcome of the grid search: the best score first, then the
# parameter combination that produced it.
best_score = gs_clf.best_score_
best_params = gs_clf.best_params_
print(best_score)
print(best_params)

0.8986211773024572
{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
