In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, MeanShift

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_curve

import numpy as np

import time

In [2]:
# Load the 'train' and 'test' subsets of the 20 Newsgroups corpus.
news = fetch_20newsgroups(subset='train')
Text_train = news.data
Y_train = news.target

news_test = fetch_20newsgroups(subset='test')
Text_test = news_test.data
Y_test = news_test.target

# Bag-of-words counts, capped at 5000 features with English stop words removed.
# The vocabulary is learned from the training texts only; the test texts are
# merely projected onto it. fit_transform() avoids tokenising the training
# corpus twice (once for fit, once for transform).
processor = CountVectorizer(max_features=5000, stop_words='english')
Sparse_train = processor.fit_transform(Text_train)
Sparse_test = processor.transform(Text_test)

# The classifiers below are fitted on dense arrays. toarray() returns an
# ndarray directly, whereas todense() builds an np.matrix that then has to be
# copied into an ndarray -- an extra full-size copy for each matrix.
X_train = Sparse_train.toarray()
X_test = Sparse_test.toarray()

labels = news.target_names
vocabulary = processor.vocabulary_

In [3]:
# Candidate classifiers, keyed by the name printed during evaluation.
# Each estimator that is given a seed uses random_state=0 so that a
# Restart & Run All reproduces the same scores; GradientBoostingClassifier
# subsamples features (max_features=20) and therefore needs one as well.
models = {
    'LinearSVC': LinearSVC(random_state=0),
    'RandomForestClassifier': RandomForestClassifier(
        n_estimators=100, random_state=0, max_features=20
    ),
    # NOTE(review): k-NN was already disabled in this cell; the reason is not
    # recorded here -- confirm before re-enabling.
    # 'KNeighborsClassifier3': KNeighborsClassifier(n_neighbors=3),
    'GradientBoostingClassifier': GradientBoostingClassifier(
        max_depth=5, max_features=20, random_state=0
    ),
    'GaussianNB': GaussianNB(),
}

In [4]:
%%time
# Fit every candidate model on the training split, score it on the test split,
# and record wall-clock timings for both phases.
#
# Results:
#   classifiers[name] -> the fitted estimator
#   evaluations[name] -> {'training_time': seconds spent in fit(),
#                         'scoring_time':  seconds spent in score(),
#                         'score':         value returned by score() on the test set}
classifiers = {}
evaluations = {}
for name, model in models.items():
    print('\nEvaluating model {}'.format(name))

    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    classifier = model.fit(X_train, Y_train)
    training_time = time.perf_counter() - start
    print("Training time %.3f"%(training_time))

    start = time.perf_counter()
    score = classifier.score(X_test, Y_test)
    scoring_time = time.perf_counter() - start

    classifiers[name] = classifier
    evaluations[name] = {
        'training_time': training_time,
        # Previously this key was filled with training_time by mistake.
        'scoring_time': scoring_time,
        'score': score,
    }

    print("Test set score for {}: {:.2f}".format(name, score))
    print("Scoring time %.3f s"%(scoring_time))


Evaluating model LinearSVC
Training time 3.639
Test set score for LinearSVC: 0.73
Scoring time 0.157 s

Evaluating model RandomForestClassifier
Training time 24.733
Test set score for RandomForestClassifier: 0.77
Scoring time 0.995 s

Evaluating model GradientBoostingClassifier
Training time 67.213
Test set score for GradientBoostingClassifier: 0.73
Scoring time 1.466 s

Evaluating model GaussianNB
Training time 0.990
Test set score for GaussianNB: 0.60
Scoring time 8.221 s
Wall time: 1min 47s
