In [3]:
%pylab inline
import IPython
import sklearn as sk
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load every post of the 20 Newsgroups corpus (subset="all").
news = fetch_20newsgroups(subset="all")

In [14]:
# Quick look at the loaded bunch: available keys, class names, and the
# number of documents / labels.
for summary in (
    news.keys(),
    news['target_names'],
    news['filenames'].shape,
    news['target'].shape,
):
    print(summary)

dict_keys(['filenames', 'target_names', 'description', 'data', 'target', 'DESCR'])
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
(18846,)
(18846,)


In [19]:
# Positional hold-out split: the first SPLIT_PERC of the documents are used
# for training, the remainder for testing.
SPLIT_PERC = 0.75
split_size = int(SPLIT_PERC * len(news.data))
X_train, X_test = news.data[:split_size], news.data[split_size:]
y_train, y_test = news.target[:split_size], news.target[split_size:]

In [20]:
# `sklearn.cross_validation` was removed from scikit-learn; the same tools
# live in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    """Score ``clf`` with shuffled K-fold cross-validation and print the results.

    Parameters
    ----------
    clf : estimator (or Pipeline) handed to ``cross_val_score``.
    X : the samples to split into folds.
    y : the targets matching ``X``.
    K : int, number of folds.

    Prints the array of per-fold scores followed by their mean and the
    standard error of the mean. Returns nothing.
    """
    # K shuffled folds; the fixed random_state keeps the folds identical
    # from one call to the next so different classifiers are comparable.
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    # No `scoring` is passed, so the estimator's own score method is used.
    scores = cross_val_score(clf, X, y, cv=cv)
    print (scores)
    print (("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores)))

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

# Three vectorizer variants feeding the same Multinomial Naive Bayes
# classifier: raw counts, hashed counts, and TF-IDF weights.
clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    # `non_negative=True` was removed from HashingVectorizer;
    # `alternate_sign=False` is its replacement and keeps the hashed
    # features non-negative, which MultinomialNB needs.
    ('vect', HashingVectorizer(alternate_sign=False)),
    ('clf', MultinomialNB()),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [23]:
# Compare the three baseline pipelines with 5-fold cross-validation.
clfs = [
    clf_1,
    clf_2,
    clf_3,
]
for clf in clfs:
    evaluate_cross_validation(clf, news.data, news.target, K=5)

[ 0.85782493  0.85725657  0.84664367  0.85911382  0.8458477 ]
Mean score: 0.853 (+/-0.003)
[ 0.75543767  0.77659857  0.77049615  0.78508888  0.76200584]
Mean score: 0.770 (+/-0.005)
[ 0.84482759  0.85990979  0.84558238  0.85990979  0.84213319]
Mean score: 0.850 (+/-0.004)


In [26]:
# TF-IDF + Naive Bayes with a custom token pattern: a token is a run of
# letters, digits, '_', '-' or '.' that is at least three characters long
# and contains a letter somewhere other than its first and last position.
clf_4 = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b"),
    ),
    (
        'clf',
        MultinomialNB(),
    ),
])

In [28]:
# 5-fold cross-validation of the custom-token-pattern pipeline.
evaluate_cross_validation(clf_4, news.data, news.target, K=5)


[ 0.86100796  0.8718493   0.86203237  0.87291059  0.8588485 ]
Mean score: 0.865 (+/-0.003)


In [33]:
def get_stop_words(path='./data/stopwords_en.txt'):
    """Read a stop-word file (one word per line) into a set.

    Parameters
    ----------
    path : str, location of the stop-word file. Defaults to the English
        list shipped next to the notebook.

    Returns
    -------
    set of str
        Every non-empty line of the file with surrounding whitespace removed.
    """
    # The context manager closes the file even if reading fails; the
    # original left the handle open.
    with open(path, 'r') as stop_file:
        stripped_lines = (line.strip() for line in stop_file)
        # Blank lines would otherwise add a meaningless '' entry.
        return {word for word in stripped_lines if word}

In [35]:
# Load the stop-word set once and show it; the pipelines below reuse it.
stop_words = get_stop_words()
print(stop_words)

{'sometime', 'below', 'elsewhere', 'found', 'seem', 'never', 'thereby', 'own', 'along', 'there', 'somehow', 'nothing', 'would', 'themselves', 'on', 'such', 'hasnt', 'describe', 'thereupon', 'whatever', 'whether', 'seems', 'within', 'where', 'cant', 'thick', 'his', 'although', 'could', 'ie', 'however', 'throughout', 'many', 'get', 'sincere', 'thence', 'well', 'their', 'of', 'very', 'hereupon', 'but', 'might', 'per', 'indeed', 'show', 'thus', 'everything', 'against', 'he', 'into', 'other', 'to', 'front', 'everywhere', 'whence', 'more', 'those', 'most', 'will', 'both', 'whereby', 'either', 'myself', 'therein', 'these', 'thru', 'toward', 'move', 'two', 'a', 'cannot', 'every', 'be', 'become', 'ltd', 'rather', 'anywhere', 'us', 'few', 'wherever', 'except', 'hereafter', 'here', 'becoming', 'it', 'until', 'moreover', 'they', 'put', 'forty', 'see', 'nor', 'through', 'whereafter', 'everyone', 'detail', 'onto', 'etc', 'therefore', 'any', 'not', 'them', 'anything', 'mine', 'find', 'much', 'some', 

In [37]:
# Same as the custom-token-pattern pipeline, plus stop-word removal.
clf_5 = Pipeline([
    ('vect', TfidfVectorizer(
                # scikit-learn documents `stop_words` as 'english', a list
                # or None; recent releases reject a set, so pass a list.
                stop_words=sorted(stop_words),
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB()),
])

In [39]:
# 5-fold cross-validation of the stop-word-filtered pipeline.
evaluate_cross_validation(clf_5, news.data, news.target, K=5)


[ 0.88116711  0.89519767  0.88325816  0.89227912  0.88113558]
Mean score: 0.887 (+/-0.003)


In [41]:
# Stop-word-filtered TF-IDF pipeline with a much smaller Naive Bayes
# smoothing parameter (alpha=0.01).
clf_7 = Pipeline([
    ('vect', TfidfVectorizer(
                # scikit-learn documents `stop_words` as 'english', a list
                # or None; recent releases reject a set, so pass a list.
                stop_words=sorted(stop_words),
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])

In [42]:
# 5-fold cross-validation of the alpha=0.01 pipeline.
evaluate_cross_validation(clf_7, news.data, news.target, K=5)


[ 0.9204244   0.91960732  0.91828071  0.92677103  0.91854603]
Mean score: 0.921 (+/-0.002)


In [44]:
from sklearn import metrics

def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and print its evaluation.

    The report consists of the value of ``clf.score`` on the training and on
    the testing split, followed by the classification report and the
    confusion matrix computed from the predictions on ``X_test``.
    ``clf`` is fitted in place; nothing is returned.
    """
    clf.fit(X_train, y_train)

    print("Accuracy on training set:")
    train_accuracy = clf.score(X_train, y_train)
    print(train_accuracy)

    print("Accuracy on testing set:")
    test_accuracy = clf.score(X_test, y_test)
    print(test_accuracy)

    # Predictions on the held-out split drive both reports below.
    y_pred = clf.predict(X_test)

    print("Classification Report:")
    report = metrics.classification_report(y_test, y_pred)
    print(report)

    print("Confusion Matrix:")
    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion)

In [47]:
# Fit the alpha=0.01 pipeline on the training split and report on the hold-out split.
train_and_evaluate(clf_7, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


Accuracy on training set:
0.996957690675
Accuracy on testing set:
0.917869269949
Classification Report:
             precision    recall  f1-score   support

          0       0.95      0.88      0.91       216
          1       0.85      0.85      0.85       246
          2       0.91      0.84      0.87       274
          3       0.81      0.86      0.83       235
          4       0.88      0.90      0.89       231
          5       0.89      0.91      0.90       225
          6       0.88      0.80      0.84       248
          7       0.92      0.93      0.93       275
          8       0.96      0.98      0.97       226
          9       0.97      0.94      0.96       250
         10       0.97      1.00      0.98       257
         11       0.97      0.97      0.97       261
         12       0.90      0.91      0.91       216
         13       0.94      0.95      0.95       257
         14       0.94      0.97      0.95       246
         15       0.90      0.96      0.93     

In [51]:
# Peek at ten entries of the fitted vocabulary. `get_feature_names()` was
# removed from recent scikit-learn in favour of `get_feature_names_out()`,
# so use whichever accessor this installation provides.
vectorizer = clf_7.named_steps['vect']
list_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
# Convert to plain str so the display is the same for a list or an array.
[str(name) for name in list_feature_names()[100:110]]

['01_introduction.ma',
 '01apr93.17160985.0059',
 '01c8',
 '01f6',
 '01h0',
 '01ll',
 '01ne',
 '01ob',
 '01vl2',
 '01ya']

In [52]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    """Callable tokenizer: NLTK word tokenization followed by WordNet lemmatization."""

    def __init__(self):
        # A single lemmatizer is created here and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens found in ``doc``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

In [55]:
clf_8 = Pipeline([
    ('vect', TfidfVectorizer(
                # WordNet-based tokenizer that lemmatizes every token.
                # A custom `tokenizer` replaces the regex tokenization step,
                # so the `token_pattern` used by the earlier pipelines would
                # be silently ignored here and is therefore not passed.
                tokenizer=LemmaTokenizer(),
                # scikit-learn documents `stop_words` as 'english', a list
                # or None; recent releases reject a set, so pass a list.
                stop_words=sorted(stop_words),
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])

# Side-by-side comparison: regex-tokenized pipeline vs. lemmatizing pipeline.
evaluate_cross_validation(clf_7, news.data, news.target, 5)
evaluate_cross_validation(clf_8, news.data, news.target, 5)

[ 0.9204244   0.91960732  0.91828071  0.92677103  0.91854603]
Mean score: 0.921 (+/-0.002)
[ 0.91352785  0.91615813  0.91509684  0.92464845  0.91615813]
Mean score: 0.917 (+/-0.002)


In [56]:
# Fit the lemmatizing pipeline on the training split and report on the hold-out split.
train_and_evaluate(clf_8, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

Accuracy on training set:
0.99596717136
Accuracy on testing set:
0.911926994907
Classification Report:
             precision    recall  f1-score   support

          0       0.94      0.88      0.91       216
          1       0.84      0.82      0.83       246
          2       0.89      0.84      0.87       274
          3       0.81      0.85      0.83       235
          4       0.89      0.88      0.88       231
          5       0.87      0.92      0.90       225
          6       0.87      0.80      0.83       248
          7       0.92      0.91      0.92       275
          8       0.94      0.97      0.96       226
          9       0.96      0.94      0.95       250
         10       0.96      1.00      0.98       257
         11       0.96      0.98      0.97       261
         12       0.90      0.92      0.91       216
         13       0.94      0.95      0.94       257
         14       0.95      0.95      0.95       246
         15       0.90      0.96      0.93      