Working with text data: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

# Домашнее задание

## Loading Data

In [1]:
import sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Restrict the corpus to four of the twenty newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [3]:
# Fetch the train and test splits with identical settings; only `subset` differs.
# The fixed random_state keeps the shuffled order the same between runs.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=subset_name, categories=categories, shuffle=True, random_state=42)
    for subset_name in ('train', 'test')
)

## Lemmatize Data

In [4]:
import spacy
# Load the English pipeline by its full package name. The 'en' shortcut link
# only exists in spaCy v2 and was removed in v3, whereas the package name
# works in both.
# NOTE(review): assumes the 'en' link pointed at en_core_web_sm (the default
# for `python -m spacy download en`) - confirm for your environment.
nlp = spacy.load('en_core_web_sm')

In [5]:
def lemmatized_text(text):
    """Return the lemmas of all tokens in ``text`` joined by single spaces.

    ``text`` is iterated item by item and each item's ``lemma_`` attribute is
    read; the caller in this notebook passes the result of ``nlp(...)``.
    An empty iterable yields an empty string.
    """
    return ' '.join([token.lemma_ for token in text])

In [6]:
def prepare_dataset(data):
    """Lemmatize every document in ``data``.

    Args:
        data: Iterable of raw text strings.

    Returns:
        A list with one space-joined lemma string per input document,
        in the same order as the input.
    """
    # nlp.pipe processes the texts as a batched stream and yields the parsed
    # documents in input order, which avoids the per-call overhead of
    # invoking nlp(text) separately for every document.
    return [lemmatized_text(doc) for doc in nlp.pipe(data)]
    

In [7]:
# Replace the raw training documents with their lemmatized form.
# NOTE: this overwrites twenty_train.data in place, so re-running this cell
# would feed already-lemmatized text through prepare_dataset a second time.
twenty_train.data = prepare_dataset(twenty_train.data)

In [8]:
# Replace the raw test documents with their lemmatized form.
# NOTE: this overwrites twenty_test.data in place, so re-running this cell
# would feed already-lemmatized text through prepare_dataset a second time.
twenty_test.data = prepare_dataset(twenty_test.data)

## Extracting Features

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [10]:
# Build the vocabulary from the training documents and convert them into a
# matrix of token counts in a single step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [11]:
# Dimensions of the count matrix: (number of documents, vocabulary size).
X_train_counts.shape

(2257, 31182)

In [12]:
# Column index the fitted vocabulary assigned to the token 'algorithm'
# (None if the token was never seen during fitting).
count_vect.vocabulary_.get('algorithm')

4168

In [13]:
# Re-weight the raw term counts with tf-idf; the transformer is fitted on the
# training counts and reused later for new documents.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Display the shape of the re-weighted matrix.
X_train_tfidf.shape

(2257, 31182)

In [14]:
# Inspect the type of a single row selected from the tf-idf matrix.
type(X_train_tfidf[0])

scipy.sparse.csr.csr_matrix

In [15]:
# Number of non-zero tf-idf entries in the first training document's row.
X_train_tfidf[0].count_nonzero()

71

## Training a classifier

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the tf-idf features and the
# training labels; fit() returns the estimator, which is kept as `clf`.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)


In [17]:
# Classify a few hand-written sentences. They go through the already fitted
# vectorizer and tf-idf transformer (transform only, no refitting).
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'There is no god',
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Predictions are integer class ids; map them back to newsgroup names.
label_names = twenty_train.target_names
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, label_names[category]))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'There is no god' => soc.religion.christian


## Building a pipeline

In [18]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> tf-idf -> Naive Bayes so raw text can be passed
# directly to fit() and predict().
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])


In [19]:
# Fit every pipeline step on the (lemmatized) training documents and labels.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Evaluation of the performance on the test set

In [20]:
import numpy as np
# Accuracy on the test split: the fraction of predictions equal to the
# true labels.
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
(predicted == twenty_test.target).mean()

0.84221038615179755

In [21]:
from sklearn.linear_model import SGDClassifier
# Same vectorizer + tf-idf front end, now with a linear model trained by
# stochastic gradient descent (hinge loss, L2 penalty, fixed seed).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Test-set accuracy of the SGD pipeline.
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.90279627163781628

In [22]:
from sklearn.neural_network import MLPClassifier
# Same vectorizer + tf-idf front end, now with a multi-layer perceptron.
# MLPClassifier initialises its weights randomly, so the seed is fixed (as in
# the SGD cell) to make the reported accuracy reproducible between runs.
text_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(random_state=42)),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Test-set accuracy of the MLP pipeline.
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.93075898801597867

In [23]:
from sklearn import metrics
# Per-class precision / recall / F1 for the most recent predictions.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.85      0.90       319
         comp.graphics       0.94      0.97      0.95       389
               sci.med       0.95      0.92      0.93       396
soc.religion.christian       0.88      0.97      0.93       398

           avg / total       0.93      0.93      0.93      1502



In [24]:
# Confusion matrix for the most recent predictions
# (rows: true class, columns: predicted class).
metrics.confusion_matrix(twenty_test.target, predicted)

array([[271,   3,  11,  34],
       [  4, 376,   4,   5],
       [  2,  18, 364,  12],
       [  3,   4,   4, 387]])

## Выводы

В результате применения лемматизации удалось повысить качество только для классификатора MultinomialNB (0.842). Качество работы классификатора SGDClassifier упало до 0.902.
Однако были исследованы и другие модели, и установлено, что классификатор MLPClassifier (многослойный перцептрон) показывает самое высокое качество — 0.93.