# Working With Text Data

The goal of this guide is to explore some of the main scikit-learn tools on a single practical task: analysing a collection of text documents.

The tutorial can be found [here](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html).

# Loading data 

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups this walkthrough is restricted to.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split for the selected newsgroups only; shuffling is enabled with
# a fixed random_state so the document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Pre-processing (Extracting features)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Feature extractors, both with default settings. They are kept as named
# objects because new documents must later go through the same fitted steps.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Step 1: tokenize the training documents and build the token-count matrix.
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Step 2: turn the raw counts into the TF-IDF matrix.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Training a classifier

In [30]:
from sklearn.naive_bayes import MultinomialNB

# Build the model: multinomial Naive Bayes trained on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# Try the model on two unseen documents. They are only transformed (never
# fitted) so they go through the vocabulary and weights learned from training.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Predictions are integer indices into target_names; print the readable name.
for text, label_index in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_index]
    print('%r => %s' % (text, label_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


# Performance Evaluation

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import numpy

# Held-out test subset, restricted to the same categories as the training set.
twenty_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Pipeline 1: token counts -> TF-IDF -> multinomial Naive Bayes.
naive_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
naive_clf = naive_clf.fit(twenty_train.data, twenty_train.target)
naive_predicted = naive_clf.predict(twenty_test.data)
# Accuracy = fraction of test documents whose predicted label matches the target.
print('naive bayes: {}'.format(numpy.mean(naive_predicted == twenty_test.target)))

# Pipeline 2: token counts -> TF-IDF -> linear SVM trained with SGD (hinge loss).
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so passing
# it raises TypeError on current releases. `max_iter=5` together with
# `tol=None` (no early stopping) keeps the original fixed five-epoch training.
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge', penalty='l2', alpha=1e-3,
        max_iter=5, tol=None, random_state=42,
    )),
])
svm_clf = svm_clf.fit(twenty_train.data, twenty_train.target)
svm_predicted = svm_clf.predict(twenty_test.data)
print('svm: {}'.format(numpy.mean(svm_predicted == twenty_test.target)))

naive bayes: 0.8348868175765646
svm: 0.9127829560585885


# Metrics Analysis

In [39]:
from sklearn import metrics

# For each classifier, print its heading, the per-class report (precision,
# recall, f1-score, support) and the confusion matrix on the test subset.
# The SVM heading starts with a newline to separate the two reports.
for heading, predictions in (
    ('Naive Bayes', naive_predicted),
    ('\nSVM', svm_predicted),
):
    print(heading)
    report = metrics.classification_report(
        twenty_test.target,
        predictions,
        target_names=twenty_test.target_names,
    )
    print(report)
    print(metrics.confusion_matrix(twenty_test.target, predictions))

Naive Bayes
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502

[[192   2   6 119]
 [  2 347   4  36]
 [  2  11 322  61]
 [  2   2   1 393]]

SVM
                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502

[[258  11  15  35]
 [  4 379   3   3]
 [  5  33 355   3]
 [  5  10   4 379]]
