In [141]:
# Lesson video: https://www.youtube.com/watch?v=du_HuGgABtw&feature=youtu.be
# Reference tutorial: https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn

import sys
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
import numpy as np

In [16]:
# Load the labelled movie-review corpus from a folder next to the notebook.
# The path is fixed here (it is not read from the command line).
# shuffle=False turns off load_files' random shuffling of the samples.
movie_reviews_data_folder = r"./data"
dataset = load_files(
    movie_reviews_data_folder,
    shuffle=False,
)
print("n_samples: %d" % len(dataset.data))

n_samples: 2000


In [79]:
# ====================================================================
# dataset.data          -> the review texts
# dataset.target        -> labels: 0 = negative review, 1 = positive review
# dataset.target_names  -> category names (the folders that split the
#                          texts into neg and pos)
# ====================================================================
# For the labels and then for the category names, show the first entry
# (a single element) next to the last entry (a length-1 slice).
for labelled in (dataset.target, dataset.target_names):
    print(labelled[0], labelled[-1:])


0 [1]
neg ['pos']


In [80]:
# Step 1 (manual version): learn the vocabulary from every review and turn
# the corpus into a document-term matrix of raw token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=dataset.data)


In [81]:
# Step 2a: term-frequency scaling only (use_idf=False), i.e. counts are
# normalised but not down-weighted by document frequency.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)


In [82]:
# Step 2b: full tf-idf weighting (default settings), fitted on and applied
# to the same count matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)


In [83]:
# Step 3: train a multinomial Naive Bayes classifier on the tf-idf features
# of the whole corpus (no hold-out split in this manual walkthrough).
clf = MultinomialNB()
clf.fit(X_train_tfidf, dataset.target)


In [88]:
# Unseen text must go through the SAME fitted vectorizer and tf-idf
# transformer (transform only, never fit) before it can be classified.
docs_new = [
    "great secret , an event which will change both of their lives forever",
]
X_new_counts = count_vect.transform(raw_documents=docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)


In [89]:
# Classify the new documents and print each one next to its category name.
predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    # `category` is the numeric label; map it back to the folder name.
    label_name = dataset.target_names[category]
    print('%r => %s' % (doc, label_name))


'great secret , an event which will change both of their lives forever' => pos


In [86]:
#====================================================================
#
#  SAME STEPS AS ABOVE, BUT NOW CHAINED IN A Pipeline
#
#
#====================================================================

In [162]:
# ====================================================================
# Hold out 30% of the reviews for testing.
# X = review text, Y = sentiment label (0, 1)
# random_state is fixed so the split is the same on every run.
# ====================================================================
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.3,
    random_state=1,
)

# Sanity check: tail of the first text in each partition, with its label.
for split_texts, split_labels in ((X_train, Y_train), (X_test, Y_test)):
    print(split_texts[0][-50:], split_labels[0])

b'hical user interface when we use our computers ! \n' 1
b'character , but hartman leaves him in the dust . \n' 0


In [100]:
#====================================================================
#  MultinomialNB
#====================================================================

In [91]:
# Vectorize -> tf-idf -> Naive Bayes, bundled so that fit/predict run every
# step in order on raw text.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)


In [92]:
# Train the whole pipeline on the training partition; the fitted pipeline
# is the cell's displayed result.
text_clf.fit(X_train, y=Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [93]:
# Predict a label for every held-out review.
predicted = text_clf.predict(X=X_test)

In [119]:
# Compare the two label arrays element-wise to see where they agree.
# First on the last ten test samples only ...
print(predicted[-10:], Y_test[-10:], sep="\n")
print((predicted[-10:] == Y_test[-10:]).mean())

# ... then over the whole test set: the fraction of matching labels,
# shown as the cell result.
(predicted == Y_test).mean()

[1 0 1 1 1 1 1 1 0 1]
[1 0 0 0 1 1 1 0 0 1]
0.7


0.81

In [103]:
# Per-class precision / recall / F1 on the test set, labelled with the
# category names instead of 0/1.
print(
    metrics.classification_report(
        y_true=Y_test,
        y_pred=predicted,
        target_names=dataset.target_names,
    )
)


              precision    recall  f1-score   support

         neg       0.85      0.75      0.80       299
         pos       0.78      0.87      0.82       301

    accuracy                           0.81       600
   macro avg       0.81      0.81      0.81       600
weighted avg       0.81      0.81      0.81       600



In [118]:
# Confusion matrix: rows are the true categories, columns are the predicted
# categories, and each cell counts how many test reviews landed there.
print(len(Y_test), len(predicted))
print(predicted[-10:], Y_test[-10:], sep="\n")

# We want to see how many texts were pos and got classified as pos, and how
# many were neg and got classified as neg (the diagonal of the matrix).
metrics.confusion_matrix(y_true=Y_test, y_pred=predicted)

600 600
[1 0 1 1 1 1 1 1 0 1]
[1 0 0 0 1 1 1 0 0 1]


array([[224,  75],
       [ 39, 262]])

In [101]:
#====================================================================
#  SGDClassifier
#====================================================================

In [123]:
# Same vectorize -> tf-idf front end, now ending in a linear classifier
# trained with stochastic gradient descent.
# SGDClassifier shuffles the training data, so without a fixed random_state
# the scores reported below change from run to run. The seed is pinned to the
# same value used for train_test_split so the section is reproducible.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(random_state=1)),
])


In [124]:
# Train the SGD pipeline on the training partition; the fitted pipeline is
# the cell's displayed result.
text_clf.fit(X_train, y=Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

In [125]:
# Predict a label for every held-out review with the SGD pipeline.
predicted = text_clf.predict(X=X_test)

In [126]:
# Compare the two label arrays element-wise to see where they agree.
# First on the last ten test samples only ...
print(predicted[-10:], Y_test[-10:], sep="\n")
print((predicted[-10:] == Y_test[-10:]).mean())

# ... then over the whole test set: the fraction of matching labels,
# shown as the cell result.
(predicted == Y_test).mean()

[1 0 0 0 1 1 1 1 0 1]
[1 0 0 0 1 1 1 0 0 1]
0.9


0.8366666666666667

In [127]:
# Per-class precision / recall / F1 on the test set, labelled with the
# category names instead of 0/1.
print(
    metrics.classification_report(
        y_true=Y_test,
        y_pred=predicted,
        target_names=dataset.target_names,
    )
)

              precision    recall  f1-score   support

         neg       0.84      0.83      0.84       299
         pos       0.83      0.84      0.84       301

    accuracy                           0.84       600
   macro avg       0.84      0.84      0.84       600
weighted avg       0.84      0.84      0.84       600



In [128]:
# Confusion matrix: rows are the true categories, columns are the predicted
# categories, and each cell counts how many test reviews landed there.
print(len(Y_test), len(predicted))
print(predicted[-10:], Y_test[-10:], sep="\n")

# We want to see how many texts were pos and got classified as pos, and how
# many were neg and got classified as neg (the diagonal of the matrix).
metrics.confusion_matrix(y_true=Y_test, y_pred=predicted)

600 600
[1 0 0 0 1 1 1 1 0 1]
[1 0 0 0 1 1 1 0 0 1]


array([[249,  50],
       [ 48, 253]])

In [122]:
#====================================================================
#  svm.SVC()
#====================================================================

In [155]:
# ====================================================================
# Hold out 30% of the reviews for testing.
# X = review text, Y = sentiment label
# Same arguments (and random_state) as the split made earlier in the
# notebook, so it yields the same partitions.
# ====================================================================
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.3,
    random_state=1,
)

# Sanity check: tail of the first text in each partition, with its label.
for split_texts, split_labels in ((X_train, Y_train), (X_test, Y_test)):
    print(split_texts[0][-50:], split_labels[0])

b'hical user interface when we use our computers ! \n' 1
b'character , but hartman leaves him in the dust . \n' 0


In [156]:
# Same vectorize -> tf-idf front end, now ending in a support vector
# classifier with its default settings.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', svm.SVC()),
    ]
)

In [157]:
# Quick look at the training partition before fitting: sizes of texts and
# labels, then the tail of the first text and its label.
print(len(X_train), len(Y_train))
print(X_train[0][-60:], Y_train[0], sep="\n")

# Train the SVC pipeline; the fitted pipeline is the cell's displayed result.
text_clf.fit(X_train, y=Y_train)

1400 1400
b' us a graphical user interface when we use our computers ! \n'
1


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,

In [158]:
# Predict a label for every held-out review with the SVC pipeline.
predicted = text_clf.predict(X=X_test)

In [159]:
# Compare the two label arrays element-wise to see where they agree.
# First on the last ten test samples only ...
print(predicted[-10:], Y_test[-10:], sep="\n")
print((predicted[-10:] == Y_test[-10:]).mean())

# ... then over the whole test set: the fraction of matching labels,
# shown as the cell result.
(predicted == Y_test).mean()

[1 0 1 0 1 1 1 1 0 1]
[1 0 0 0 1 1 1 0 0 1]
0.8


0.7966666666666666

In [160]:
# Per-class precision / recall / F1 on the test set, labelled with the
# category names instead of 0/1.
print(
    metrics.classification_report(
        y_true=Y_test,
        y_pred=predicted,
        target_names=dataset.target_names,
    )
)

              precision    recall  f1-score   support

         neg       0.79      0.81      0.80       299
         pos       0.81      0.78      0.79       301

    accuracy                           0.80       600
   macro avg       0.80      0.80      0.80       600
weighted avg       0.80      0.80      0.80       600



In [161]:
# Confusion matrix: rows are the true categories, columns are the predicted
# categories, and each cell counts how many test reviews landed there.
print(len(Y_test), len(predicted))
print(predicted[-10:], Y_test[-10:], sep="\n")

# We want to see how many texts were pos and got classified as pos, and how
# many were neg and got classified as neg (the diagonal of the matrix).
metrics.confusion_matrix(y_true=Y_test, y_pred=predicted)

600 600
[1 0 1 0 1 1 1 1 0 1]
[1 0 0 0 1 1 1 0 0 1]


array([[242,  57],
       [ 65, 236]])

In [None]:
#====================================================================
#  GridSearchCV() - brute-force search for the best parameters
#
# explained at this point in the lesson video:
# https://youtu.be/du_HuGgABtw?t=2997
#
#====================================================================

In [168]:
# GridSearchCV is already imported in the notebook's import cell, so it is
# not re-imported here.
#
# Parameter grid: keys use the "<step name>__<parameter>" form to reach into
# the pipeline steps named 'vect' and 'tfidf'. Four combinations in total:
# unigrams vs. unigrams+bigrams, with and without IDF weighting.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False)
}
# NOTE: text_clf is whichever pipeline was assigned most recently in the
# session (it is rebound for each classifier above), so the search tunes that
# one. n_jobs=-1 lets the search use all available processors.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)


In [169]:
# Run the search on the training partition. fit() returns the search object
# itself, so rebinding the name keeps the cell free of output.
gs_clf = gs_clf.fit(X_train, y=Y_train)

In [172]:
# Classify a one-word document with the fitted search object and show the
# category name for the predicted label.
predicted_label = gs_clf.predict(['xmen'])[0]
dataset.target_names[predicted_label]

'neg'

In [174]:
# Best score found by the search, followed by the winning value of each
# searched parameter in alphabetical order.
print(gs_clf.best_score_)

for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))

0.8099999999999999
tfidf__use_idf: True
vect__ngram_range: (1, 1)
