In [163]:
#
#
#
#
# MultinomialNB = 0.788
# svm.SVC       = 0.796
# SGDClassifier = 0.838
#
#  Apesar do usar GridSearchCV os parâmetros não trouxeram bons resultados, os parâmetros default foram os que deram melhor resultados.
#  Um pouco mais de pesquisa certamente traria melhores resultados.
#

In [1]:
#
#  Onde passa um boi, passa a boiada.
#  Então vamos carregar tudo mesmo.
#
import sys
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
import numpy as np

In [2]:
#
#  carregando dados
#
movie_reviews_data_folder = r"./data"
dataset = load_files(movie_reviews_data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))

n_samples: 2000


In [3]:
#
#  deixar 30% para teste
#
X_train, X_test, Y_train, Y_test = train_test_split(dataset.data, dataset.target, test_size=0.3, random_state=1)

print (X_train[0][-50:]  ,  Y_train[0])    
print (X_test[0][-50:]  ,  Y_test[0])

b'hical user interface when we use our computers ! \n' 1
b'character , but hartman leaves him in the dust . \n' 0


In [4]:
#
#  Nosso pipeline
#
filmes_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])


In [5]:
#
# roda o baú
#
filmes_clf.fit(X_train, Y_train) 

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [6]:
#
# 
#
predicted = filmes_clf.predict(X_test)

In [7]:
#
# margem de erro do ibope
#
np.mean(predicted == Y_test)

0.755

In [8]:
#
#
print(metrics.classification_report(Y_test, predicted,
    target_names=dataset.target_names))

              precision    recall  f1-score   support

         neg       0.71      0.87      0.78       299
         pos       0.83      0.64      0.72       301

    accuracy                           0.76       600
   macro avg       0.77      0.76      0.75       600
weighted avg       0.77      0.76      0.75       600



In [9]:
#
#
metrics.confusion_matrix(Y_test, predicted)

array([[261,  38],
       [109, 192]])

In [None]:
#====================================================================
#
#
#  GridSearchCV() 
#
#
#====================================================================

In [71]:
#
#  vamos olhar as opcoes de parâmetros
#  https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
#
#
#
#
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
              'vect__stop_words':('None','english'),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0)
}
gs_clf = GridSearchCV(filmes_clf, parameters, n_jobs=-1)

In [72]:
gs_clf = gs_clf.fit(X_train, Y_train)

In [73]:
#
#
predicted = filmes_clf.predict(X_test)

In [74]:
print( gs_clf.best_score_ )                                 

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.6542857142857144
clf__alpha: 0.9
tfidf__use_idf: True
vect__ngram_range: (1, 2)
vect__stop_words: 'english'


In [None]:
#
#
#  VAMOS RODAR MultinomialNB COM OS NOVOS PARÂMETROS
#
#

In [83]:
filmes_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2),stop_words='english')),
                     ('tfidf', TfidfTransformer(use_idf=False)), 
                     ('clf', MultinomialNB(alpha=0.1))]) 

In [84]:
filmes_clf.fit(X_train, Y_train)

In [85]:
predicted = filmes_clf.predict(X_test)

In [86]:
#
# 3% de melhora é melhor que nada
#
np.mean(predicted == Y_test)

0.7883333333333333

In [87]:
#
#
metrics.confusion_matrix(Y_test, predicted)

array([[246,  53],
       [ 74, 227]])

In [None]:
#
#
#  svm.SVC()
#
#

In [157]:
filmes_clf = Pipeline([('vect', CountVectorizer()),  #ngram_range=(1, 2),stop_words='english'
                     ('tfidf', TfidfTransformer()),  #use_idf=False
                     ('clf', svm.SVC())]) 

In [158]:
filmes_clf.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,

In [159]:
predicted = filmes_clf.predict(X_test)

In [160]:
np.mean(predicted == Y_test)

0.7966666666666666

In [161]:
metrics.confusion_matrix(Y_test, predicted)

array([[242,  57],
       [ 65, 236]])

In [None]:
#
#
#  SGDClassifier()
#
#

In [58]:
filmes_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2),stop_words='english')),
                     ('tfidf', TfidfTransformer(use_idf=False)), 
                     ('clf', SGDClassifier())]) 

In [59]:
filmes_clf.fit(X_train, Y_train)

In [60]:
predicted = filmes_clf.predict(X_test)

In [61]:
#
#  Bom resultado, este morece uma olhada nos parâmetros
#
np.mean(predicted == Y_test)

0.8183333333333334

In [62]:
metrics.confusion_matrix(Y_test, predicted)

array([[246,  53],
       [ 56, 245]])

In [None]:
#====================================================================
#
#
#  GridSearchCV() - vamos otimizar SGDClassifier
#
#
#====================================================================

In [67]:
#
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

from sklearn.model_selection import GridSearchCV
parameters = {  
              'clf__loss': ('squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive','hinge', 'log','modified_huber','squared_hinge','perceptron'),
              'clf__penalty':('l2','l1','elasticnet'),
              'clf__alpha': (0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0),
              'clf__shuffle': (True, False)
             }
gs_clf = GridSearchCV(filmes_clf, parameters, n_jobs=-1)

In [68]:
gs_clf = gs_clf.fit(X_train, Y_train)

In [69]:
#
#
predicted = filmes_clf.predict(X_test)

In [70]:
print( gs_clf.best_score_ )                                 

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.7935714285714285
clf__alpha: 0.1
clf__loss: 'perceptron'
clf__penalty: 'l2'
clf__shuffle: False


In [None]:
#
#
#  SGDClassifier() - custom
#
#

In [149]:
filmes_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()), 
                     ('clf', SGDClassifier(shuffle=False,penalty='l2',loss='perceptron',alpha=0.1))]) 

In [150]:
filmes_clf.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=0.1, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='perceptron',
                               max_iter=100

In [151]:
predicted = filmes_clf.predict(X_test)

In [152]:
np.mean(predicted == Y_test)
#
#  nao ficou tão bom assim, vamos cavar mais.
#

0.7683333333333333

In [153]:
filmes_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()), 
                     ('clf', SGDClassifier())]) 

In [154]:
filmes_clf.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

In [155]:
predicted = filmes_clf.predict(X_test)

In [156]:
np.mean(predicted == Y_test)
#
#
# bom é isso aew, o default ainda está melhor.
#
#

0.8383333333333334