In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from IPython.display import display
import plotly_express as px
import plotly.io as pio
# Make 'colab' the default plotly renderer for every figure shown below.
# NOTE(review): this targets Google Colab; confirm it is the intended
# renderer when the notebook is run in another front end.
pio.renderers.default = 'colab'

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from preprocessing import stopwords, preprocess
from datasets import fake_br_corpus

[nltk_data] Downloading package stopwords to /home/aldo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the data through the project's `fake_br_corpus` helper (presumably the
# training split, judging by the function name). Later cells read
# `dataset.text` and `dataset.label`.
# NOTE(review): the meaning of the positional `True` flag is not visible
# here; confirm against `fake_br_corpus.loadTrain`.
dataset = fake_br_corpus.loadTrain(True)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import log_loss

# Fixed seed so the randomized tree-based candidates score the same on every run.
RANDOM_STATE = 42

# Two-step pipeline: TF-IDF features from the raw text, then a classifier.
pipeline = Pipeline([
    (
        'vect',
        TfidfVectorizer(
            stop_words=stopwords,
            preprocessor=preprocess,
            max_features=1000,  # overridden by 'vect__max_features' in the grid
        ),
    ),
    # Placeholder only: every grid entry below replaces the 'clf' step.
    ('clf', SVC(probability=True)),
])

# Vectorizer settings searched for every classifier.
vect_grid = {
    # 'vect__max_df': (0.5, 0.75, 1.0),  # currently left out of the search
    'vect__ngram_range': ((1, 3), (1, 5)),
    'vect__max_features': (1000, 10000),
}

# (classifier, classifier-specific hyper-parameter grid) pairs to compare.
candidates = [
    (SVC(), {'clf__C': (1, 0.75, 0.5)}),
    (MultinomialNB(), {}),
    (DecisionTreeClassifier(random_state=RANDOM_STATE), {}),
    (RandomForestClassifier(random_state=RANDOM_STATE), {}),
    (KNeighborsClassifier(), {}),
]

# One grid per classifier: the shared vectorizer grid, the classifier itself
# (wrapped in a 1-tuple so it is treated as a single choice) and its own grid.
parameters = [
    {**vect_grid, 'clf': (clf,), **clf_grid}
    for clf, clf_grid in candidates
]

grid_search = GridSearchCV(pipeline, parameters, verbose=10, scoring='accuracy', n_jobs=3)

In [None]:
# Run the whole grid search on the text column against the labels.
# `fit` returns the fitted search object itself, so `result` refers to the
# same object as `grid_search`.
result = grid_search.fit(dataset.text, dataset.label)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   21.5s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  5.0min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  6.8min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  8.6min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed: 10.8min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed: 12.2min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed: 13.3min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed: 14.2min
[Parallel(n_jobs=3)]: Done 107 tasks      | elapsed: 15.5min
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed: 17.5min
[Parallel(n_jobs=3)]: Done 140 out of 140 | elapsed: 18.7min finished


In [None]:
# Keep the complete, unsorted cross-validation table in `resultsDf`
# (it is what gets exported), but only display the best-ranked candidates
# instead of dumping every row.
resultsDf = pd.DataFrame(result.cv_results_)
resultsDf.sort_values('rank_test_score').head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__C,param_vect__max_features,param_vect__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,15.750276,1.859881,2.547583,0.350043,SVC(C=1),1.0,1000,"(1, 3)","{'clf': SVC(C=1), 'clf__C': 1, 'vect__max_feat...",0.870386,0.902379,0.917966,0.908942,0.845649,0.889064,0.026987,7
1,23.064486,2.173444,3.019394,0.553674,SVC(C=1),1.0,1000,"(1, 5)","{'clf': SVC(C=1), 'clf__C': 1, 'vect__max_feat...",0.870386,0.902379,0.917145,0.909762,0.841544,0.888243,0.028277,8
2,36.288483,3.158848,6.98483,2.044355,SVC(C=1),1.0,10000,"(1, 3)","{'clf': SVC(C=1), 'clf__C': 1, 'vect__max_feat...",0.887613,0.931091,0.936833,0.915505,0.868637,0.907936,0.026023,1
3,45.410075,9.272818,5.71904,1.478955,SVC(C=1),1.0,10000,"(1, 5)","{'clf': SVC(C=1), 'clf__C': 1, 'vect__max_feat...",0.888433,0.930271,0.937654,0.913864,0.867816,0.907608,0.026102,2
4,19.307399,1.3674,3.207955,1.460837,SVC(C=1),0.75,1000,"(1, 3)","{'clf': SVC(C=1), 'clf__C': 0.75, 'vect__max_f...",0.870386,0.896637,0.917145,0.907301,0.838259,0.885946,0.028495,9
5,30.81799,6.745042,3.792796,0.923326,SVC(C=1),0.75,1000,"(1, 5)","{'clf': SVC(C=1), 'clf__C': 0.75, 'vect__max_f...",0.868745,0.900738,0.916325,0.90566,0.834975,0.885289,0.029739,10
6,31.894899,2.615578,6.585361,1.65335,SVC(C=1),0.75,10000,"(1, 3)","{'clf': SVC(C=1), 'clf__C': 0.75, 'vect__max_f...",0.887613,0.926169,0.935193,0.910582,0.867816,0.905475,0.024798,3
7,38.650518,3.990777,6.222081,0.905557,SVC(C=1),0.75,10000,"(1, 5)","{'clf': SVC(C=1), 'clf__C': 0.75, 'vect__max_f...",0.887613,0.922067,0.935193,0.909762,0.865353,0.903998,0.024879,4
8,21.618542,2.223022,3.675477,0.464439,SVC(C=1),0.5,1000,"(1, 3)","{'clf': SVC(C=1), 'clf__C': 0.5, 'vect__max_fe...",0.873667,0.897457,0.913043,0.894996,0.836617,0.883156,0.026434,11
9,23.514422,3.64467,3.124265,0.964188,SVC(C=1),0.5,1000,"(1, 5)","{'clf': SVC(C=1), 'clf__C': 0.5, 'vect__max_fe...",0.873667,0.899098,0.912223,0.895816,0.833333,0.882827,0.027681,12


In [None]:
from datetime import datetime

# Timestamp the export so successive runs do not overwrite each other.
# ':' is replaced because it is not a valid filename character on Windows
# (and trips up several sync tools); second precision keeps the name short.
timestamp = datetime.now().isoformat(timespec='seconds').replace(':', '-')
filename = f'gridsearch-results-{timestamp}.csv'
resultsDf.to_csv(filename, index=False)

In [None]:
# Display the pipeline that was refit with the best-scoring parameter combination.
result.best_estimator_

Pipeline(steps=[('vect',
                 TfidfVectorizer(max_features=10000, ngram_range=(1, 3),
                                 preprocessor=<function preprocess at 0x7f6ffcde9f80>,
                                 stop_words=['de', 'a', 'o', 'que', 'e', 'é',
                                             'do', 'da', 'em', 'um', 'para',
                                             'com', 'não', 'uma', 'os', 'no',
                                             'se', 'na', 'por', 'mais', 'as',
                                             'dos', 'como', 'mas', 'ao', 'ele',
                                             'das', 'à', 'seu', 'sua', ...])),
                ('clf', SVC(C=1))])

In [None]:
# List vocabulary entries shorter than three characters that made it into the
# fitted TF-IDF vectorizer.
vectorizer = result.best_estimator_.named_steps['vect']
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older versions.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
[x for x in get_names() if len(x) < 3]

['ac',
 'ah',
 'ai',
 'al',
 'am',
 'ap',
 'ar',
 'aí',
 'ba',
 'bh',
 'bi',
 'br',
 'ce',
 'cá',
 'df',
 'di',
 'dj',
 'dr',
 'dá',
 'ei',
 'el',
 'es',
 'ex',
 'fi',
 'fm',
 'fã',
 'fé',
 'go',
 'ho',
 'hs',
 'ia',
 'ii',
 'il',
 'in',
 'ir',
 'jn',
 'jr',
 'jô',
 'kg',
 'km',
 'la',
 'le',
 'lo',
 'lá',
 'ma',
 'mc',
 'mg',
 'mi',
 'mp',
 'mr',
 'ms',
 'mt',
 'má',
 'nu',
 'né',
 'of',
 'oi',
 'on',
 'pa',
 'pb',
 'pe',
 'pf',
 'pi',
 'pl',
 'pm',
 'pp',
 'pr',
 'pt',
 'pv',
 'pé',
 'pó',
 'ri',
 'rj',
 'rn',
 'ro',
 'rr',
 'rs',
 'ré',
 'sc',
 'sd',
 'si',
 'sp',
 'sr',
 'tj',
 'to',
 'tv',
 'tá',
 'tô',
 'un',
 'up',
 'us',
 'vi',
 'vá',
 'vê',
 'xi',
 'zé']