In [5]:
%load_ext autoreload
%autoreload ,2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from IPython.display import display
import plotly_express as px
import plotly.io as pio
pio.renderers.default = 'colab'

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
from preprocessing import stopwords, preprocess
from datasets import fake_br_corpus

In [8]:
dataset = fake_br_corpus.loadTrain(True)

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import log_loss

pipeline = Pipeline([
    (
      'vect',
      TfidfVectorizer(
        stop_words=stopwords,
        preprocessor=preprocess,
        strip_accents=True,
      ),
    ), 
    ('clf', XGBClassifier()),
])

vectParameters = {
  'vect__ngram_range': ((1,3), (1,5), (1,7)),
  'vect__max_features': (1000, 5000, 10000, 20000),
  # 'vect__max_df': (0.5, 0.75, 1.0),
}

parameters = [        
    {
        **vectParameters,
        'clf': (XGBClassifier(),),
    },
    {
        **vectParameters,
        'clf': (SVC(probability=True),),
        'clf__C': (1, 0.75)
    },
    {
        **vectParameters,
        'clf': (RandomForestClassifier(),),
    },
]
grid_search = GridSearchCV(pipeline, parameters, verbose=10, scoring='accuracy', n_jobs=3)

In [12]:
result = grid_search.fit(dataset.text, dataset.label)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [13]:
resultsDf = pd.DataFrame(result.cv_results_)
resultsDf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_vect__max_features,param_vect__ngram_range,param_clf__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,184.077742,11.026473,2.468164,1.494893,"XGBClassifier(base_score=None, booster=None, c...",1000,"(1, 3)",,"{'clf': XGBClassifier(base_score=None, booster...",0.845775,0.894996,0.908121,0.889253,0.836617,0.874953,0.028381,34
1,181.395806,25.52542,3.734855,3.228048,"XGBClassifier(base_score=None, booster=None, c...",1000,"(1, 5)",,"{'clf': XGBClassifier(base_score=None, booster...",0.844955,0.893355,0.910582,0.871206,0.821839,0.868388,0.032024,36
2,201.443858,10.317236,3.795097,2.01713,"XGBClassifier(base_score=None, booster=None, c...",1000,"(1, 7)",,"{'clf': XGBClassifier(base_score=None, booster...",0.853979,0.891715,0.902379,0.886792,0.823481,0.871669,0.029013,35
3,176.109461,12.137663,3.102641,1.932204,"XGBClassifier(base_score=None, booster=None, c...",5000,"(1, 3)",,"{'clf': XGBClassifier(base_score=None, booster...",0.849877,0.902379,0.910582,0.88105,0.841544,0.877086,0.027499,33
4,197.473554,20.229678,3.481816,0.89806,"XGBClassifier(base_score=None, booster=None, c...",5000,"(1, 5)",,"{'clf': XGBClassifier(base_score=None, booster...",0.855619,0.893355,0.915505,0.88187,0.856322,0.880534,0.022786,25
5,189.237339,13.512022,2.485047,1.423756,"XGBClassifier(base_score=None, booster=None, c...",5000,"(1, 7)",,"{'clf': XGBClassifier(base_score=None, booster...",0.858901,0.899098,0.912223,0.878589,0.847291,0.87922,0.024154,29
6,161.745294,7.166898,1.452008,0.595786,"XGBClassifier(base_score=None, booster=None, c...",10000,"(1, 3)",,"{'clf': XGBClassifier(base_score=None, booster...",0.853158,0.894996,0.908942,0.88105,0.848112,0.877252,0.023508,32
7,166.074143,8.720212,1.686515,0.667719,"XGBClassifier(base_score=None, booster=None, c...",10000,"(1, 5)",,"{'clf': XGBClassifier(base_score=None, booster...",0.85726,0.894176,0.909762,0.888433,0.844828,0.878892,0.024108,30
8,174.487346,4.504672,2.612982,1.487522,"XGBClassifier(base_score=None, booster=None, c...",10000,"(1, 7)",,"{'clf': XGBClassifier(base_score=None, booster...",0.860541,0.890074,0.918786,0.886792,0.841544,0.879547,0.02649,28
9,160.616892,9.685654,1.605494,0.868927,"XGBClassifier(base_score=None, booster=None, c...",20000,"(1, 3)",,"{'clf': XGBClassifier(base_score=None, booster...",0.848236,0.899098,0.915505,0.882691,0.847291,0.878564,0.027207,31


In [14]:
from datetime import datetime
filename = f'gridsearch-results-{datetime.isoformat(datetime.now())}.csv'
resultsDf.to_csv(filename, index=False)

In [15]:
result.best_estimator_

Pipeline(steps=[('vect',
                 TfidfVectorizer(max_features=20000, ngram_range=(1, 3),
                                 preprocessor=<function preprocess at 0x7f3b6f6234d0>,
                                 stop_words=['de', 'a', 'o', 'que', 'e', 'é',
                                             'do', 'da', 'em', 'um', 'para',
                                             'com', 'não', 'uma', 'os', 'no',
                                             'se', 'na', 'por', 'mais', 'as',
                                             'dos', 'como', 'mas', 'ao', 'ele',
                                             'das', 'à', 'seu', 'sua', ...],
                                 strip_accents=True)),
                ('clf', SVC(C=1, probability=True))])

In [21]:
len(result.best_estimator_.steps[0][1].get_feature_names())

20000

In [None]:
xgb = XGBClassifier()