In [1]:
# PIVIC - "Um modelo computacional para identificação de notícias falsas sobre a Covid-19 no Brasil"
# Code: Machine Learning - Supervised Learning
# Author: Anísio Pereira Batista Filho

In [2]:
##Essentials
import os
import csv
import numpy as np ##Numpy
import pandas as pd ##Pandas

##Sci-kit Learn
###Machine learning algorithms
from xgboost import XGBClassifier, XGBRegressor
from imblearn.pipeline import Pipeline as imblearnPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

##Model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

###Pipeline, vectorizers and preprocessing
from sklearn.pipeline import Pipeline as sklearnPipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

###Metrics
from sklearn.metrics import classification_report, f1_score

##Utils
import re
import unicodedata
from tqdm.auto import tqdm
import time
import timeit

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [4]:
# Register tqdm with pandas so the progress_* variants of apply/map are available.
tqdm.pandas()

## Configuração

### Classificador

In [5]:
# Classifier to tune. Values handled by the selection cell further down:
# 'decisiontree', 'multinomial-naive-bayes', 'svc', 'randomforest',
# 'adaboost', 'xgboost'.
classificador = 'randomforest'

### Oversampling

In [6]:
# When True, a SMOTE step is placed in the pipeline; otherwise that step is
# a 'passthrough'.
oversampling = True

### Undersampling

In [7]:
# When True, a RandomUnderSampler step is placed in the pipeline; otherwise
# that step is a 'passthrough'.
undersampling = True

## Abertura de arquivo e criação do dataframe

In [8]:
# Load the labeled corpus with the generated features. The path is relative,
# so the notebook must be run from the directory that contains `data/`.
# low_memory=False asks pandas to parse the file in a single pass so column
# dtypes are inferred consistently.
df = pd.read_csv('data/corpus_labeled/iguais/bases_tcc/03_geracao_carcteristicas_base.csv', sep=",", low_memory=False)

In [9]:
# List the columns available in the loaded corpus.
df.columns

Index(['tweet_id', 'created_at', 'user_location', 'tweet_text', 'label_A',
       'label_B', 'pct_certainty_A', 'pct_certainty_B', 'tweet_text_preproc',
       'tweet_text_stemming', 'tweet_text_lemmatization', 'time_shift',
       'user_screen_name', 'user_id', 'location_treated', 'state_location',
       'region_location', 'tweet_text_stemming_char_len',
       'tweet_text_stemming_word_len', 'tweet_text_lemmatization_char_len',
       'tweet_text_lemmatization_word_len', 'tweet_text_stemming_noun',
       'tweet_text_stemming_adj', 'tweet_text_stemming_verb',
       'tweet_text_stemming_adv', 'tweet_text_lemmatization_noun',
       'tweet_text_lemmatization_adj', 'tweet_text_lemmatization_verb',
       'tweet_text_lemmatization_adv'],
      dtype='object')

In [10]:
# Feature matrix: the two preprocessed text variants, the categorical
# time/location attributes, and the numeric length and noun/adj/verb/adv
# columns computed for each text variant.
# (The former `X = pd.DataFrame()` was a dead store: it was overwritten by the
# selection below before ever being read.)
X = df[[
    # Text (vectorized with TF-IDF later on)
    'tweet_text_stemming', 'tweet_text_lemmatization',

    # Categorical
    'time_shift', 'location_treated', 'state_location', 'region_location',

    # Numeric, stemming variant
    'tweet_text_stemming_char_len', 'tweet_text_stemming_word_len',
    'tweet_text_stemming_noun', 'tweet_text_stemming_adj',
    'tweet_text_stemming_verb', 'tweet_text_stemming_adv',

    # Numeric, lemmatization variant
    'tweet_text_lemmatization_char_len', 'tweet_text_lemmatization_word_len',
    'tweet_text_lemmatization_noun', 'tweet_text_lemmatization_adj',
    'tweet_text_lemmatization_verb', 'tweet_text_lemmatization_adv',
]]

In [11]:
# Display the selected feature frame as a visual sanity check.
X

Unnamed: 0,tweet_text_stemming,tweet_text_lemmatization,time_shift,location_treated,state_location,region_location,tweet_text_stemming_char_len,tweet_text_stemming_word_len,tweet_text_stemming_noun,tweet_text_stemming_adj,tweet_text_stemming_verb,tweet_text_stemming_adv,tweet_text_lemmatization_char_len,tweet_text_lemmatization_word_len,tweet_text_lemmatization_noun,tweet_text_lemmatization_adj,tweet_text_lemmatization_verb,tweet_text_lemmatization_adv
0,"['parc', 'chines', 'vacin', 'prev', 'troc', 'c...","['parceria', 'chinês', 'vacina', 'preve', 'tro...",noite,sao paulo brasil,São Paulo,Sudeste,164,18,6,4,3,0,192,18,10,2,4,0
1,"['por qu', 'deix', 'minion', 'nerv', 'jur', 'i...","['por que', 'deixar', 'minion', 'nervoso', 'ju...",noite,brasil,statenotdefined,regionnotdefined,139,16,6,3,1,0,165,16,7,3,3,0
2,"['espirito sant', 'peligr', 'la', 'vacun', 'va...","['espirito santo', 'peligrosa', 'lo', 'vacuna'...",noite,barcelona,notbrazilstate,notbrazilregion,51,6,1,1,0,0,57,6,2,0,0,0
3,"['enta', 'perd', 'aqu', 'grup', 'placeb', 'peg...","['entao', 'perder', 'aqui', 'grupo', 'placebo'...",noite,sao paulo,São Paulo,Sudeste,138,17,9,1,2,1,156,17,9,0,5,2
4,"['vid', 'import', 'sim', 'vid', 'import', 'rec...","['vida', 'importar', 'sim', 'vida', 'importar'...",noite,invalidlocation,invalidstate,invalidregion,136,16,8,2,1,1,158,16,9,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,"['leu', 'manchet', 'vacin', 'inform', 'sobr', ...","['ler', 'manchete', 'vacina', 'informar', 'sob...",noite,timbaktu,notbrazilstate,notbrazilregion,65,7,3,1,1,0,73,7,3,0,3,0
3596,"['ver', 'esquerd', 'grud', 'bol', 'dor', 'vaci...","['ver', 'esquerdinha', 'grudar', 'bola', 'dori...",noite,invalidlocation,invalidstate,invalidregion,57,7,4,2,1,0,69,7,3,1,3,0
3597,"['nad', 'ver', 'nad', 'diss', 'sequ', 'vot', '...","['nada', 'ver', 'nada', 'de isso', 'sequer', '...",manhã,sao paulo,São Paulo,Sudeste,164,20,7,2,1,0,197,22,9,0,7,0
3598,"['sim', 'ant', 'sinaliz', 'apen', 'milho', 'va...","['sim', 'antes', 'sinalizar', 'apenas', 'milho...",noite,invalidlocation,invalidstate,invalidregion,154,18,14,2,0,1,180,18,7,1,5,3


In [12]:
# Target frame with a single `label` column: annotator A's label shifted by +1.
# NOTE(review): presumably the raw labels are -1/0/1 and the shift maps them to
# 0/1/2 for the classifiers -- confirm against the labeling step.
y = (df.loc[:, 'label_A'] + 1).to_frame(name='label')

In [13]:
# Display the target frame as a visual sanity check.
y

Unnamed: 0,label
0,0
1,1
2,1
3,0
4,1
...,...
3595,1
3596,1
3597,2
3598,1


### Separando dados de treinamento e de teste

In [14]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible.
# NOTE(review): the split is not stratified; with imbalanced classes consider
# passing `stratify=` -- left unchanged here to keep results comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y['label'],
    test_size=0.30,
    random_state=1,
)

### Construindo o pré-processador

In [15]:
# Column groups, listed in the order their transformed blocks are stacked in
# the output matrix.
colunas_texto = ['tweet_text_stemming', 'tweet_text_lemmatization']
colunas_categoricas = ['time_shift', 'location_treated', 'state_location', 'region_location']
colunas_numericas = [
    'tweet_text_stemming_char_len', 'tweet_text_stemming_word_len',
    'tweet_text_stemming_noun', 'tweet_text_stemming_adj',
    'tweet_text_stemming_verb', 'tweet_text_stemming_adv',
    'tweet_text_lemmatization_char_len', 'tweet_text_lemmatization_word_len',
    'tweet_text_lemmatization_noun', 'tweet_text_lemmatization_adj',
    'tweet_text_lemmatization_verb', 'tweet_text_lemmatization_adv',
]

# One (name, transformer, columns) entry per column; every column gets its own
# freshly constructed transformer instance.
transformadores = []

# Text columns are selected by bare name (not a list) so that each vectorizer
# receives a 1-D sequence of documents.
transformadores += [
    ('tf-idf-' + coluna.replace('tweet_text_', ''), TfidfVectorizer(), coluna)
    for coluna in colunas_texto
]

# Categories unseen during fit are encoded as all-zeros instead of raising.
transformadores += [
    ('one-hot-' + coluna.replace('_', '-'), OneHotEncoder(handle_unknown='ignore'), [coluna])
    for coluna in colunas_categoricas
]

# Numeric features are rescaled to the [0, 1] range.
transformadores += [
    ('min-max-' + coluna.replace('_', '-'), MinMaxScaler(feature_range=(0, 1)), [coluna])
    for coluna in colunas_numericas
]

# `remainder` is left at its default, so columns not listed above are not
# forwarded to the model.
preprocessor = ColumnTransformer(transformers=transformadores)

In [16]:
# Pipeline step for oversampling: SMOTE (seeded for reproducibility) when the
# `oversampling` flag is set, otherwise a no-op 'passthrough' step.
# The flag is tested by truthiness rather than compared to True with `==`.
oversampling_selecionado = (
    'oversampler',
    SMOTE(random_state=1) if oversampling else 'passthrough',
)

In [17]:
# Pipeline step for undersampling: RandomUnderSampler (seeded for
# reproducibility) when the `undersampling` flag is set, otherwise a no-op
# 'passthrough' step.
# The flag is tested by truthiness rather than compared to True with `==`.
undersampling_selecionado = (
    'undersampler',
    RandomUnderSampler(random_state=1) if undersampling else 'passthrough',
)

### Seleciona classificador para o pipeline

In [18]:
# Factories for every supported value of `classificador`. Only the selected
# factory is called, so the other estimators are never instantiated.
fabricas_classificadores = {
    'decisiontree': DecisionTreeClassifier,
    'multinomial-naive-bayes': MultinomialNB,
    'svc': SVC,
    'randomforest': RandomForestClassifier,
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while the
    # first positional slot is the base learner in both.
    'adaboost': lambda: AdaBoostClassifier(DecisionTreeClassifier()),
    'xgboost': XGBClassifier,
}

# Fail right here on a typo in the configuration instead of leaving
# `classificador_selecionado` undefined and hitting a NameError later.
if classificador not in fabricas_classificadores:
    raise ValueError(
        f"Unknown classificador {classificador!r}; "
        f"expected one of {sorted(fabricas_classificadores)}"
    )

# (step name, estimator) pair; the step name is the prefix used by the
# hyperparameter grid keys ('<step name>__<parameter>').
classificador_selecionado = (classificador, fabricas_classificadores[classificador]())

### Criando o modelo usando pipeline

In [19]:
# Full modelling pipeline: feature preprocessing, the two (optional)
# resampling steps, then the classifier. The imbalanced-learn Pipeline is used
# because it accepts sampler steps.
# NOTE(review): with both samplers at their default strategies, SMOTE has
# presumably already balanced the classes before the undersampler runs, which
# would make the undersampler a no-op -- confirm this is intended.
etapas_pipeline = [
    ('preprocessor', preprocessor),
    oversampling_selecionado,
    undersampling_selecionado,
    classificador_selecionado,
]
model = imblearnPipeline(steps=etapas_pipeline)

In [20]:
# Display the assembled pipeline to check its steps.
model

### Parâmetros

In [21]:
# Hyperparameter grid for the classifier chosen in `classificador`.
# Keys follow the Pipeline convention '<step name>__<parameter>', where the
# step name is the name given to the classifier step. Commented-out entries
# are not searched, so the estimator keeps its constructor value for them.
if classificador == 'decisiontree':
    parameters = {
        'decisiontree__criterion': ['gini', 'entropy'],
        'decisiontree__splitter': ['best', 'random'],
        'decisiontree__max_depth': [9, 10, None],
        'decisiontree__min_samples_split': [2, 3, 4, 5],
        'decisiontree__min_samples_leaf': [1, 2, 3],
        #'decisiontree__min_weight_fraction_leaf': [0.0],
        #'decisiontree__max_features': [None],
        'decisiontree__random_state': [1, None],
        #'decisiontree__max_leaf_nodes': [None],
        #'decisiontree__min_impurity_decrease': [0.0],
        #'decisiontree__class_weight': [None],
        #'decisiontree__ccp_alpha': [0.0]
    }
elif classificador == 'multinomial-naive-bayes':
    parameters = {
        'multinomial-naive-bayes__alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
        #'multinomial-naive-bayes__fit_prior': [True, False],
        'multinomial-naive-bayes__class_prior': [None]
    }
elif classificador == 'svc':
    parameters = {
        'svc__C': [1, 10, 100, 1000],
        'svc__kernel': ['rbf'],
        #'svc__degree': [3],
        'svc__gamma': [1, 0.1],
        #'svc__coef0': [0.0],
        #'svc__shrinking': [True],
        #'svc__probability': [False],
        #'svc__tol': [1e-3],
        #'svc__cache_size': [200],
        #'svc__class_weight': [None],
        #'svc__verbose': [False],
        #'svc__max_iter': [-1],
        #'svc__decision_function_shape': ['ovr', 'ovo'],
        #'svc__break_ties': [False],
        'svc__random_state': [1, None]
    }
elif classificador == 'randomforest':
    # 6 * 2 * 11 * 2 * 2 * 6 * 6 = 19008 candidate combinations.
    parameters = {
        'randomforest__n_estimators': [100, 200, 400, 600, 800, 1000],
        'randomforest__criterion': ['gini', 'entropy'],
        'randomforest__max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, None],
        'randomforest__bootstrap': [True, False],
        'randomforest__max_features': ['sqrt', 'log2'],
        'randomforest__min_samples_leaf': [1, 2, 4, 6, 8, 10],
        'randomforest__min_samples_split': [2, 4, 6, 8, 10, 100],
        'randomforest__random_state': [1],
    }
elif classificador == 'adaboost':
    # NOTE(review): on scikit-learn releases where the base learner parameter
    # is named `estimator`, the nested keys below would need the prefix
    # 'adaboost__estimator__' instead -- confirm before enabling them.
    parameters = {
        #'adaboost__base_estimator__criterion': ['gini', 'entropy'],
        #'adaboost__base_estimator__splitter': ['best', 'random'],
        #'adaboost__base_estimator__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None],
        #'adaboost__base_estimator__min_samples_split': [2, 3, 4, 5],
        #'adaboost__base_estimator__min_samples_leaf': [1, 2, 3],
        #'adaboost__base_estimator__random_state': [1, None],
        'adaboost__n_estimators': [10, 50, 100, 500],
        'adaboost__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
        #'adaboost__algorithm': ['SAMME', 'SAMME.R'],
        #'adaboost__random_state': [1, None]
    }
elif classificador == 'xgboost':
    parameters = {
        'xgboost__nthread': [4],  # with hyper-threading, xgboost may become slower
        'xgboost__learning_rate': [0.01],  # the so-called `eta` value
        'xgboost__max_depth': [7],
        #'xgboost__min_child_weight': [11],
        'xgboost__subsample': [0.8],
        'xgboost__colsample_bytree': [0.8],
        'xgboost__n_estimators': [1000],  # number of trees
        #'xgboost__missing': [-999],
        #'xgboost__seed': [1337],
        #'xgboost__booster': ['gbdt'],
        #'xgboost__metric': ['multiclass'],
        'xgboost__eval_metric': ['mlogloss'],
        #'xgboost__silent': [False],
        #'xgboost__scale_pos_weight': [1],
        'xgboost__objective': ['multi:softmax'],
        'xgboost__reg_alpha': [0.3],
        'xgboost__gamma': [0, 1],
        'xgboost__use_label_encoder': [False],
        'xgboost__num_class': [3]
    }
else:
    # Without this branch an unsupported value left `parameters` undefined and
    # the failure only surfaced later as a NameError.
    raise ValueError(
        f"No hyperparameter grid defined for classificador={classificador!r}"
    )

### K-Fold

In [22]:
# 10-fold splitter used by the grid search. With these arguments KFold makes
# consecutive, unshuffled, non-stratified folds.
# NOTE(review): given the class imbalance, StratifiedKFold may be a better
# fit -- confirm before changing, since it alters the reported scores.
kfold = KFold(n_splits=10)

### Grid Search Cross Validation

In [23]:
# Exhaustive search over `parameters`, evaluated with the 10-fold splitter and
# run on all available cores.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score (presumably accuracy), while the notebook reports
# F1 afterwards -- confirm this is the intended selection metric.
grid = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

### Treinamento

In [None]:
# Run the search on the training split. Every candidate is fitted once per
# fold, so this is by far the slowest cell of the notebook.
grid.fit(X_train, y_train)

Fitting 10 folds for each of 19008 candidates, totalling 190080 fits


### Predição

In [None]:
# Predict the held-out split with the refitted best pipeline (the search
# object delegates `predict` to its best estimator).
y_pred = grid.predict(X_test)

### F1 Score

In [None]:
# Per-class F1 on the held-out split (average=None returns one score per label).
f1_score(y_true=y_test, y_pred=y_pred, average=None)

### Classification report

In [None]:
# Per-class precision/recall/F1 on the held-out split.
# classification_report expects (y_true, y_pred); the arguments used to be
# passed in the opposite order, which swaps precision with recall and reports
# support counts taken from the predictions instead of the true labels.
print(classification_report(y_test, y_pred, target_names=['news', 'opinion', 'fake']))

### Melhores parâmetros

In [None]:
# Hyperparameter combination that obtained the best mean cross-validation score.
grid.best_params_

### Gerar dataframe de resultados

In [None]:
#result_df = pd.DataFrame(grid.cv_results_)