In [1]:
# PIVIC - "Um modelo computacional para identificação de notícias falsas sobre a Covid-19 no Brasil"
# Code: Machine Learning - Supervised Learning
# Author: Anísio Pereira Batista Filho

In [2]:
##Essentials
import os
import csv
import numpy as np
import pandas as pd
import gensim
import time

####Machine learning algorithms
from xgboost import XGBClassifier
#from sklearn.ensemble import RandomForestClassifier

###Balancing techniques
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

##Model selection
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV

###Pipeline
from sklearn.pipeline import Pipeline as sklearnPipeline
from imblearn.pipeline import Pipeline as imblearnPipeline
from sklearn.compose import ColumnTransformer

### Preprocessing
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.feature_extraction.text import  TfidfVectorizer

###Metrics
from sklearn.metrics import classification_report, f1_score

In [3]:
pd.set_option("display.max_columns", None)

## Configuração

### Classficador

In [4]:
classificador = 'xgboost'

### Oversampling

In [5]:
oversampling = False

### Undersampling

In [6]:
undersampling = True

## Abertura de arquivo e criação do dataframe

In [7]:
df = pd.read_csv('data/corpus_labeled/iguais/bases_tcc/03_geracao_carcteristicas_base.csv', sep=",", low_memory=False)

In [8]:
#resultados_base = pd.read_csv('models/05_model_creation.csv', low_memory=True)

In [9]:
y = pd.DataFrame()
y['label'] = df.loc[:,'label_A']
y.label += 1

In [10]:
y.shape

(3600, 1)

In [11]:
X = pd.read_csv("data/corpus_labeled/iguais/bases_tcc/05_word2vec_model_creation_base.csv", sep=",", low_memory=False)

### Separando dados de treinamento e de teste

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y.label, test_size = 0.30, random_state = 1)

### Construindo o pré-processador

In [13]:
preprocessor = ColumnTransformer(transformers=[
#    ('tf-idf-stemming', TfidfVectorizer(), 'tweet_text_stemming'),
#    ('tf-idf-lemmatization', TfidfVectorizer(), 'tweet_text_lemmatization'),
#    ('one-hot-time-shift', OneHotEncoder(handle_unknown = 'ignore'), ['time_shift']),
#    ('one-hot-location-treated', OneHotEncoder(handle_unknown = 'ignore'), ['location_treated']),
#    ('one-hot-state-location', OneHotEncoder(handle_unknown = 'ignore'), ['state_location']),
#    ('one-hot-region-location', OneHotEncoder(handle_unknown = 'ignore'), ['region_location']),
#    ('min-max-tweet-text-stemming-char-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_char_len']),
#    ('min-max-tweet-text-stemming-word-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_word_len']),
#    ('min-max-tweet-text-stemming-noun', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_noun']),
#    ('min-max-tweet-text-stemming-adj', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_adj']),
#    ('min-max-tweet-text-stemming-verb', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_verb']),
#    ('min-max-tweet-text-stemming-adv', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_adv']),
#    ('min-max-tweet-text-lemmatization-char-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_char_len']),
#    ('min-max-tweet-text-lemmatization-word-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_word_len']),
#    ('min-max-tweet-text-lemmatization-noun', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_noun']),
#    ('min-max-tweet-text-lemmatization-adj', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_adj']),
#    ('min-max-tweet-text-lemmatization-verb', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_verb']),
#    ('min-max-tweet-text-lemmatization-adv', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_adv']),
    ],
    remainder='passthrough'
    )

In [14]:
if oversampling == True:
    oversampling_selecionado = ('oversampler', SMOTE(sampling_strategy='not majority', random_state=1))
else:
    oversampling_selecionado = ('oversampler', 'passthrough')

In [15]:
if undersampling == True:
    undersampling_selecionado = ('undersampler', RandomUnderSampler(sampling_strategy='majority', random_state=1))
else:
    undersampling_selecionado = ('undersampler', 'passthrough')

 ### Seleciona classificador para o pipeline

In [16]:
if classificador == 'randomforest':
    classificador_selecionado = ('randomforest', RandomForestClassifier())
elif classificador == 'xgboost':
    classificador_selecionado = ('xgboost', XGBClassifier())

### Criando o modelo usando pipeline

In [17]:
model = imblearnPipeline(steps=[
    ('preprocessor', preprocessor),
    oversampling_selecionado,
    undersampling_selecionado,
    
    classificador_selecionado,
])

### Parâmetros

In [18]:
if classificador == 'randomforest':
    parameters_random = { 
    'randomforest__min_samples_leaf': list(range(1, 60, 2)),
    'randomforest__max_features': list(np.arange(0.1, 1, 0.2)),
    'randomforest__max_depth': list(range(6, 20, 2)),
    'randomforest__n_estimators': list(range(10, 200, 10)),
    'randomforest__random_state': [1],
              }

    
elif classificador == 'xgboost':
    parameters_random = {
    'xgboost__booster': ['gbtree', 'dart'],
    'xgboost__tree_method': ['exact', 'hist', 'approx'],
    'xgboost__max_depth': list(range(3, 20, 2)),
    'xgboost__learning_rate': list(np.arange(0.01, 0.5, 0.05)),
    'xgboost__gamma': list(np.arange(0.0, 1.0, 0.2)),
    'xgboost__reg_alpha': list(np.arange(0.0, 1.0, 0.2)),
    'xgboost__reg_lambda': list(np.arange(0.0, 1.0, 0.2)),
    'xgboost__min_child_weight': list(np.arange(0.0, 5.0, 0.5)),
    'xgboost__subsample': list(np.arange(0.5, 1.0, 0.1)),
    'xgboost__colsample_bytree': list(np.arange(0.5, 1.0, 0.1)),
    'xgboost__colsample_bylevel': list(np.arange(0.5, 1.0, 0.1)),
    'xgboost__random_state': [1],
            }

### Randomized Search Cross Validation

In [19]:
random = RandomizedSearchCV(estimator=model, param_distributions=parameters_random, n_iter=50, cv=3, scoring='f1_weighted', n_jobs=1, verbose=3, random_state=1)

### Treinamento

In [20]:
random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV 1/3] END xgboost__booster=dart, xgboost__colsample_bylevel=0.5, xgboost__colsample_bytree=0.7, xgboost__gamma=0.4, xgboost__learning_rate=0.01, xgboost__max_depth=17, xgboost__min_child_weight=2.0, xgboost__random_state=1, xgboost__reg_alpha=0.2, xgboost__reg_lambda=0.6000000000000001, xgboost__subsample=0.7999999999999999, xgboost__tree_method=approx;, score=0.563 total time=  50.0s
[CV 2/3] END xgboost__booster=dart, xgboost__colsample_bylevel=0.5, xgboost__colsample_bytree=0.7, xgboost__gamma=0.4, xgboost__learning_rate=0.01, xgboost__max_depth=17, xgboost__min_child_weight=2.0, xgboost__random_state=1, xgboost__reg_alpha=0.2, xgboost__reg_lambda=0.6000000000000001, xgboost__subsample=0.7999999999999999, xgboost__tree_method=approx;, score=0.577 total time=  50.4s
[CV 3/3] END xgboost__booster=dart, xgboost__colsample_bylevel=0.5, xgboost__colsample_bytree=0.7, xgboost__gamma=0.4, xgboost__learning_rate=0.01, xgboost_

### Predição

In [21]:
y_pred_random = random.best_estimator_.predict(X_test)

### F1 score

In [22]:
f_score_random = f1_score(y_test, y_pred_random, average=None)
f_score_random

array([0.45192308, 0.69449082, 0.24852071])

### Classification report

In [23]:
class_report_random = classification_report(y_pred_random, y_test, target_names=['news', 'opinion', 'fake'])
print(class_report_random)

              precision    recall  f1-score   support

        news       0.61      0.36      0.45       394
     opinion       0.58      0.86      0.69       484
        fake       0.31      0.21      0.25       202

    accuracy                           0.55      1080
   macro avg       0.50      0.48      0.46      1080
weighted avg       0.54      0.55      0.52      1080



### Melhor parâmetro

In [24]:
random.best_params_

{'xgboost__tree_method': 'hist',
 'xgboost__subsample': 0.7999999999999999,
 'xgboost__reg_lambda': 0.6000000000000001,
 'xgboost__reg_alpha': 0.8,
 'xgboost__random_state': 1,
 'xgboost__min_child_weight': 4.5,
 'xgboost__max_depth': 3,
 'xgboost__learning_rate': 0.26,
 'xgboost__gamma': 0.8,
 'xgboost__colsample_bytree': 0.8999999999999999,
 'xgboost__colsample_bylevel': 0.8999999999999999,
 'xgboost__booster': 'dart'}

In [31]:
if classificador == 'xgboost':
    parameters_grid = {'xgboost__tree_method': ['hist'],
 'xgboost__subsample': list(np.arange(0.7, 0.9, 0.1)),
 'xgboost__reg_lambda': list(np.arange(0.5, 0.7, 0.1)),
 'xgboost__reg_alpha': list(np.arange(0.7, 0.9, 0.1)),
 'xgboost__random_state': [1],
 'xgboost__min_child_weight': list(np.arange(4.0, 5.0, 0.5)),
 'xgboost__max_depth': list(range(2, 4, 1)),
 'xgboost__learning_rate': list(np.arange(0.25, 0.27, 0.01)),
 'xgboost__gamma': list(np.arange(0.7, 0.8, 0.1)),
 'xgboost__colsample_bytree': list(np.arange(0.8, 1.0, 0.1)),
 'xgboost__colsample_bylevel': list(np.arange(0.8, 1.0, 0.1)),
 'xgboost__booster': ['dart']
 }


### Grid Search Cross Validation

In [32]:
grid = GridSearchCV(estimator=model, param_grid=parameters_grid, cv=10, scoring='f1_weighted', n_jobs=1, verbose=3)

### Treinamento

In [33]:
grid.fit(X_train, y_train)

Fitting 10 folds for each of 1728 candidates, totalling 17280 fits
[CV 1/10] END xgboost__booster=dart, xgboost__colsample_bylevel=0.8, xgboost__colsample_bytree=0.8, xgboost__gamma=0.7, xgboost__learning_rate=0.25, xgboost__max_depth=2, xgboost__min_child_weight=4.0, xgboost__random_state=1, xgboost__reg_alpha=0.7, xgboost__reg_lambda=0.5, xgboost__subsample=0.7, xgboost__tree_method=hist;, score=0.548 total time=  17.2s
[CV 2/10] END xgboost__booster=dart, xgboost__colsample_bylevel=0.8, xgboost__colsample_bytree=0.8, xgboost__gamma=0.7, xgboost__learning_rate=0.25, xgboost__max_depth=2, xgboost__min_child_weight=4.0, xgboost__random_state=1, xgboost__reg_alpha=0.7, xgboost__reg_lambda=0.5, xgboost__subsample=0.7, xgboost__tree_method=hist;, score=0.594 total time=  19.2s
[CV 3/10] END xgboost__booster=dart, xgboost__colsample_bylevel=0.8, xgboost__colsample_bytree=0.8, xgboost__gamma=0.7, xgboost__learning_rate=0.25, xgboost__max_depth=2, xgboost__min_child_weight=4.0, xgboost__rand

### Predição

In [34]:
y_pred_grid = grid.best_estimator_.predict(X_test)

### F1 score

In [35]:
f_score_grid = f1_score(y_test, y_pred_grid, average=None)
f_score_grid

array([0.47384615, 0.7120332 , 0.28196721])

### Classification report

In [36]:
class_report_grid = classification_report(y_pred_grid, y_test, target_names=['news', 'opinion', 'fake'])
print(class_report_grid)

              precision    recall  f1-score   support

        news       0.67      0.37      0.47       420
     opinion       0.60      0.87      0.71       491
        fake       0.32      0.25      0.28       169

    accuracy                           0.58      1080
   macro avg       0.53      0.50      0.49      1080
weighted avg       0.58      0.58      0.55      1080



### Melhor parâmetro

In [37]:
grid.best_params_

{'xgboost__booster': 'dart',
 'xgboost__colsample_bylevel': 0.8,
 'xgboost__colsample_bytree': 0.8,
 'xgboost__gamma': 0.7,
 'xgboost__learning_rate': 0.25,
 'xgboost__max_depth': 2,
 'xgboost__min_child_weight': 4.0,
 'xgboost__random_state': 1,
 'xgboost__reg_alpha': 0.7,
 'xgboost__reg_lambda': 0.5,
 'xgboost__subsample': 0.8999999999999999,
 'xgboost__tree_method': 'hist'}

### Gerar dataframe de resultados

In [38]:
#resultados_base = pd.DataFrame(columns=['classifier', 'oversampling', 'undersampling', 
#                                        'parameters_random', 'f1_score_random', 'classification_report_random', 'best_params_random'
#                                        'parameters_grid', 'f1_score_grid', 'classification_report_grid', 'best_params_grid'])
#resultados_base.to_csv('models/06_parameters_selection_base.csv', index=False)

In [39]:
resultados_base = pd.read_csv('models/06_parameters_selection_base.csv', low_memory=True)

In [40]:
lista_resultados = [classificador, oversampling, undersampling, 
                    parameters_random, f_score_random, class_report_random, random.best_params_, 
                    parameters_grid, f_score_grid, class_report_grid, grid.best_params_]

In [41]:
resultados_base.loc[len(resultados_base)] = lista_resultados
resultados_base.to_csv('models/06_parameters_selection_base.csv', index=False)