In [1]:
# PIVIC - "Um modelo computacional para identificação de notícias falsas sobre a Covid-19 no Brasil"
# Code: Machine Learning - Supervised Learning
# Author: Anísio Pereira Batista Filho

In [2]:
##Essentials
import os
import csv
import numpy as np
import pandas as pd
import gensim
import time

####Machine learning algorithms
#from xgboost import XGBClassifier
#from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

###Balancing techniques
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

##Model selection
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV

###Pipeline
from sklearn.pipeline import Pipeline as sklearnPipeline
from imblearn.pipeline import Pipeline as imblearnPipeline
from sklearn.compose import ColumnTransformer

### Preprocessing
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.feature_extraction.text import  TfidfVectorizer

###Metrics
from sklearn.metrics import classification_report, f1_score

In [3]:
pd.set_option("display.max_columns", None)

## Configuração

### Classficador

In [4]:
classificador = 'randomforest'

### Oversampling

In [5]:
oversampling = True

### Undersampling

In [6]:
undersampling = True

## Abertura de arquivo e criação do dataframe

In [7]:
df = pd.read_csv('data/corpus_labeled/iguais/bases_tcc/03_geracao_carcteristicas_base.csv', sep=",", low_memory=False)

In [8]:
#resultados_base = pd.read_csv('models/05_model_creation.csv', low_memory=True)

In [9]:
y = pd.DataFrame()
y['label'] = df.loc[:,'label_A']
y.label += 1

In [10]:
y.shape

(3600, 1)

In [11]:
X = pd.read_csv("data/corpus_labeled/iguais/bases_tcc/05_word2vec_model_creation_base.csv", sep=",", low_memory=False)

### Separando dados de treinamento e de teste

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y.label, test_size = 0.30, random_state = 1)

### Construindo o pré-processador

In [13]:
preprocessor = ColumnTransformer(transformers=[
#    ('tf-idf-stemming', TfidfVectorizer(), 'tweet_text_stemming'),
#    ('tf-idf-lemmatization', TfidfVectorizer(), 'tweet_text_lemmatization'),
#    ('one-hot-time-shift', OneHotEncoder(handle_unknown = 'ignore'), ['time_shift']),
#    ('one-hot-location-treated', OneHotEncoder(handle_unknown = 'ignore'), ['location_treated']),
#    ('one-hot-state-location', OneHotEncoder(handle_unknown = 'ignore'), ['state_location']),
#    ('one-hot-region-location', OneHotEncoder(handle_unknown = 'ignore'), ['region_location']),
#    ('min-max-tweet-text-stemming-char-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_char_len']),
#    ('min-max-tweet-text-stemming-word-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_word_len']),
#    ('min-max-tweet-text-stemming-noun', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_noun']),
#    ('min-max-tweet-text-stemming-adj', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_adj']),
#    ('min-max-tweet-text-stemming-verb', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_verb']),
#    ('min-max-tweet-text-stemming-adv', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_adv']),
#    ('min-max-tweet-text-lemmatization-char-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_char_len']),
#    ('min-max-tweet-text-lemmatization-word-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_word_len']),
#    ('min-max-tweet-text-lemmatization-noun', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_noun']),
#    ('min-max-tweet-text-lemmatization-adj', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_adj']),
#    ('min-max-tweet-text-lemmatization-verb', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_verb']),
#    ('min-max-tweet-text-lemmatization-adv', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_adv']),
    ],
    remainder='passthrough'
    )

In [14]:
if oversampling == True:
    oversampling_selecionado = ('oversampler', SMOTE(sampling_strategy='not majority', random_state=1))
else:
    oversampling_selecionado = ('oversampler', 'passthrough')

In [15]:
if undersampling == True:
    undersampling_selecionado = ('undersampler', RandomUnderSampler(sampling_strategy='majority', random_state=1))
else:
    undersampling_selecionado = ('undersampler', 'passthrough')

 ### Seleciona classificador para o pipeline

In [16]:
if classificador == 'randomforest':
    classificador_selecionado = ('randomforest', RandomForestClassifier())
elif classificador == 'xgboost':
    classificador_selecionado = ('xgboost', XGBClassifier())

### Criando o modelo usando pipeline

In [17]:
model = imblearnPipeline(steps=[
    ('preprocessor', preprocessor),
    oversampling_selecionado,
    undersampling_selecionado,
    
    classificador_selecionado,
])

### Parâmetros

In [18]:
if classificador == 'randomforest':
    parameters_random = { 
    'randomforest__min_samples_leaf': list(range(1, 60, 2)),
    'randomforest__max_features': list(np.arange(0.1, 1, 0.2)),
    'randomforest__max_depth': list(range(6, 20, 2)),
    'randomforest__n_estimators': list(range(10, 200, 10)),
    'randomforest__random_state': [1],
              }

    
elif classificador == 'xgboost':
    parameters_random = {  
    'xgboost__nthread': [4], #when use hyperthread, xgboost may become slower
    'xgboost__learning_rate': [0.01], #so called `eta` value
    'xgboost__max_depth': [7],
    #'xgboost__min_child_weight': [11],
    'xgboost__subsample': [0.8],
    'xgboost__colsample_bytree': [0.8],
    'xgboost__n_estimators': [1000], #number of trees, change it to 1000 for better results
    #'xgboost__missing': [-999],
    #'xgboost__seed': [1337],
    #'xgboost__booster': ['gbdt'],
    #'xgboost__metric': ['multiclass'],
    'xgboost__eval_metric': ['mlogloss'],
    #'xgboost__silent': [False], 
    #'xgboost__scale_pos_weight': [1],  
    #'xgboost__subsample': [0.8],
    'xgboost__objective': ['multi:softmax'], 
    'xgboost__reg_alpha': [0.3],
    'xgboost__gamma': [0, 1],
    'xgboost__use_label_encoder': [False],
    'xgboost__num_class': [3]
            }

### Randomized Search Cross Validation

In [19]:
random = RandomizedSearchCV(estimator=model, param_distributions=parameters_random, n_iter=50, cv=3, scoring='f1_weighted', n_jobs=1, verbose=3, random_state=1)

### Treinamento

In [20]:
random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV 1/3] END randomforest__max_depth=6, randomforest__max_features=0.1, randomforest__min_samples_leaf=25, randomforest__n_estimators=80, randomforest__random_state=1;, score=0.626 total time=   1.6s
[CV 2/3] END randomforest__max_depth=6, randomforest__max_features=0.1, randomforest__min_samples_leaf=25, randomforest__n_estimators=80, randomforest__random_state=1;, score=0.639 total time=   1.3s
[CV 3/3] END randomforest__max_depth=6, randomforest__max_features=0.1, randomforest__min_samples_leaf=25, randomforest__n_estimators=80, randomforest__random_state=1;, score=0.608 total time=   1.3s
[CV 1/3] END randomforest__max_depth=14, randomforest__max_features=0.30000000000000004, randomforest__min_samples_leaf=21, randomforest__n_estimators=130, randomforest__random_state=1;, score=0.643 total time=   6.0s
[CV 2/3] END randomforest__max_depth=14, randomforest__max_features=0.30000000000000004, randomforest__min_samples_leaf=

### Predição

In [21]:
y_pred_random = random.best_estimator_.predict(X_test)

### F1 score

In [22]:
f_score_random = f1_score(y_test, y_pred_random, average=None)
f_score_random

array([0.54135338, 0.8103321 , 0.30769231])

### Classification report

In [23]:
class_report_random = classification_report(y_pred_random, y_test, target_names=['news', 'opinion', 'fake'])
print(class_report_random)

              precision    recall  f1-score   support

        news       0.63      0.48      0.54       302
     opinion       0.77      0.86      0.81       641
        fake       0.31      0.31      0.31       137

    accuracy                           0.68      1080
   macro avg       0.57      0.55      0.55      1080
weighted avg       0.67      0.68      0.67      1080



### Melhor parâmetro

In [24]:
random.best_params_

{'randomforest__random_state': 1,
 'randomforest__n_estimators': 190,
 'randomforest__min_samples_leaf': 3,
 'randomforest__max_features': 0.1,
 'randomforest__max_depth': 18}

In [25]:
if classificador == 'randomforest':
    parameters_grid = { 
    'randomforest__min_samples_leaf': list(range(2, 4, 1)),
    'randomforest__max_features': list(np.arange(0.1, 0.3, 0.1)),
    'randomforest__max_depth': list(range(16, 20, 1)),
    'randomforest__n_estimators': list(range(180, 200, 5)),
    'randomforest__random_state': [1],
              }


### Grid Search Cross Validation

In [26]:
grid = GridSearchCV(estimator=model, param_grid=parameters_grid, cv=10, scoring='f1_weighted', n_jobs=1, verbose=3)

### Treinamento

In [27]:
grid.fit(X_train, y_train)

Fitting 10 folds for each of 64 candidates, totalling 640 fits
[CV 1/10] END randomforest__max_depth=16, randomforest__max_features=0.1, randomforest__min_samples_leaf=2, randomforest__n_estimators=180, randomforest__random_state=1;, score=0.650 total time=   5.4s
[CV 2/10] END randomforest__max_depth=16, randomforest__max_features=0.1, randomforest__min_samples_leaf=2, randomforest__n_estimators=180, randomforest__random_state=1;, score=0.676 total time=   5.2s
[CV 3/10] END randomforest__max_depth=16, randomforest__max_features=0.1, randomforest__min_samples_leaf=2, randomforest__n_estimators=180, randomforest__random_state=1;, score=0.682 total time=   5.2s
[CV 4/10] END randomforest__max_depth=16, randomforest__max_features=0.1, randomforest__min_samples_leaf=2, randomforest__n_estimators=180, randomforest__random_state=1;, score=0.720 total time=   5.4s
[CV 5/10] END randomforest__max_depth=16, randomforest__max_features=0.1, randomforest__min_samples_leaf=2, randomforest__n_estim

### Predição

In [28]:
y_pred_grid = grid.best_estimator_.predict(X_test)

### F1 score

In [29]:
f_score_grid = f1_score(y_test, y_pred_grid, average=None)
f_score_grid

array([0.52091255, 0.79912023, 0.25185185])

### Classification report

In [30]:
class_report_grid = classification_report(y_pred_grid, y_test, target_names=['news', 'opinion', 'fake'])
print(class_report_grid)

              precision    recall  f1-score   support

        news       0.60      0.46      0.52       296
     opinion       0.76      0.84      0.80       650
        fake       0.25      0.25      0.25       134

    accuracy                           0.66      1080
   macro avg       0.54      0.52      0.52      1080
weighted avg       0.65      0.66      0.65      1080



### Melhor parâmetro

In [31]:
grid.best_params_

{'randomforest__max_depth': 17,
 'randomforest__max_features': 0.2,
 'randomforest__min_samples_leaf': 3,
 'randomforest__n_estimators': 185,
 'randomforest__random_state': 1}

### Gerar dataframe de resultados

In [32]:
#resultados_base = pd.DataFrame(columns=['classifier', 'oversampling', 'undersampling', 
#                                        'parameters_random', 'f1_score_random', 'classification_report_random', 'best_params_random'
#                                        'parameters_grid', 'f1_score_grid', 'classification_report_grid', 'best_params_grid'])
#resultados_base.to_csv('models/06_parameters_selection.csv', index=False)

In [33]:
resultados_base = pd.read_csv('models/06_parameters_selection.csv', low_memory=True)

In [36]:
lista_resultados = [classificador, oversampling, undersampling, 
                    parameters_random, f_score_random, class_report_random, random.best_params_, 
                    parameters_grid, f_score_grid, class_report_grid, grid.best_params_]

In [37]:
resultados_base.loc[len(resultados_base)] = lista_resultados
resultados_base.to_csv('models/06_parameters_selection.csv', index=False)