In [1]:
# PIVIC - "Um modelo computacional para identificação de notícias falsas sobre a Covid-19 no Brasil"
# Code: Machine Learning - Supervised Learning
# Author: Anísio Pereira Batista Filho

In [2]:
##Essentials
import ast
import csv
import os
import time

import gensim
import joblib
import numpy as np
import pandas as pd

####Machine learning algorithms
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

###Balancing techniques
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

##Model selection
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV

###Pipeline
from sklearn.pipeline import Pipeline as sklearnPipeline
from imblearn.pipeline import Pipeline as imblearnPipeline
from sklearn.compose import ColumnTransformer


###Metrics
from sklearn.metrics import classification_report, f1_score

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

## Configuração

### Classificador

In [4]:
# Classifier to evaluate; the cells below handle 'randomforest' and 'xgboost'.
classificador: str = "xgboost"

### Oversampling

In [5]:
# When True, a SMOTE step is placed in the pipeline; otherwise that step is a passthrough.
oversampling: bool = True

### Undersampling

In [6]:
# When True, a RandomUnderSampler step is placed in the pipeline; otherwise that step is a passthrough.
undersampling: bool = True

## Abertura de arquivo e criação do dataframe

In [7]:
# Parameter-selection results; filtered further down by the
# classifier / oversampling / undersampling columns.
resultados_base = pd.read_csv(
    'models/06_parameters_selection_base.csv',
    low_memory=True,
)

In [8]:
# Labelled corpus; only its 'label_A' column is used in this notebook.
df = pd.read_csv(
    'data/corpus_labeled/iguais/bases_tcc/03_geracao_carcteristicas_base.csv',
    sep=",",
    low_memory=False,
)

In [9]:
# Target frame with a single 'label' column: 'label_A' shifted up by one
# (presumably so that class ids start at 0 — TODO confirm against the labelling scheme).
y = pd.DataFrame({'label': df.loc[:, 'label_A'] + 1})

In [10]:
# Sanity check: (rows, columns) of the label frame.
y.shape

(3600, 1)

In [11]:
# Feature matrix passed to the pipeline (file name suggests word2vec features — TODO confirm).
X = pd.read_csv(
    "data/corpus_labeled/iguais/bases_tcc/05_word2vec_model_creation_base.csv",
    sep=",",
    low_memory=False,
)

### Separando dados de treinamento e de teste

In [12]:
# Hold-out split is disabled: the evaluation further down splits X / y with
# KFold cross-validation instead of a single train/test partition.
#X_train, X_test, y_train, y_test = train_test_split(X, y.label, test_size = 0.30, random_state = 1)

### Construindo o pré-processador

In [13]:
# Pre-processing step of the pipeline.
# Every transformer below is commented out, so the transformer list is empty and
# remainder='passthrough' forwards all input columns unchanged.
# NOTE(review): the disabled entries reference TfidfVectorizer, OneHotEncoder and
# MinMaxScaler, none of which appear in the imports cell; re-enabling any of them
# would also require importing the corresponding class.
preprocessor = ColumnTransformer(transformers=[
#    ('tf-idf-stemming', TfidfVectorizer(), 'tweet_text_stemming'),
#    ('tf-idf-lemmatization', TfidfVectorizer(), 'tweet_text_lemmatization'),
#    ('one-hot-time-shift', OneHotEncoder(handle_unknown = 'ignore'), ['time_shift']),
#    ('one-hot-location-treated', OneHotEncoder(handle_unknown = 'ignore'), ['location_treated']),
#    ('one-hot-state-location', OneHotEncoder(handle_unknown = 'ignore'), ['state_location']),
#    ('one-hot-region-location', OneHotEncoder(handle_unknown = 'ignore'), ['region_location']),
#    ('min-max-tweet-text-stemming-char-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_char_len']),
#    ('min-max-tweet-text-stemming-word-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_word_len']),
#    ('min-max-tweet-text-stemming-noun', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_noun']),
#    ('min-max-tweet-text-stemming-adj', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_adj']),
#    ('min-max-tweet-text-stemming-verb', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_verb']),
#    ('min-max-tweet-text-stemming-adv', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_stemming_adv']),
#    ('min-max-tweet-text-lemmatization-char-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_char_len']),
#    ('min-max-tweet-text-lemmatization-word-len', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_word_len']),
#    ('min-max-tweet-text-lemmatization-noun', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_noun']),
#    ('min-max-tweet-text-lemmatization-adj', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_adj']),
#    ('min-max-tweet-text-lemmatization-verb', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_verb']),
#    ('min-max-tweet-text-lemmatization-adv', MinMaxScaler(feature_range=(0, 1)), ['tweet_text_lemmatization_adv']),
    ],
    remainder='passthrough'
    )

In [14]:
# Pipeline step for oversampling: SMOTE resampling every class except the
# majority one when enabled, otherwise a no-op 'passthrough' step.
# The flag is tested by truthiness rather than compared to True with `==`.
if oversampling:
    oversampling_selecionado = ('oversampler', SMOTE(sampling_strategy='not majority', random_state=1))
else:
    oversampling_selecionado = ('oversampler', 'passthrough')

In [15]:
# Pipeline step for undersampling: random undersampling of the majority class
# when enabled, otherwise a no-op 'passthrough' step.
# The flag is tested by truthiness rather than compared to True with `==`.
if undersampling:
    undersampling_selecionado = ('undersampler', RandomUnderSampler(sampling_strategy='majority', random_state=1))
else:
    undersampling_selecionado = ('undersampler', 'passthrough')

### Parâmetros

In [16]:
# Pick the stored hyper-parameters for the configured
# (classifier, oversampling, undersampling) combination.
filtra_parametros = resultados_base[
    (resultados_base.classifier == classificador)
    & (resultados_base.oversampling == oversampling)
    & (resultados_base.undersampling == undersampling)
].reset_index()

# Fail with a clear message instead of an opaque KeyError when no row matches.
if filtra_parametros.empty:
    raise ValueError(
        'No parameter-selection result found for classifier=%r, oversampling=%r, undersampling=%r'
        % (classificador, oversampling, undersampling)
    )

# The stored value is the text form of a dict whose keys carry a
# '<classifier>__' prefix; strip the prefix so the keys are plain parameter names.
parametros = filtra_parametros.best_params_grid[0]
parametros = parametros.replace(classificador+'__', '')
# ast.literal_eval only accepts Python literals, so text coming from the CSV
# cannot execute arbitrary code the way eval() could.
parametros = ast.literal_eval(parametros)
parametros

{'booster': 'gbtree',
 'colsample_bylevel': 0.8999999999999999,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.11,
 'max_depth': 19,
 'min_child_weight': 1.5,
 'random_state': 1,
 'reg_alpha': 0.5,
 'reg_lambda': 0.2,
 'subsample': 0.5,
 'tree_method': 'hist'}

 ### Seleciona classificador para o pipeline

In [17]:
# Map each supported classifier name to its estimator class and to the
# hyper-parameter names that are read from `parametros` for it.
construtores_classificador = {
    'randomforest': (
        RandomForestClassifier,
        ('max_depth', 'max_features', 'min_samples_leaf', 'n_estimators', 'random_state'),
    ),
    'xgboost': (
        XGBClassifier,
        ('booster', 'colsample_bylevel', 'colsample_bytree', 'gamma', 'learning_rate',
         'max_depth', 'min_child_weight', 'random_state', 'reg_alpha', 'reg_lambda',
         'subsample', 'tree_method'),
    ),
}

# An unsupported name used to fall through silently and only surfaced later as a
# NameError when the pipeline was built; reject it here with a clear message.
if classificador not in construtores_classificador:
    raise ValueError(
        'Unsupported classifier %r; expected one of %s'
        % (classificador, sorted(construtores_classificador))
    )

classe_estimador, nomes_parametros = construtores_classificador[classificador]
# Only the listed parameters are forwarded; a missing one raises KeyError.
classificador_selecionado = (
    classificador,
    classe_estimador(**{nome: parametros[nome] for nome in nomes_parametros}),
)

### Criando o modelo usando pipeline

In [18]:
# Full model: preprocessing -> oversampling -> undersampling -> classifier.
# The imblearn pipeline is used because the sampler steps are imblearn objects.
model = imblearnPipeline([
    ('preprocessor', preprocessor),
    oversampling_selecionado,
    undersampling_selecionado,
    classificador_selecionado,
])

### Avaliando os modelos

In [19]:
# 10-fold cross-validation splitter; shuffling is seeded so the folds are reproducible.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

In [20]:
# Accumulated per-fold evaluation results.
# Load the existing results file when there is one; otherwise start from an
# empty frame with the expected columns, so the notebook also runs when the
# file has not been created yet.
arquivo_resultados = 'models/07_model_avaliation_base.csv'
if os.path.exists(arquivo_resultados):
    resultados_fscore = pd.read_csv(arquivo_resultados)
else:
    resultados_fscore = pd.DataFrame(columns=['classifier', 'oversampling', 'undersampling', 'classification_report', 'f_score_news', 'f_score_opinion', 'f_score_fake'])

In [21]:
# K-fold evaluation: fit the pipeline on each training split, score it on the
# held-out split and append the per-class F1 scores to the results frame.

# Fix the label order once, from the full target, so the F1 array and the report
# always have one entry per class in the same order as target_names, even if a
# fold happens to contain no sample of some class.
rotulos = sorted(y.label.unique())

for split_num, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print('Split:', split_num)
    # KFold yields positional indices, so select by position on both X and y.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.label.iloc[train_index], y.label.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # average=None returns one F1 value per class, in the order of `rotulos`.
    f_score = f1_score(y_test, pred_values, labels=rotulos, average=None)
    print(f_score)
    class_report = classification_report(y_test, pred_values, labels=rotulos,
                                         target_names=['news', 'opinion', 'fake_news'])

    lista_resultados = [classificador, oversampling, undersampling,
                        class_report, f_score[0], f_score[1], f_score[2]]
    resultados_fscore.loc[len(resultados_fscore)] = lista_resultados
    # Written after every fold, so results of completed folds are on disk even
    # if a later fold fails.
    resultados_fscore.to_csv('models/07_model_avaliation_base.csv', index=False)

Split: 1
[0.5        0.81276596 0.34146341]
Split: 2
[0.56338028 0.86507937 0.27027027]
Split: 3
[0.56774194 0.81702128 0.31578947]
Split: 4
[0.49315068 0.8125     0.31914894]
Split: 5
[0.50359712 0.84630739 0.375     ]
Split: 6
[0.47482014 0.81967213 0.38709677]
Split: 7
[0.5034965  0.83636364 0.36585366]
Split: 8
[0.46451613 0.79915433 0.41304348]
Split: 9
[0.4875     0.80168776 0.1627907 ]
Split: 10
[0.43971631 0.77731092 0.27184466]


### Salvando o modelo

In [22]:
# Short tag used in the model file name for each supported classifier.
tags_classificador = {'randomforest': 'RF', 'xgboost': 'XGB'}
if classificador not in tags_classificador:
    # Previously an unknown name left `tag` undefined and failed with a NameError below.
    raise ValueError(
        'Unsupported classifier %r; expected one of %s'
        % (classificador, sorted(tags_classificador))
    )
tag = tags_classificador[classificador]

# After the cross-validation loop `model` only holds the fit from the last fold,
# i.e. it has never seen that fold's test split. Refit on the full data set so
# the persisted model is trained on every labelled example.
model.fit(X, y.label)

filename = 'models/model-'+tag+'_OV_'+str(oversampling)+'_UN_'+str(undersampling)+'.sav'
joblib.dump(model, filename, compress=3)

['models/model-XGB_OV_True_UN_True.sav']