In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=6144)])
logical_gpus = tf.config.experimental.list_logical_devices('GPU')

In [3]:
import sys
import os

SEED = 42
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['HOROVOD_FUSION_THRESHOLD']='0'
os.environ['PYTHONHASHSEED']=str(SEED)

import tfdeterminism
import random
import numpy as np
import tensorflow as tf
import skmultilearn
import pandas as pd
from sklearn.metrics import classification_report, f1_score
import numpy as np
from sklearn import model_selection

In [4]:
print ("Tensorflow version:", tf.__version__)
print ("Numpy version:", np.__version__)
print ("tfdeterminism version", tfdeterminism.__version__)
print ("skmultilearn version", "0.2.0")

Tensorflow version: 2.2.0
Numpy version: 1.19.2
tfdeterminism version 0.3.0
skmultilearn version 0.2.0


In [5]:
# !git clone https://github.com/luinardi/hypermapper.git
test_set = pd.read_csv('sets/Set 0.csv')
set_1 = pd.read_csv('sets/Set 1.csv')
set_2 = pd.read_csv('sets/Set 2.csv')
set_3 = pd.read_csv('sets/Set 3.csv')
set_4 = pd.read_csv('sets/Set 4.csv')


def string_to_array_serie(serie):
    return [[int(i) for i in t.replace("[", "").replace("]", "").split()] for t in serie]

test_set["Senado"] = string_to_array_serie(test_set["Senado"])
set_1["Senado"] = string_to_array_serie(set_1["Senado"])
set_2["Senado"] = string_to_array_serie(set_2["Senado"])
set_3["Senado"] = string_to_array_serie(set_3["Senado"])
set_4["Senado"] = string_to_array_serie(set_4["Senado"])

test_set["Referenda"] = string_to_array_serie(test_set["Referenda"])
set_1["Referenda"] = string_to_array_serie(set_1["Referenda"])
set_2["Referenda"] = string_to_array_serie(set_2["Referenda"])
set_3["Referenda"] = string_to_array_serie(set_3["Referenda"])
set_4["Referenda"] = string_to_array_serie(set_4["Referenda"])

In [6]:
train_sets = [set_1, set_2, set_3, set_4]

full_train = pd.concat(train_sets)
full_train

Unnamed: 0,Numero ato,Ementa + texto completo,Senado,Referenda
0,10095,dispõe sobre o comitê consultivo de nanotecnol...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,10092,promulga o protocolo de integração educativa e...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,10085,dispõe sobre o programa forças no esporte segu...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]"
3,10081,altera o decreto n 8713 de 15 de abril de 2016...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]"
4,10083,autoriza o emprego das forças armadas na garan...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
1323,3336,dá nova redação aos arts 11 15 16 19 e 30 do...,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
1324,3330,dispõe sobre a redução do consumo de energia e...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1325,3338,aprova a estrutura regimental e o quadro demon...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]"
1326,3328,altera o decreto 2889 de 21 12 1998 que dispõe...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [7]:
import matplotlib.pyplot as plt
import numpy as np
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
import nltk
nltk.download('punkt')
from nltk import tokenize

# max_senten_len = 40
# max_senten_num = 48
max_senten_len = 49
max_senten_num = 68

def preprocess_HANS(dataset, max_sentence_num, max_sentence_len, tokenizer):
    data = np.zeros((len(dataset), max_sentence_num, max_sentence_len), dtype='int32')
    for i, document in enumerate(dataset):
        tokenized_document=tokenize.sent_tokenize(document)
        for j, sent in enumerate(tokenized_document):
            if j< max_sentence_num:
                wordTokens = text_to_word_sequence(sent)
                k=0
                for _, word in enumerate(wordTokens):
                    try:
                        if k<max_sentence_len: #and tokenizer.word_index[word]<26486:
                            data[i,j,k] = tokenizer.word_index[word]
                            k=k+1
                    except:
                        pass
    return data

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, oov_token="OOV")
tokenizer.fit_on_texts(full_train["Ementa + texto completo"])

Train_X = preprocess_HANS(full_train["Ementa + texto completo"], max_senten_num, max_senten_len, tokenizer)
Test_X = preprocess_HANS(test_set["Ementa + texto completo"], max_senten_num, max_senten_len, tokenizer)

# The following sets will be used for validation.
set_1_X = preprocess_HANS(set_1["Ementa + texto completo"], max_senten_num, max_senten_len, tokenizer)
set_2_X = preprocess_HANS(set_2["Ementa + texto completo"], max_senten_num, max_senten_len, tokenizer)
set_3_X = preprocess_HANS(set_3["Ementa + texto completo"], max_senten_num, max_senten_len, tokenizer)
set_4_X = preprocess_HANS(set_4["Ementa + texto completo"], max_senten_num, max_senten_len, tokenizer)

sets_X = [set_1_X, set_2_X, set_3_X, set_4_X]

word_index = tokenizer.word_index

[nltk_data] Downloading package punkt to /home/caiocampos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
Train_Y_sen = full_train["Senado"].values
Test_Y_sen = test_set["Senado"].values
Train_Y_minist = full_train["Referenda"].values
Test_Y_minist = test_set["Referenda"].values

def get_class_weights(n_classes, labels):
    counters = np.zeros(n_classes)
    for label in labels:
        for i in range(len(label)):
            if(label[i] == 1):
                counters[i]+=1
    greater_class = counters[np.argmax(counters)]
    weights = []
    for i in range(n_classes):
        weights.append(greater_class/counters[i])
    return weights

classes_w_sen = get_class_weights(len(Train_Y_sen[0]), Train_Y_sen)
classes_w_sen = {v: k for v, k in enumerate(classes_w_sen)}
classes_w_minist = get_class_weights(len(Train_Y_minist[0]), Train_Y_minist)
classes_w_minist = {v: k for v, k in enumerate(classes_w_minist)}

In [9]:
set_1_Y_sen = set_1["Senado"].values
set_2_Y_sen = set_2["Senado"].values
set_3_Y_sen = set_3["Senado"].values
set_4_Y_sen = set_4["Senado"].values
sets_Y_sen = [set_1_Y_sen, set_2_Y_sen, set_3_Y_sen, set_4_Y_sen]

set_1_Y_minist = set_1["Referenda"].values
set_2_Y_minist = set_2["Referenda"].values
set_3_Y_minist = set_3["Referenda"].values
set_4_Y_minist = set_4["Referenda"].values
sets_Y_minist = [set_1_Y_minist, set_2_Y_minist, set_3_Y_minist, set_4_Y_minist]

In [10]:
from keras.layers import Embedding, Input, Dense, GRU, Bidirectional, TimeDistributed
from keras import backend as K
from keras import optimizers
from keras.models import Model
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.np_utils import to_categorical

In [11]:
#pre-trained Glove embedding
embeddings_index = {}
f = open('glove_s100.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print(word, "could not be embedded.")
f.close()

print('\nFound %s word vectors.\n' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index.items())+1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        try:
            embedding_matrix[i] = embedding_vector
        except:
            print(word, "could not be indexed.")

r$ could not be embedded.
00 could not be embedded.
三藏法師玄奘奉 could not be embedded.
r$ could not be embedded.

Found 929594 word vectors.

0 could not be indexed.
00 could not be indexed.


In [12]:
# See https://www.kaggle.com/sermakarevich/hierarchical-attention-network
# See https://gist.github.com/cbaziotis/7ef97ccf71cbc14366835198c09809d2


def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatibl|e with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
#         self.init = initializers.get('glorot_uniform')
        self.init = initializers.glorot_uniform(seed=SEED)

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

In [13]:
#Taken from old keras code. The following is used only during training for visualization.
def get_f1(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [14]:
from sklearn.metrics import classification_report
def predict_classes(probs, tresh=0.5, consider_first=False):
    predicted_classes = []
    for prob in probs:
        predicted_class = np.zeros(len(probs[0])).astype(int)
        for i in range(len(prob)):
            if(prob[i]>=tresh):
                predicted_class[i] = 1
        if np.sum(predicted_class) == 0 and consider_first:
            position_1 = (np.argsort(prob)[::-1])[0]
            predicted_class[position_1] = 1
        predicted_classes.append(predicted_class.tolist())
    
    return predicted_classes

In [15]:
def acerto_absoluto(test_y_sen, test_pred, number_labels):
    acerto_100 = 0
    test_prediction_binary = predict_classes(test_pred)
    for i in range(len(test_prediction_binary)):
        acertos_internos = 0
        for j in range(len(test_prediction_binary[i])):
            if test_y_sen[i][j] == test_prediction_binary[i][j]:
                acertos_internos+=1
        if acertos_internos == number_labels:
            acerto_100+=1
    return acerto_100/len(test_prediction_binary)*100

In [16]:
"""
Create Keras functional model for hierarchical attention network
"""
from sklearn.metrics import f1_score, precision_score, recall_score
from keras.optimizers import Adam
import imp

def HAN_model(number_of_classes):
    K.clear_session()

    os.environ['PYTHONHASHSEED']=str(SEED)
    np.random.seed(SEED)
    random.seed(SEED)
    
    tf.random.set_seed(SEED)
    os.environ['TF_CUDNN_DETERMINISM']='1'
    os.environ['TF_DETERMINISTIC_OPS']='1'
    os.environ['HOROVOD_FUSION_THRESHOLD']='0'
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
    tf.compat.v1.keras.backend.set_session(sess)
    
    
    embedding_layer = Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_shape=(Train_X.shape[1], ) )

    # Words level attention model
    word_input = Input(shape=(max_senten_len,), dtype='int32',name='word_input')
    word_sequences = embedding_layer(word_input)
    word_gru = Bidirectional(GRU(50, return_sequences=True, kernel_initializer=initializers.glorot_uniform(seed=SEED)),name='word_gru')(word_sequences)
    word_att = AttentionWithContext()(word_gru)#(word_dense)
    wordEncoder = Model(inputs = word_input,outputs = word_att)

    # Sentence level attention model
    sent_input = Input(shape=(max_senten_num, max_senten_len), dtype='int32',name='sent_input')
    sent_encoder = TimeDistributed(wordEncoder,name='sent_linking')(sent_input)
    sent_gru = Bidirectional(GRU(50, return_sequences=True, kernel_initializer=initializers.glorot_uniform(seed=SEED)),name='sent_gru')(sent_encoder)
    sent_att = AttentionWithContext()(sent_gru)#(sent_dense)
    preds = Dense(number_of_classes, activation='sigmoid', input_shape=(Train_X.shape[1],), kernel_initializer=initializers.glorot_uniform(seed=SEED))(sent_att)

    model = Model(sent_input, preds)
    return model

## Hyperparameter optimization with bayesian optimization and Cross-Validation

In [17]:
sen_classes_names = ["Saúde", "Relações Exteriores", "Meio ambiente", "Educação, cultura e esporte", "Segurança Pública e Defesa", "Trabalho e Previdência", "Agricultura, pecuária e pesca", "Ciência, tecnologia e comunicações", "Social", "Indústria, comércio, turismo, transporte/transporte de mercadorias", "Economia, planejamento e sistema financeiro", "Assuntos internos,  cargos e comissões, Estado", "Tributos", "Minas e Energia", "Justiça e direitos"]
minist_classes_names = ["Saúde", "Relações Exteriores", "Meio ambiente", "Educação, cultura e esporte", "Justiça e Segurança", "Trabalho e Previdência", "Transporte/transporte de mercadorias", "Agricultura, pecuária e pesca", "Ciência e tecnologia", "Social", "Presidência", "Economia e planejamento", "Indústria, comércio, obras públicas, turismo"]
stdout = sys.stdout

In [17]:
def generate_validation_csv(f1s_per_class, precision_per_class, recall_per_class, prefix, names):
    final_metrics = (f1s_per_class, np.array([np.mean(f1s_per_class, axis=0)]), np.array([np.std(f1s_per_class, axis=0)]),
                     precision_per_class, np.array([np.mean(precision_per_class, axis=0)]), np.array([np.std(precision_per_class, axis=0)]),
                     recall_per_class, np.array([np.mean(recall_per_class, axis=0)]), np.array([np.std(recall_per_class, axis=0)]) )
    f1s_per_class_pd = pd.DataFrame(np.concatenate(final_metrics, axis=0))
    f1s_per_class_pd.columns = names
    f1s_per_class_pd.index=["validation set 1 f1", "validation set 2 f1", "validation set 3 f1", "validation set 4 f1", "validation sets mean f1", "validation std f1",
                            "validation set 1 precision", "validation set 2 precision", "validation set 3 precision", "validation set 4 precision", "validation sets mean precision", "validation std precision",
                            "validation set 1 recall", "validation set 2 recall", "validation set 3 recall", "validation set 4 recall", "validation sets mean recall", "validation std recall"]
    f1s_per_class_pd.to_csv("../optimization/validation_results/"+prefix+"iteration_"+str(iterations)+".csv", index=True, header=True)

## Senado - otimização

In [None]:
import os
import sys
os.chdir("/home/caiocampos/andre/radar_wisemap")
sys.path.append('hypermapper/scripts/')
import hypermapper

iterations = 0

def optimize_HAN(parameters):
    global iterations
    f1s = 0
    f1s_per_class = []
    precision_per_class = []
    recall_per_class = []
    beta_2_real_p = [0.99,0.999,0.9999]  # O parâmetro beta2 é traduzido de categórico para um dos três reais descritos no paper do Adam.
    
    for i in range(len(train_sets)):        
        sets_index = []
        for t in range(len(train_sets)):
            if t!=i:
                sets_index.append(t)

        train_X = np.concatenate(np.array(sets_X)[sets_index])
        validation_X = sets_X[i]

        train_Y_sen = np.concatenate(np.array(sets_Y_sen)[sets_index])
        train_Y_sen = np.array([np.array(t) for t in train_Y_sen])
        validation_Y_sen = sets_Y_sen[i]
        validation_Y_sen = np.array([np.array(v) for v in validation_Y_sen])

        model = HAN_model(len(Train_Y_sen[0]))
        opt = Adam(lr=parameters['learning_rate'], beta_1=parameters['beta1'], beta_2=beta_2_real_p[parameters['beta2']])
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
        history = model.fit(x=np.array(train_X), y=np.array(train_Y_sen), epochs=parameters['epochs'], validation_split=0.0, class_weight=classes_w_sen, batch_size=parameters['batch_size'], shuffle=False, verbose=0)
        validation_prediction = model.predict(np.array(validation_X))

        f1s+=f1_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average='macro')
        f1s_per_class.append(f1_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average=None))
        precision_per_class.append(precision_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average=None))
        recall_per_class.append(recall_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average=None))
    generate_validation_csv(f1s_per_class, precision_per_class, recall_per_class, "HAN_sen_", sen_classes_names)
    f1_loss=1-f1s/4
    print("Iteration "+str(iterations)+":\nlearning_rate: ", parameters['learning_rate'], " || beta1: ", parameters['beta1'], " || beta2: ", beta_2_real_p[parameters['beta2']], " || epochs: ", parameters['epochs'], " || batch_size: ", parameters['batch_size'], " || (1 - macro_F1): ", f1_loss)
    iterations+=1
    return f1_loss

os.chdir("hypermapper/")
stdout = sys.stdout
print(os.getcwd())
hypermapper.optimize("../optimization/HAN_scenario.json", optimize_HAN)
os.chdir("/home/caiocampos/andre/radar_wisemap/")
sys.stdout = stdout

In [18]:
optimization_results = pd.read_csv("optimization/sen_han_validation_hypermapper_output.csv")
best_parameters_idx = optimization_results["1 - F1"].idxmin()
parameters = pd.DataFrame(optimization_results.iloc[best_parameters_idx]).transpose()
parameters

Unnamed: 0,learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
90,0.007347,0.802171,1.0,21.0,45.0,0.199191,37732822.0


In [19]:
validation_results = pd.read_csv("optimization/validation_results/HAN_sen_iteration_"+str(best_parameters_idx)+".csv", index_col=0)
validation_results

Unnamed: 0,Saúde,Relações Exteriores,Meio ambiente,"Educação, cultura e esporte",Segurança Pública e Defesa,Trabalho e Previdência,"Agricultura, pecuária e pesca","Ciência, tecnologia e comunicações",Social,"Indústria, comércio, turismo, transporte/transporte de mercadorias","Economia, planejamento e sistema financeiro","Assuntos internos, cargos e comissões, Estado",Tributos,Minas e Energia,Justiça e direitos
validation set 1 f1,0.745098,0.965636,0.764706,0.8,0.857143,0.875,0.787234,0.753623,0.77,0.637931,0.805755,0.864017,0.875,0.789474,0.613333
validation set 2 f1,0.615385,0.956672,0.774194,0.784314,0.82243,0.833333,0.8,0.819672,0.701031,0.666667,0.731518,0.862106,0.8875,0.873563,0.651163
validation set 3 f1,0.68,0.958621,0.820513,0.783505,0.858586,0.842105,0.863158,0.84058,0.694444,0.715447,0.737643,0.864461,0.888889,0.891566,0.607595
validation set 4 f1,0.738462,0.960413,0.865672,0.862745,0.847291,0.830189,0.722892,0.828571,0.787879,0.733333,0.805861,0.869927,0.890173,0.891566,0.682927
validation sets mean f1,0.694736,0.960335,0.806271,0.807641,0.846362,0.845157,0.793321,0.810612,0.738339,0.688345,0.770194,0.865128,0.885391,0.861542,0.638754
validation std f1,0.05235,0.003334,0.040274,0.032487,0.014485,0.017775,0.049797,0.033728,0.041156,0.03798,0.03568,0.002909,0.006073,0.042253,0.030505
validation set 1 precision,0.76,0.972318,0.787879,0.8,0.837838,0.875,0.787234,0.787879,0.785714,0.660714,0.8,0.878723,0.958904,0.857143,0.821429
validation set 2 precision,0.551724,0.975265,0.827586,0.784314,0.807339,0.909091,0.818182,0.925926,0.755556,0.672131,0.789916,0.918981,0.959459,0.791667,0.736842
validation set 3 precision,0.73913,0.972028,0.8,0.808511,0.934066,0.75,0.87234,0.828571,0.707547,0.758621,0.769841,0.898004,0.894118,0.840909,0.75
validation set 4 precision,0.648649,0.96875,0.90625,0.897959,0.895833,0.758621,0.833333,0.852941,0.804124,0.77193,0.80292,0.906725,0.927711,0.860465,0.823529


## Final senate evaluation on test data

In [21]:
beta_2_real_p = [0.99,0.999,0.9999]

Train_Y_sen = np.array([np.array(t) for t in Train_Y_sen])
Test_Y_sen = np.array([np.array(t) for t in Test_Y_sen])

model = HAN_model(len(Train_Y_sen[0]))

opt = Adam(lr = parameters["learning_rate"].iloc[0], beta_1 = parameters["beta1"].iloc[0], beta_2 = beta_2_real_p[int(parameters["beta2"].iloc[0])])
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
history = model.fit(x=np.array(Train_X), y=np.array(Train_Y_sen), epochs=21, validation_split=0.0, class_weight=classes_w_sen, batch_size=45, shuffle=False)
test_prediction = model.predict(np.array(Test_X))
print("Senado - HAN tuned")
print(classification_report(y_true=np.array(Test_Y_sen), y_pred=predict_classes(test_prediction), digits=3, target_names=sen_classes_names))

Epoch 1/21
Epoch 2/21
Epoch 3/21
Epoch 4/21
Epoch 5/21
Epoch 6/21
Epoch 7/21
Epoch 8/21
Epoch 9/21
Epoch 10/21
Epoch 11/21
Epoch 12/21
Epoch 13/21
Epoch 14/21
Epoch 15/21
Epoch 16/21
Epoch 17/21
Epoch 18/21
Epoch 19/21
Epoch 20/21
Epoch 21/21
Senado - HAN tuned
                                                                    precision    recall  f1-score   support

                                                             Saúde      0.710     0.733     0.721        30
                                               Relações Exteriores      0.986     0.966     0.976       294
                                                     Meio ambiente      0.833     0.882     0.857        34
                                       Educação, cultura e esporte      0.778     0.686     0.729        51
                                        Segurança Pública e Defesa      0.869     0.894     0.882       104
                                            Trabalho e Previdência      0.880     0.759  

In [20]:
#Saving final model
model.save('models/HAN_sen_model_final')
pd.DataFrame(predict_classes(test_prediction)).to_csv("test_results/HAN_sen.csv", index=None, header=None)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: models/HAN_sen_model_final/assets


In [24]:
for i in range (len(Test_Y_sen[0])+1):
    print(str(i)+"/"+str(len(Test_Y_sen[0]))+" classes:", acerto_absoluto(Test_Y_sen, test_prediction, i))

0/15 classes: 0.0
1/15 classes: 0.0
2/15 classes: 0.0
3/15 classes: 0.0
4/15 classes: 0.0
5/15 classes: 0.0
6/15 classes: 0.0
7/15 classes: 0.0
8/15 classes: 0.0
9/15 classes: 0.0
10/15 classes: 0.0
11/15 classes: 0.22371364653243847
12/15 classes: 1.1185682326621924
13/15 classes: 6.785980611483968
14/15 classes: 15.06338553318419
15/15 classes: 76.80835197613722


## Referenda - Otimização

In [18]:
stdout = sys.stdout
os.chdir("/home/caiocampos/andre/radar_wisemap")
sys.path.append('hypermapper/scripts/')
import hypermapper

iterations = 0

def optimize_HAN_minist(parameters):
    global iterations
    f1s = 0
    f1s_per_class = []
    precision_per_class = []
    recall_per_class = []
    beta_2_real_p = [0.99,0.999,0.9999]  # O parâmetro beta2 é traduzido de categórico para um dos três reais descritos no paper do Adam.
    
    for i in range(len(train_sets)):        
        sets_index = []
        for t in range(len(train_sets)):
            if t!=i:
                sets_index.append(t)

        train_X = np.concatenate(np.array(sets_X, dtype=object)[sets_index])
        validation_X = sets_X[i]

        train_Y_minist = np.concatenate(np.array(sets_Y_minist, dtype=object)[sets_index])
        train_Y_minist = np.array([np.array(t) for t in train_Y_minist])
        validation_Y_minist = sets_Y_minist[i]
        validation_Y_minist = np.array([np.array(v) for v in validation_Y_minist])

        model = HAN_model(len(Train_Y_minist[0]))
        opt = Adam(lr=parameters['learning_rate'], beta_1=parameters['beta1'], beta_2=beta_2_real_p[parameters['beta2']])
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
        history = model.fit(x=np.array(train_X), y=np.array(train_Y_minist), epochs=parameters['epochs'], validation_split=0.0, class_weight=classes_w_minist, batch_size=parameters['batch_size'], shuffle=False, verbose=0)
        validation_prediction = model.predict(np.array(validation_X))

        f1s+=f1_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average='macro')
        f1s_per_class.append(f1_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average=None))
        precision_per_class.append(precision_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average=None))
        recall_per_class.append(recall_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average=None))
    generate_validation_csv(f1s_per_class, precision_per_class, recall_per_class, "HAN_minist_v2", minist_classes_names)
    f1_loss=1-f1s/4
    print("Iteration "+str(iterations)+":\nlearning_rate: ", parameters['learning_rate'], " || beta1: ", parameters['beta1'], " || beta2: ", beta_2_real_p[parameters['beta2']], " || epochs: ", parameters['epochs'], " || batch_size: ", parameters['batch_size'], " || (1 - macro_F1): ", f1_loss)
    iterations+=1
    return f1_loss

os.chdir("hypermapper/")
stdout = sys.stdout
print(os.getcwd())
hypermapper.optimize("../optimization/HAN_scenario.json", optimize_HAN_minist)
os.chdir("/home/caiocampos/andre/radar_wisemap/")
sys.stdout = stdout

/home/caiocampos/andre/radar_wisemap/hypermapper
Design of experiment phase, number of doe samples = 20 .......




Iteration 0:
learning_rate:  0.061599517591355686  || beta1:  0.5106096601543278  || beta2:  0.9999  || epochs:  22  || batch_size:  42  || (1 - macro_F1):  1.0
Iteration 1:
learning_rate:  0.026570347917509094  || beta1:  0.6812362841760784  || beta2:  0.999  || epochs:  4  || batch_size:  45  || (1 - macro_F1):  0.5723655111136237
Iteration 2:
learning_rate:  0.09749756293549715  || beta1:  0.6620159003051941  || beta2:  0.99  || epochs:  12  || batch_size:  17  || (1 - macro_F1):  0.987342019035731
Iteration 3:
learning_rate:  0.03323013641474209  || beta1:  0.2641279302939091  || beta2:  0.9999  || epochs:  5  || batch_size:  30  || (1 - macro_F1):  0.9248318520750439
Iteration 4:
learning_rate:  0.027865443728216567  || beta1:  0.7524092317148126  || beta2:  0.99  || epochs:  19  || batch_size:  79  || (1 - macro_F1):  0.415843990611392
Iteration 5:
learning_rate:  0.09783954939500432  || beta1:  0.345983112285847  || beta2:  0.999  || epochs:  23  || batch_size:  34  || (1 - macr



Iteration 20:
learning_rate:  0.08662981691696064  || beta1:  0.65672438300679  || beta2:  0.9999  || epochs:  11  || batch_size:  50  || (1 - macro_F1):  1.0
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.08662981691696064,0.65672438300679,2,11,50,1.0,8569655

Starting optimization iteration 2
Iteration 21:
learning_rate:  0.005433563891625506  || beta1:  0.778660186261144  || beta2:  0.9999  || epochs:  11  || batch_size:  61  || (1 - macro_F1):  0.24094542102372973
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.005433563891625506,0.778660186261144,2,11,61,0.24094542102372973,8859850

Starting optimization iteration 3
Iteration 22:
learning_rate:  0.008381103015120234  || beta1:  0.4251389280472371  || beta2:  0.999  || epochs:  11  || batch_size:  57  || (1 - macro_F1):  0.24946354750518607
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.008381103015120234,0.4251389280472371,1,11,57,0.24946354750518607,9153180

Starting optimization ite



Iteration 25:
learning_rate:  8.969365398429023e-05  || beta1:  0.45217973681160784  || beta2:  0.999  || epochs:  3  || batch_size:  28  || (1 - macro_F1):  1.0
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
8.969365398429023e-05,0.45217973681160784,1,3,28,1.0,9861582

Starting optimization iteration 7
Iteration 26:
learning_rate:  0.0006670355079480684  || beta1:  0.9  || beta2:  0.9999  || epochs:  11  || batch_size:  66  || (1 - macro_F1):  0.3030072448946455
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.0006670355079480684,0.9,2,11,66,0.3030072448946455,10146154

Starting optimization iteration 8
Iteration 27:
learning_rate:  0.004618960314542974  || beta1:  0.255781418146325  || beta2:  0.999  || epochs:  11  || batch_size:  62  || (1 - macro_F1):  0.24953025434245757
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.004618960314542974,0.255781418146325,1,11,62,0.24953025434245757,10433955

Starting optimization iteration 9
Iteration 28



Iteration 29:
learning_rate:  0.010917205730557145  || beta1:  0.9  || beta2:  0.9999  || epochs:  3  || batch_size:  71  || (1 - macro_F1):  0.3532341735150063
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.010917205730557145,0.9,2,3,71,0.3532341735150063,10699201

Starting optimization iteration 11
Iteration 30:
learning_rate:  0.007233585249118387  || beta1:  0.9  || beta2:  0.9999  || epochs:  22  || batch_size:  64  || (1 - macro_F1):  0.26540456522822875
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.007233585249118387,0.9,2,22,64,0.26540456522822875,11228716

Starting optimization iteration 12
Iteration 31:
learning_rate:  0.0062180560936290034  || beta1:  0.9  || beta2:  0.99  || epochs:  11  || batch_size:  61  || (1 - macro_F1):  0.2626805227630168
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.0062180560936290034,0.9,0,11,61,0.2626805227630168,11517307

Starting optimization iteration 13
Iteration 32:
learning_rate:  0.00943367



Iteration 44:
learning_rate:  4.576726164470858e-05  || beta1:  0.9  || beta2:  0.999  || epochs:  29  || batch_size:  56  || (1 - macro_F1):  0.8756976319331538
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
4.576726164470858e-05,0.9,1,29,56,0.8756976319331538,16533806

Starting optimization iteration 26
Iteration 45:
learning_rate:  0.002501503699863985  || beta1:  0.6400557310921192  || beta2:  0.9999  || epochs:  3  || batch_size:  56  || (1 - macro_F1):  0.3050038083898542
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.002501503699863985,0.6400557310921192,2,3,56,0.3050038083898542,16646146

Starting optimization iteration 27
Iteration 46:
learning_rate:  0.010838044993777494  || beta1:  0.8846383503811964  || beta2:  0.9999  || epochs:  28  || batch_size:  60  || (1 - macro_F1):  0.26014692303000475
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.010838044993777494,0.8846383503811964,2,28,60,0.26014692303000475,17314301

Starting optim



Iteration 78:
learning_rate:  0.034888230852272564  || beta1:  0.5996127255906881  || beta2:  0.9999  || epochs:  20  || batch_size:  64  || (1 - macro_F1):  1.0
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.034888230852272564,0.5996127255906881,2,20,64,1.0,33012797

Starting optimization iteration 60
Iteration 79:
learning_rate:  0.0013003431703968867  || beta1:  0.42407623307251663  || beta2:  0.99  || epochs:  11  || batch_size:  94  || (1 - macro_F1):  0.29255636774174065
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.0013003431703968867,0.42407623307251663,0,11,94,0.29255636774174065,33284715

Starting optimization iteration 61
Iteration 80:
learning_rate:  0.0030512337386958705  || beta1:  0.9  || beta2:  0.99  || epochs:  20  || batch_size:  59  || (1 - macro_F1):  0.26686955915232646
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.0030512337386958705,0.9,0,20,59,0.26686955915232646,33777234

Starting optimization iteration 62
Iter



Iteration 87:
learning_rate:  0.029684467830507072  || beta1:  0.33018821068313015  || beta2:  0.9999  || epochs:  3  || batch_size:  100  || (1 - macro_F1):  0.855123296957921
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.029684467830507072,0.33018821068313015,2,3,100,0.855123296957921,36003164

Starting optimization iteration 69
Iteration 88:
learning_rate:  0.01071889744900273  || beta1:  0.7219145785873757  || beta2:  0.9999  || epochs:  29  || batch_size:  90  || (1 - macro_F1):  0.29445245454919555
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.01071889744900273,0.7219145785873757,2,29,90,0.29445245454919555,36650484

Starting optimization iteration 70
Iteration 89:
learning_rate:  0.006067389137019261  || beta1:  0.8168720133595471  || beta2:  0.999  || epochs:  3  || batch_size:  54  || (1 - macro_F1):  0.3031124294830416
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.006067389137019261,0.8168720133595471,1,3,54,0.303112429483041



Iteration 96:
learning_rate:  0.06437711313136385  || beta1:  0.9  || beta2:  0.999  || epochs:  29  || batch_size:  16  || (1 - macro_F1):  1.0
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.06437711313136385,0.9,1,29,16,1.0,41133982

Starting optimization iteration 78
Iteration 97:
learning_rate:  0.00023338506981298714  || beta1:  0.513240005077079  || beta2:  0.9999  || epochs:  29  || batch_size:  86  || (1 - macro_F1):  0.3310668325559112
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.00023338506981298714,0.513240005077079,2,29,86,0.3310668325559112,41786049

Starting optimization iteration 79




Iteration 98:
learning_rate:  0.020668523640199374  || beta1:  0.9  || beta2:  0.9999  || epochs:  26  || batch_size:  78  || (1 - macro_F1):  0.591029848467981
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.020668523640199374,0.9,2,26,78,0.591029848467981,42385231

Starting optimization iteration 80
Iteration 99:
learning_rate:  0.011060078644466382  || beta1:  0.23512768872592774  || beta2:  0.9999  || epochs:  3  || batch_size:  95  || (1 - macro_F1):  0.28678990215577094
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.011060078644466382,0.23512768872592774,2,3,95,0.28678990215577094,42491655

End of Random Scalarizations
### End of the hypermapper script.


In [30]:
optimization_results_minist = pd.read_csv("optimization/minist_han_validation_hypermapper_output_v2.csv")
best_parameters_idx_minist = optimization_results_minist["1 - F1"].idxmin()
parameters_minist = pd.DataFrame(optimization_results_minist.iloc[best_parameters_idx_minist]).transpose()
parameters_minist

Unnamed: 0,learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
21,0.005434,0.77866,2.0,11.0,61.0,0.240945,8859850.0


In [31]:
validation_results_minist = pd.read_csv("optimization/validation_results/HAN_minist_v2iteration_"+str(best_parameters_idx_minist)+".csv", index_col=0)
validation_results_minist

Unnamed: 0,Saúde,Relações Exteriores,Meio ambiente,"Educação, cultura e esporte",Justiça e Segurança,Trabalho e Previdência,Transporte/transporte de mercadorias,"Agricultura, pecuária e pesca",Ciência e tecnologia,Social,Presidência,Economia e planejamento,"Indústria, comércio, obras públicas, turismo"
validation set 1 f1,0.805556,0.951261,0.791667,0.820809,0.815217,0.689655,0.695652,0.8,0.747664,0.666667,0.533708,0.885737,0.74026
validation set 2 f1,0.676056,0.937815,0.777778,0.780488,0.808023,0.725664,0.779661,0.76,0.758065,0.787402,0.521212,0.877551,0.716867
validation set 3 f1,0.777778,0.939799,0.87234,0.814815,0.798883,0.768,0.675676,0.694737,0.735849,0.7,0.543807,0.861044,0.715152
validation set 4 f1,0.740741,0.956229,0.795918,0.814371,0.813953,0.730159,0.557377,0.763636,0.722222,0.692913,0.553672,0.872473,0.708861
validation sets mean f1,0.750033,0.946276,0.809426,0.807621,0.809019,0.728369,0.677091,0.754593,0.74095,0.711745,0.5381,0.874201,0.720285
validation std f1,0.048506,0.007705,0.036938,0.01587,0.006452,0.027748,0.07937,0.037933,0.013367,0.045411,0.012037,0.00895,0.011912
validation set 1 precision,0.90625,0.969178,0.808511,0.865854,0.837989,0.689655,0.705882,0.9,0.869565,0.691176,0.549133,0.903537,0.814286
validation set 2 precision,0.8,0.962069,0.897436,0.864865,0.849398,0.759259,0.821429,0.863636,0.770492,0.877193,0.585034,0.8944,0.748428
validation set 3 precision,0.848485,0.952542,0.87234,0.916667,0.841176,0.761905,0.657895,0.767442,0.847826,0.710145,0.604027,0.899329,0.771242
validation set 4 precision,0.789474,0.972603,0.8125,0.871795,0.903226,0.730159,0.566667,0.724138,0.764706,0.8,0.579882,0.886256,0.761905


## Final "referenda" evaluation on test data

In [27]:
beta_2_real_p = [0.99,0.999,0.9999]

Train_Y_minist = np.array([np.array(t) for t in Train_Y_minist])
Test_Y_minist = np.array([np.array(t) for t in Test_Y_minist])

model = HAN_model(len(Train_Y_minist[0]))

opt = Adam(lr = parameters_minist["learning_rate"].iloc[0], beta_1 = parameters_minist["beta1"].iloc[0], beta_2 = beta_2_real_p[int(parameters_minist["beta2"].iloc[0])])
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
history = model.fit(x=np.array(Train_X), y=np.array(Train_Y_minist), epochs=int(parameters_minist["epochs"].iloc[0]), validation_split=0.0, class_weight=classes_w_minist, batch_size=int(parameters_minist["batch_size"].iloc[0]), shuffle=False)
test_prediction = model.predict(np.array(Test_X))
print("Referenda - HAN tuned")
print(classification_report(y_true=np.array(Test_Y_minist), y_pred=predict_classes(test_prediction), digits=3, target_names=minist_classes_names))

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
Referenda - HAN tuned
                                              precision    recall  f1-score   support

                                       Saúde      0.659     0.643     0.651        42
                         Relações Exteriores      0.963     0.935     0.949       308
                               Meio ambiente      0.889     0.696     0.780        46
                 Educação, cultura e esporte      0.889     0.719     0.795        89
                         Justiça e Segurança      0.801     0.754     0.777       187
                      Trabalho e Previdência      0.719     0.695     0.707        59
        Transporte/transporte de mercadorias      0.722     0.867     0.788        30
               Agricultura, pecuária e pesca      0.846     0.611     0.710        54
                        Ciência e tecnologia      0.804     0.683     0.739     

  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
#Saving final model
model.save('models/HAN_minist_model_final')
pd.DataFrame(predict_classes(test_prediction)).to_csv("test_results/HAN_minist.csv", index=None, header=None)

INFO:tensorflow:Assets written to: models/HAN_minist_model_final/assets


In [35]:
for i in range (len(Test_Y_minist[0])+1):
    print(str(i)+"/"+str(len(Test_Y_minist[0]))+" classes:", acerto_absoluto(Test_Y_minist, test_prediction, i))

0/13 classes: 0.0
1/13 classes: 0.0
2/13 classes: 0.0
3/13 classes: 0.0
4/13 classes: 0.0
5/13 classes: 0.07457121551081282
6/13 classes: 0.07457121551081282
7/13 classes: 0.0
8/13 classes: 0.2982848620432513
9/13 classes: 0.5219985085756897
10/13 classes: 2.460850111856823
11/13 classes: 10.216256524981358
12/13 classes: 21.327367636092468
13/13 classes: 65.02609992542878


In [31]:
f1s_bootstrap = []
Test_X_array = np.array(Test_X)
Test_Y_array = np.array(Test_Y_minist)

for i in range(1000):
    print(i)
    bootstraped_test_x_index = np.random.choice(range(0,len(Test_X)), size=len(Test_X), replace=True)
    bootstraped_test_x = np.array([Test_X_array[j] for j in bootstraped_test_x_index])
    bootstraped_test_y = np.array([Test_Y_array[j] for j in bootstraped_test_x_index])
    test_prediction = model.predict(bootstraped_test_x)
    f1s_bootstrap.append(f1_score(y_true=np.array(bootstraped_test_y), y_pred=predict_classes(test_prediction), average='macro'))
f1s_bootstrap

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

[0.7535569949238192,
 0.755835500188393,
 0.7481494309729033,
 0.749235024049083,
 0.7538622090395753,
 0.7590434066814411,
 0.7558045958747882,
 0.7585466624916026,
 0.7489860697348474,
 0.7592917164911888,
 0.7438097342916463,
 0.7524325720285726,
 0.7385083521044803,
 0.7335889848301598,
 0.7772116088387035,
 0.7646975215748246,
 0.7321008029375116,
 0.7215871434824174,
 0.7444574700062662,
 0.7404383389096297,
 0.7600252793224386,
 0.730226655818043,
 0.7659975631917183,
 0.7569556921978386,
 0.7351977827054008,
 0.7341392159050214,
 0.748006490978531,
 0.7442362013895227,
 0.7592114717522389,
 0.7226346815412149,
 0.7446999852111242,
 0.7573149034222215,
 0.7415480722129904,
 0.7406342451536787,
 0.7674022154970284,
 0.7440724547784957,
 0.7416842587812505,
 0.7634625398738092,
 0.7496193643267136,
 0.7324258077848971,
 0.7599136708905003,
 0.7469368613669727,
 0.7432949181583728,
 0.7707679728803049,
 0.7299120652524264,
 0.7575447672713901,
 0.7206247676611154,
 0.75167454962286

In [51]:
from matplotlib import pyplot as plt
print(np.percentile(f1s_bootstrap, 2.5))
print(np.percentile(f1s_bootstrap, 97.5), "\n")
print(np.array(f1s_bootstrap).mean() - np.array(f1s_bootstrap).std(ddof=1)*1.96)
print(np.array(f1s_bootstrap).mean() + np.array(f1s_bootstrap).std(ddof=1)*1.96)

0.7248777801183908
0.7771780962013661 

0.7266743015756976
0.7774996621403694


In [42]:
print(np.array(f1s_bootstrap).mean(), "+-", np.array(f1s_bootstrap).std()*1.96)

0.7520869818580335 +- 0.025399970764020383


## Visualization of HAN attetion weights

In [24]:
from IPython.core.display import HTML
from keras.models import Sequential

# Creating a reverse dictionary
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Replicates the Attention mechanism and returns the weight coefficient for a given word or sentence.
def wordAttentionWeights(sequenceSentence,weights):
    """
    The same function as the AttentionLayer class.
    """
    uit = np.dot(sequenceSentence, weights[0]) + weights[1]
    uit = np.tanh(uit)

    ait = np.dot(uit, weights[2])
    ait = np.squeeze(ait)
    ait = np.exp(ait)
    ait /= np.sum(ait)
    
    return ait

def from_y_to_class_names(indices, classes_names):
    names = ""
    for i, j in enumerate(indices):
        if(j == 1):
            names+=classes_names[i]+" | "
    return names

In [21]:
test_n = 1261 # Selects the number of the document in the test set

document_test = np.array(list(map(sequence_to_text, Test_X[test_n])))

# Word submodel
hidden_word_encoding_out = Model(inputs=model.get_layer("sent_linking").layer.input, outputs=model.get_layer("sent_linking").layer.get_layer("word_gru").output)
word_context = model.get_layer("sent_linking").layer.get_layer("attention_with_context").get_weights()  #weight, biass, u

# Sentence submodel
hidden_sentence_encoding_out = Sequential()
for layer in model.layers[:-2]:
    hidden_sentence_encoding_out.add(layer)
# print(hidden_sentence_encoding_out.summary())
sentence_context = model.get_layer("attention_with_context_1").get_weights()

hidden_sentence_encodings = hidden_sentence_encoding_out.predict(np.array([Test_X[test_n]]))
ait_sentence = wordAttentionWeights(hidden_sentence_encodings,sentence_context)

# Visualization
not_none_index = []
not_none_sentences_index = []
full_phrases = []
full_ait_words = []
for j in range(len(ait_sentence)):
    hidden_word_encodings = hidden_word_encoding_out.predict(np.array([Test_X[test_n][j]]))
    ait_word = wordAttentionWeights(hidden_word_encodings,word_context)
    if(np.sum(document_test[j] == None) != len(document_test[j])):  # checks if it's not a full none sentence.
        not_none_index = [i for i in range(len(document_test[j])) if document_test[j][i] != None]
        ait_word*=np.sqrt(ait_sentence[j])
        ait_word = ait_word[not_none_index]  
        phrase = document_test[j][not_none_index]

        full_phrases.append(phrase)
        full_ait_words.append(ait_word)

        not_none_sentences_index.append(j)

full_ait_words_flatten = []
for i in full_ait_words:
    for j in i:
        full_ait_words_flatten.append(j)
full_ait_words_flatten = np.array(full_ait_words_flatten)
important_indexes = full_ait_words_flatten.argsort()[::-1]

most_important_sentence = ait_sentence[not_none_sentences_index].argsort()[::-1][0]

show_important_words = 40   # first n important words to show

full_document = ""
counter = 0
counter_phrase = 0
for i in full_phrases:
    if(counter_phrase == most_important_sentence):
        full_document+=("<span style='background: rgba(255,255,0,0.5)'>")

    for j in i:
        if(counter not in important_indexes[:show_important_words]):
            full_document+=(" "+j)
        else:
            transparency = 1-((1/show_important_words)*0.8)*np.where(important_indexes==counter)[0][0]
            full_document+=(" <span style='background: rgba(100,200,100,"+str(transparency)+")'>"+j+"</span>")
        counter+=1
    full_document+="."

    if(counter_phrase == most_important_sentence):
        full_document+=("</span>")
    counter_phrase+=1

print("Decreto de índice "+str(test_n)+" do dataset de testes:\n")
print("Expected Classification:",from_y_to_class_names(Test_Y_sen[test_n], sen_classes_names))
print("Predicted Classification:",from_y_to_class_names(predict_classes(test_prediction)[test_n], sen_classes_names),"\n")

display(HTML(full_document))

Decreto de índice 1261 do dataset de testes:

Expected Classification: Saúde | 
Predicted Classification: Saúde |  

