In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=6144)])
logical_gpus = tf.config.experimental.list_logical_devices('GPU')

In [2]:
import sys
import os

SEED = 42
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['HOROVOD_FUSION_THRESHOLD']='0'
os.environ['PYTHONHASHSEED']=str(SEED)

import tfdeterminism
import random
import numpy as np
import tensorflow as tf
import skmultilearn
import pandas as pd
from sklearn.metrics import classification_report, f1_score
import numpy as np
from sklearn import model_selection

from keras.preprocessing.sequence import pad_sequences

In [3]:
print ("Tensorflow version:", tf.__version__)
print ("Numpy version:", np.__version__)
print ("tfdeterminism version", tfdeterminism.__version__)
print ("skmultilearn version", "0.2.0")

Tensorflow version: 2.2.0
Numpy version: 1.19.2
tfdeterminism version 0.3.0
skmultilearn version 0.2.0


In [4]:
# !git clone https://github.com/luinardi/hypermapper.git
test_set = pd.read_csv('sets/Set 0.csv')
set_1 = pd.read_csv('sets/Set 1.csv')
set_2 = pd.read_csv('sets/Set 2.csv')
set_3 = pd.read_csv('sets/Set 3.csv')
set_4 = pd.read_csv('sets/Set 4.csv')


def string_to_array_serie(serie):
    return [[int(i) for i in t.replace("[", "").replace("]", "").split()] for t in serie]

test_set["Senado"] = string_to_array_serie(test_set["Senado"])
set_1["Senado"] = string_to_array_serie(set_1["Senado"])
set_2["Senado"] = string_to_array_serie(set_2["Senado"])
set_3["Senado"] = string_to_array_serie(set_3["Senado"])
set_4["Senado"] = string_to_array_serie(set_4["Senado"])

test_set["Referenda"] = string_to_array_serie(test_set["Referenda"])
set_1["Referenda"] = string_to_array_serie(set_1["Referenda"])
set_2["Referenda"] = string_to_array_serie(set_2["Referenda"])
set_3["Referenda"] = string_to_array_serie(set_3["Referenda"])
set_4["Referenda"] = string_to_array_serie(set_4["Referenda"])

In [5]:
train_sets = [set_1, set_2, set_3, set_4]

full_train = pd.concat(train_sets)
full_train

Unnamed: 0,Numero ato,Ementa + texto completo,Senado,Referenda
0,10095,dispõe sobre o comitê consultivo de nanotecnol...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,10092,promulga o protocolo de integração educativa e...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,10085,dispõe sobre o programa forças no esporte segu...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]"
3,10081,altera o decreto n 8713 de 15 de abril de 2016...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]"
4,10083,autoriza o emprego das forças armadas na garan...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
1323,3336,dá nova redação aos arts 11 15 16 19 e 30 do...,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
1324,3330,dispõe sobre a redução do consumo de energia e...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1325,3338,aprova a estrutura regimental e o quadro demon...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]"
1326,3328,altera o decreto 2889 de 21 12 1998 que dispõe...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [6]:
from keras.preprocessing.text import Tokenizer

# The maximum number of words to be used. (most frequent)
# Max number of words in each complaint. Based on 90% percentile.
MAX_SEQUENCE_LENGTH = 1742

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, oov_token="OOV")
tokenizer.fit_on_texts(full_train["Ementa + texto completo"])
word_index = tokenizer.word_index
MAX_NB_WORDS = len(word_index)+1
print('Found %s unique tokens.' % len(word_index))

Train_X = tokenizer.texts_to_sequences(full_train["Ementa + texto completo"])
Test_X = tokenizer.texts_to_sequences(test_set["Ementa + texto completo"])
Train_X = pad_sequences(Train_X, maxlen=MAX_SEQUENCE_LENGTH)
Test_X = pad_sequences(Test_X, maxlen=MAX_SEQUENCE_LENGTH)

# The following sets will be used for validation.
set_1_X = tokenizer.texts_to_sequences(set_1["Ementa + texto completo"])
set_2_X = tokenizer.texts_to_sequences(set_2["Ementa + texto completo"])
set_3_X = tokenizer.texts_to_sequences(set_3["Ementa + texto completo"])
set_4_X = tokenizer.texts_to_sequences(set_4["Ementa + texto completo"])

set_1_X = pad_sequences(set_1_X, maxlen=MAX_SEQUENCE_LENGTH)
set_2_X = pad_sequences(set_2_X, maxlen=MAX_SEQUENCE_LENGTH)
set_3_X = pad_sequences(set_3_X, maxlen=MAX_SEQUENCE_LENGTH)
set_4_X = pad_sequences(set_4_X, maxlen=MAX_SEQUENCE_LENGTH)

sets_X = [set_1_X, set_2_X, set_3_X, set_4_X]

Train_Y_sen = full_train["Senado"].values
Test_Y_sen = test_set["Senado"].values
Train_Y_minist = full_train["Referenda"].values
Test_Y_minist = test_set["Referenda"].values

Found 36568 unique tokens.


In [7]:
#defining class_weights
def get_class_weights(n_classes, labels):
    counters = np.zeros(n_classes)
    for label in labels:
        for i in range(len(label)):
            if(label[i] == 1):
                counters[i]+=1
    greater_class = counters[np.argmax(counters)]
    weights = []
    for i in range(n_classes):
        weights.append(greater_class/counters[i])
    return weights

classes_w_sen = get_class_weights(len(Train_Y_sen[0]), Train_Y_sen)
classes_w_sen = {v: k for v, k in enumerate(classes_w_sen)}
classes_w_minist = get_class_weights(len(Train_Y_minist[0]), Train_Y_minist)
classes_w_minist = {v: k for v, k in enumerate(classes_w_minist)}

In [8]:
set_1_Y_sen = set_1["Senado"].values
set_2_Y_sen = set_2["Senado"].values
set_3_Y_sen = set_3["Senado"].values
set_4_Y_sen = set_4["Senado"].values
sets_Y_sen = [set_1_Y_sen, set_2_Y_sen, set_3_Y_sen, set_4_Y_sen]

set_1_Y_minist = set_1["Referenda"].values
set_2_Y_minist = set_2["Referenda"].values
set_3_Y_minist = set_3["Referenda"].values
set_4_Y_minist = set_4["Referenda"].values
sets_Y_minist = [set_1_Y_minist, set_2_Y_minist, set_3_Y_minist, set_4_Y_minist]

In [9]:
from keras.layers import Embedding, GlobalMaxPooling1D, Conv1D, Dense 
from keras import backend as K
from keras import optimizers
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.np_utils import to_categorical

In [10]:
#pre-trained Glove embedding
embeddings_index = {}
f = open('glove_s100.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print(word, "could not be embedded.")
f.close()

print('\nFound %s word vectors.\n' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index.items())+1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        try:
            embedding_matrix[i] = embedding_vector
        except:
            print(word, "could not be indexed.")

r$ could not be embedded.
00 could not be embedded.
三藏法師玄奘奉 could not be embedded.
r$ could not be embedded.

Found 929594 word vectors.

0 could not be indexed.
00 could not be indexed.


In [11]:
#custom function for training
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [12]:
from sklearn.metrics import classification_report
def predict_classes(probs, tresh=0.5, consider_first=False):
    predicted_classes = []
    for prob in probs:
        predicted_class = np.zeros(len(probs[0])).astype(int)
        for i in range(len(prob)):
            if(prob[i]>=tresh):
                predicted_class[i] = 1
        if np.sum(predicted_class) == 0 and consider_first:
            position_1 = (np.argsort(prob)[::-1])[0]
            predicted_class[position_1] = 1
        predicted_classes.append(predicted_class.tolist())
    
    return predicted_classes

In [13]:
def acerto_absoluto(test_y_sen, test_pred, number_labels):
    acerto_100 = 0
    test_prediction_binary = predict_classes(test_pred)
    for i in range(len(test_prediction_binary)):
        acertos_internos = 0
        for j in range(len(test_prediction_binary[i])):
            if test_y_sen[i][j] == test_prediction_binary[i][j]:
                acertos_internos+=1
        if acertos_internos == number_labels:
            acerto_100+=1
    return acerto_100/len(test_prediction_binary)*100

In [14]:
from sklearn.metrics import f1_score, precision_score, recall_score
from keras.optimizers import Adam

def CNN_model(number_of_classes):
    K.clear_session()

    os.environ['PYTHONHASHSEED']=str(SEED)
    np.random.seed(SEED)
    random.seed(SEED)
    
    tf.random.set_seed(SEED)
    os.environ['TF_CUDNN_DETERMINISM']='1'
    os.environ['TF_DETERMINISTIC_OPS']='1'
    os.environ['HOROVOD_FUSION_THRESHOLD']='0'
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
    tf.compat.v1.keras.backend.set_session(sess)
    
    modelCNN = Sequential()
    modelCNN.add(Embedding(MAX_NB_WORDS, 100, weights=[embedding_matrix], input_shape=(Train_X.shape[1], ) ))
    modelCNN.add(Conv1D(256, 4, padding='valid', strides=1))    #activation='relu',
    modelCNN.add(GlobalMaxPooling1D())

    modelCNN.add(Dense(number_of_classes, activation='sigmoid', input_shape=(Train_X.shape[1], )))
    return modelCNN

## Hyperparameter optimization with bayesian optimization and Cross-Validation

In [15]:
sen_classes_names = ["Saúde", "Relações Exteriores", "Meio ambiente", "Educação, cultura e esporte", "Segurança Pública e Defesa", "Trabalho e Previdência", "Agricultura, pecuária e pesca", "Ciência, tecnologia e comunicações", "Social", "Indústria, comércio, turismo, transporte/transporte de mercadorias", "Economia, planejamento e sistema financeiro", "Assuntos internos,  cargos e comissões, Estado", "Tributos", "Minas e Energia", "Justiça e direitos"]
minist_classes_names = ["Saúde", "Relações Exteriores", "Meio ambiente", "Educação, cultura e esporte", "Justiça e Segurança", "Trabalho e Previdência", "Transporte/transporte de mercadorias", "Agricultura, pecuária e pesca", "Ciência e tecnologia", "Social", "Presidência", "Economia e planejamento", "Indústria, comércio, obras públicas, turismo"]
stdout = sys.stdout

In [16]:
def generate_validation_csv(f1s_per_class, precision_per_class, recall_per_class, prefix, names):
    final_metrics = (f1s_per_class, np.array([np.mean(f1s_per_class, axis=0)]), np.array([np.std(f1s_per_class, axis=0, ddof=1)]),
                     precision_per_class, np.array([np.mean(precision_per_class, axis=0)]), np.array([np.std(precision_per_class, axis=0, ddof=1)]),
                     recall_per_class, np.array([np.mean(recall_per_class, axis=0)]), np.array([np.std(recall_per_class, axis=0, ddof=1)]) )
    f1s_per_class_pd = pd.DataFrame(np.concatenate(final_metrics, axis=0))
    f1s_per_class_pd.columns = names
    f1s_per_class_pd.index=["validation set 1 f1", "validation set 2 f1", "validation set 3 f1", "validation set 4 f1", "validation sets mean f1", "validation std f1",
                            "validation set 1 precision", "validation set 2 precision", "validation set 3 precision", "validation set 4 precision", "validation sets mean precision", "validation std precision",
                            "validation set 1 recall", "validation set 2 recall", "validation set 3 recall", "validation set 4 recall", "validation sets mean recall", "validation std recall"]
    f1s_per_class_pd.to_csv("../optimization/validation_results/"+prefix+"iteration_"+str(iterations)+".csv", index=True, header=True)

## Senado - otimização

In [48]:
import os
import sys
os.chdir("/home/caiocampos/andre/radar_wisemap")
sys.path.append('hypermapper/scripts/')
import hypermapper

iterations = 0

def optimize_CNN(parameters):
    global iterations
    f1s = 0
    f1s_per_class = []
    precision_per_class = []
    recall_per_class = []
    beta_2_real_p = [0.99,0.999,0.9999]  # O parâmetro beta2 é traduzido de categórico para um dos três reais descritos no paper do Adam.
    
    for i in range(len(train_sets)):        
        sets_index = []
        for t in range(len(train_sets)):
            if t!=i:
                sets_index.append(t)

        train_X = np.concatenate(np.array(sets_X, dtype=object)[sets_index])
        validation_X = sets_X[i]

        train_Y_sen = np.concatenate(np.array(sets_Y_sen, dtype=object)[sets_index])
        train_Y_sen = np.array([np.array(t) for t in train_Y_sen])
        validation_Y_sen = sets_Y_sen[i]
        validation_Y_sen = np.array([np.array(v) for v in validation_Y_sen])

        model = CNN_model(len(Train_Y_sen[0]))
        opt = Adam(lr=parameters['learning_rate'], beta_1=parameters['beta1'], beta_2=beta_2_real_p[parameters['beta2']])
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
        history = model.fit(x=np.array(train_X), y=np.array(train_Y_sen), epochs=parameters['epochs'], validation_split=0.0, class_weight=classes_w_sen, batch_size=parameters['batch_size'], shuffle=False, verbose=0)
        validation_prediction = model.predict(np.array(validation_X))

        f1s+=f1_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average='macro')
        f1s_per_class.append(f1_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average=None))
        precision_per_class.append(precision_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average=None))
        recall_per_class.append(recall_score(y_true=validation_Y_sen, y_pred=predict_classes(validation_prediction), average=None))
    generate_validation_csv(f1s_per_class, precision_per_class, recall_per_class, "CNN_sen_", sen_classes_names)
    f1_loss=1-f1s/4
    print("Iteration "+str(iterations)+":\nlearning_rate: ", parameters['learning_rate'], " || beta1: ", parameters['beta1'], " || beta2: ", beta_2_real_p[parameters['beta2']], " || epochs: ", parameters['epochs'], " || batch_size: ", parameters['batch_size'], " || (1 - macro_F1): ", f1_loss)
    iterations+=1
    return f1_loss

os.chdir("hypermapper/")
stdout = sys.stdout
print(os.getcwd())
hypermapper.optimize("../optimization/HAN_scenario.json", optimize_CNN)
os.chdir("/home/caiocampos/andre/radar_wisemap/")
sys.stdout = stdout

/home/caiocampos/andre/radar_wisemap/hypermapper
Design of experiment phase, number of doe samples = 20 .......
Iteration 0:
learning_rate:  0.010821850820462157  || beta1:  0.4163350445266138  || beta2:  0.9999  || epochs:  26  || batch_size:  60  || (1 - macro_F1):  0.3014176354276874
Iteration 1:
learning_rate:  0.05929761628328129  || beta1:  0.6676481704907524  || beta2:  0.9999  || epochs:  24  || batch_size:  40  || (1 - macro_F1):  0.35716884271778837
Iteration 2:
learning_rate:  0.07144366460330498  || beta1:  0.8630036628544978  || beta2:  0.9999  || epochs:  13  || batch_size:  87  || (1 - macro_F1):  0.3862791445049355
Iteration 3:
learning_rate:  0.07021151133728837  || beta1:  0.4408026454652195  || beta2:  0.999  || epochs:  13  || batch_size:  96  || (1 - macro_F1):  0.39576656853196224




Iteration 4:
learning_rate:  0.06205409811911996  || beta1:  0.2522084509659347  || beta2:  0.999  || epochs:  19  || batch_size:  72  || (1 - macro_F1):  0.3667628551645623
Iteration 5:
learning_rate:  0.03291228173711074  || beta1:  0.3498055796988398  || beta2:  0.999  || epochs:  24  || batch_size:  69  || (1 - macro_F1):  0.34234761116240786
Iteration 6:
learning_rate:  0.05306306887607827  || beta1:  0.11338624999980179  || beta2:  0.9999  || epochs:  25  || batch_size:  70  || (1 - macro_F1):  0.3361796947501915
Iteration 7:
learning_rate:  0.002315257340789781  || beta1:  0.5463214745303601  || beta2:  0.99  || epochs:  11  || batch_size:  55  || (1 - macro_F1):  0.28675756726237467
Iteration 8:
learning_rate:  0.09635990804263393  || beta1:  0.09220223607071937  || beta2:  0.9999  || epochs:  25  || batch_size:  31  || (1 - macro_F1):  0.36362032384568455
Iteration 9:
learning_rate:  0.08699346701801745  || beta1:  0.611455400645749  || beta2:  0.999  || epochs:  5  || batch_s



Iteration 22:
learning_rate:  4.576726164470858e-05  || beta1:  0.07144991482257843  || beta2:  0.99  || epochs:  14  || batch_size:  57  || (1 - macro_F1):  0.9123502582870118
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
4.576726164470858e-05,0.07144991482257843,0,14,57,0.9123502582870118,2556190

Starting optimization iteration 4
Iteration 23:
learning_rate:  0.04594314893742181  || beta1:  0.008812215011213863  || beta2:  0.99  || epochs:  11  || batch_size:  55  || (1 - macro_F1):  0.40584758071796967
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.04594314893742181,0.008812215011213863,0,11,55,0.40584758071796967,2630728

Starting optimization iteration 5
Iteration 24:
learning_rate:  0.021731531906274085  || beta1:  0.6790398230366149  || beta2:  0.99  || epochs:  11  || batch_size:  55  || (1 - macro_F1):  0.35027699060135264
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.021731531906274085,0.6790398230366149,0,11,55,0.3502769906013



Iteration 28:
learning_rate:  4.102427215651248e-05  || beta1:  0.5138020746211978  || beta2:  0.9999  || epochs:  27  || batch_size:  16  || (1 - macro_F1):  0.5959510188715047
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
4.102427215651248e-05,0.5138020746211978,2,27,16,0.5959510188715047,3330861

Starting optimization iteration 10
Iteration 29:
learning_rate:  0.08074634856021355  || beta1:  0.4700588224386643  || beta2:  0.99  || epochs:  11  || batch_size:  48  || (1 - macro_F1):  0.3986938543852563
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.08074634856021355,0.4700588224386643,0,11,48,0.3986938543852563,3408252

Starting optimization iteration 11
Iteration 30:
learning_rate:  0.0021123354116224175  || beta1:  0.5781090647274086  || beta2:  0.999  || epochs:  11  || batch_size:  67  || (1 - macro_F1):  0.29335354856196216
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.0021123354116224175,0.5781090647274086,1,11,67,0.29335354856196



Iteration 38:
learning_rate:  0.00026067590815932794  || beta1:  0.8485274239748573  || beta2:  0.999  || epochs:  6  || batch_size:  67  || (1 - macro_F1):  0.7405388492035929
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.00026067590815932794,0.8485274239748573,1,6,67,0.7405388492035929,4250645

Starting optimization iteration 20
Iteration 39:
learning_rate:  0.002315257340789781  || beta1:  0.7808162580389951  || beta2:  0.99  || epochs:  11  || batch_size:  55  || (1 - macro_F1):  0.3028871764034349
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.002315257340789781,0.7808162580389951,0,11,55,0.3028871764034349,4324115

Starting optimization iteration 21
Iteration 40:
learning_rate:  0.004595136647156551  || beta1:  0.6112127135974088  || beta2:  0.999  || epochs:  12  || batch_size:  67  || (1 - macro_F1):  0.28362435705598266
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.004595136647156551,0.6112127135974088,1,12,67,0.283624357055982

learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.01906802619156001,0.7423588834270647,0,17,59,0.3244797343614092,8581497

Starting optimization iteration 67
Iteration 86:
learning_rate:  0.003280463775228364  || beta1:  0.3050769280598608  || beta2:  0.999  || epochs:  23  || batch_size:  59  || (1 - macro_F1):  0.29014542370709984
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.003280463775228364,0.3050769280598608,1,23,59,0.29014542370709984,8721724

Starting optimization iteration 68
Iteration 87:
learning_rate:  0.02988682676410231  || beta1:  0.9  || beta2:  0.999  || epochs:  23  || batch_size:  59  || (1 - macro_F1):  0.32666209851080574
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.02988682676410231,0.9,1,23,59,0.32666209851080574,8862630

Starting optimization iteration 69
Iteration 88:
learning_rate:  0.0031521278441019713  || beta1:  0.8856304247869142  || beta2:  0.999  || epochs:  29  || batch_size:  59  || (1 - macro_F1):  0.2



Iteration 98:
learning_rate:  0.019770342406807095  || beta1:  0.9  || beta2:  0.999  || epochs:  3  || batch_size:  60  || (1 - macro_F1):  0.3805687244384577
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.019770342406807095,0.9,1,3,60,0.3805687244384577,10205915

Starting optimization iteration 80
Iteration 99:
learning_rate:  0.00662698329861312  || beta1:  0.10258163889986889  || beta2:  0.999  || epochs:  12  || batch_size:  67  || (1 - macro_F1):  0.29092637322150416
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.00662698329861312,0.10258163889986889,1,12,67,0.29092637322150416,10279788

End of Random Scalarizations
### End of the hypermapper script.


In [16]:
optimization_results = pd.read_csv("optimization/sen_cnn_validation_hypermapper_output.csv")
best_parameters_idx = optimization_results["1 - F1"].idxmin()
parameters = pd.DataFrame(optimization_results.iloc[best_parameters_idx]).transpose()
parameters

Unnamed: 0,learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
68,0.00839,0.256272,1.0,11.0,55.0,0.274454,7111858.0


In [17]:
validation_results_sen = pd.read_csv("optimization/validation_results/CNN_sen_iteration_"+str(best_parameters_idx)+".csv", index_col=0)
validation_results_sen

Unnamed: 0,Saúde,Relações Exteriores,Meio ambiente,"Educação, cultura e esporte",Segurança Pública e Defesa,Trabalho e Previdência,"Agricultura, pecuária e pesca","Ciência, tecnologia e comunicações",Social,"Indústria, comércio, turismo, transporte/transporte de mercadorias","Economia, planejamento e sistema financeiro","Assuntos internos, cargos e comissões, Estado",Tributos,Minas e Energia,Justiça e direitos
validation set 1 f1,0.638298,0.957895,0.707692,0.783505,0.829268,0.8,0.683544,0.655172,0.621469,0.565657,0.732143,0.83592,0.866242,0.732394,0.393443
validation set 2 f1,0.536585,0.961268,0.724138,0.727273,0.857143,0.851064,0.666667,0.701754,0.715026,0.592593,0.692641,0.828375,0.864198,0.820513,0.514286
validation set 3 f1,0.4,0.964912,0.676056,0.680851,0.788177,0.916667,0.75,0.709677,0.618785,0.55914,0.737288,0.862486,0.881988,0.849315,0.507463
validation set 4 f1,0.666667,0.954225,0.760563,0.744681,0.824121,0.716981,0.641975,0.62069,0.702222,0.486486,0.710204,0.824053,0.879518,0.853333,0.38806
validation sets mean f1,0.560387,0.959575,0.717112,0.734077,0.824677,0.821178,0.685547,0.671823,0.664375,0.550969,0.718069,0.837709,0.872986,0.813889,0.450813
validation std f1,0.120632,0.004575,0.035174,0.042563,0.028329,0.084295,0.046235,0.041716,0.051372,0.045362,0.020623,0.017231,0.009063,0.056262,0.069444
validation set 1 precision,0.714286,0.98556,0.766667,0.808511,0.858586,0.857143,0.84375,0.863636,0.733333,0.717949,0.953488,0.90625,0.971429,0.866667,0.857143
validation set 2 precision,0.611111,0.99635,0.84,0.864865,0.887755,0.952381,0.923077,0.869565,0.775281,0.695652,0.860215,0.94026,0.921053,0.820513,0.818182
validation set 3 precision,0.615385,0.996377,0.727273,0.727273,0.833333,0.956522,0.9375,0.785714,0.788732,0.928571,0.878788,0.926714,0.946667,0.911765,0.85
validation set 4 precision,0.73913,0.985455,0.75,0.853659,0.891304,0.655172,0.764706,0.818182,0.637097,0.5625,0.798165,0.929648,0.960526,0.914286,0.684211


## Final senate evaluation on test data

In [19]:
beta_2_real_p = [0.99,0.999,0.9999]

Train_Y = np.array([np.array(t) for t in Train_Y_sen])
Test_Y = np.array([np.array(t) for t in Test_Y_sen])

model = CNN_model(len(Train_Y_sen[0]))

opt = Adam(lr = parameters["learning_rate"].iloc[0], beta_1 = parameters["beta1"].iloc[0], beta_2 = beta_2_real_p[int(parameters["beta2"].iloc[0])])
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
history = model.fit(x=np.array(Train_X), y=np.array(Train_Y), epochs=int(parameters["epochs"].iloc[0]), validation_split=0.0, class_weight=classes_w_sen, batch_size=int(parameters["batch_size"].iloc[0]), shuffle=False)
test_prediction = model.predict(np.array(Test_X))
print("Senado - CNN tuned")
print(classification_report(y_true=np.array(Test_Y), y_pred=predict_classes(test_prediction), digits=3, target_names=sen_classes_names))

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
Senado - CNN tuned
                                                                    precision    recall  f1-score   support

                                                             Saúde      0.762     0.533     0.627        30
                                               Relações Exteriores      0.986     0.959     0.972       294
                                                     Meio ambiente      0.889     0.706     0.787        34
                                       Educação, cultura e esporte      0.854     0.686     0.761        51
                                        Segurança Pública e Defesa      0.928     0.865     0.896       104
                                            Trabalho e Previdência      0.821     0.793     0.807        29
                                     Agricultura, pecuária e pesca      0.933     0.636     0.757    

In [20]:
#Saving final model
model.save('models/CNN_sen_model_final')
pd.DataFrame(predict_classes(test_prediction)).to_csv("test_results/CNN_sen.csv", index=None, header=None)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: models/CNN_sen_model_final/assets


In [21]:
for i in range (len(Test_Y_sen[0])+1):
    print(str(i)+"/"+str(len(Test_Y_sen[0]))+" classes:", acerto_absoluto(Test_Y_sen, test_prediction, i))

0/15 classes: 0.0
1/15 classes: 0.0
2/15 classes: 0.0
3/15 classes: 0.0
4/15 classes: 0.0
5/15 classes: 0.0
6/15 classes: 0.0
7/15 classes: 0.0
8/15 classes: 0.0
9/15 classes: 0.0
10/15 classes: 0.0
11/15 classes: 0.0
12/15 classes: 0.8948545861297539
13/15 classes: 6.86055182699478
14/15 classes: 18.94108873974646
15/15 classes: 73.303504847129


## Referenda - Otimização

In [17]:
import os
import sys
os.chdir("/home/caiocampos/andre/radar_wisemap")
sys.path.append('hypermapper/scripts/')
import hypermapper

iterations = 0

def optimize_CNN_minist(parameters):
    global iterations
    f1s = 0
    f1s_per_class = []
    precision_per_class = []
    recall_per_class = []
    beta_2_real_p = [0.99,0.999,0.9999]  # O parâmetro beta2 é traduzido de categórico para um dos três reais descritos no paper do Adam.
    
    for i in range(len(train_sets)):        
        sets_index = []
        for t in range(len(train_sets)):
            if t!=i:
                sets_index.append(t)

        train_X = np.concatenate(np.array(sets_X, dtype=object)[sets_index])
        validation_X = sets_X[i]

        train_Y_minist = np.concatenate(np.array(sets_Y_minist, dtype=object)[sets_index])
        train_Y_minist = np.array([np.array(t) for t in train_Y_minist])
        validation_Y_minist = sets_Y_minist[i]
        validation_Y_minist = np.array([np.array(v) for v in validation_Y_minist])

        model = CNN_model(len(Train_Y_minist[0]))
        opt = Adam(lr=parameters['learning_rate'], beta_1=parameters['beta1'], beta_2=beta_2_real_p[parameters['beta2']])
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
        history = model.fit(x=np.array(train_X), y=np.array(train_Y_minist), epochs=parameters['epochs'], validation_split=0.0, class_weight=classes_w_sen, batch_size=parameters['batch_size'], shuffle=False, verbose=0)
        validation_prediction = model.predict(np.array(validation_X))

        f1s+=f1_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average='macro')
        f1s_per_class.append(f1_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average=None))
        precision_per_class.append(precision_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average=None))
        recall_per_class.append(recall_score(y_true=validation_Y_minist, y_pred=predict_classes(validation_prediction), average=None))
    generate_validation_csv(f1s_per_class, precision_per_class, recall_per_class, "CNN_minist_", minist_classes_names)
    f1_loss=1-f1s/4
    print("Iteration "+str(iterations)+":\nlearning_rate: ", parameters['learning_rate'], " || beta1: ", parameters['beta1'], " || beta2: ", beta_2_real_p[parameters['beta2']], " || epochs: ", parameters['epochs'], " || batch_size: ", parameters['batch_size'], " || (1 - macro_F1): ", f1_loss)
    iterations+=1
    return f1_loss

os.chdir("hypermapper/")
stdout = sys.stdout
print(os.getcwd())
np.random.seed(SEED)
hypermapper.optimize("../optimization/HAN_scenario.json", optimize_CNN_minist)
os.chdir("/home/caiocampos/andre/radar_wisemap/")
sys.stdout = stdout

/home/caiocampos/andre/radar_wisemap/hypermapper
Design of experiment phase, number of doe samples = 20 .......
Iteration 0:
learning_rate:  0.09278182926485977  || beta1:  0.24833986126458923  || beta2:  0.9999  || epochs:  12  || batch_size:  72  || (1 - macro_F1):  0.5921386085485915




Iteration 1:
learning_rate:  0.022357979570786328  || beta1:  0.013972568583595779  || beta2:  0.999  || epochs:  22  || batch_size:  71  || (1 - macro_F1):  0.35286634266987493
Iteration 2:
learning_rate:  0.04313564035421596  || beta1:  0.2152742942502234  || beta2:  0.9999  || epochs:  17  || batch_size:  27  || (1 - macro_F1):  0.5024047235143742
Iteration 3:
learning_rate:  0.07544250836061345  || beta1:  0.4213387686846669  || beta2:  0.99  || epochs:  11  || batch_size:  49  || (1 - macro_F1):  0.49380541320372817
Iteration 4:
learning_rate:  0.023440346111376964  || beta1:  0.2621771058151551  || beta2:  0.99  || epochs:  11  || batch_size:  27  || (1 - macro_F1):  0.45094467910161584
Iteration 5:
learning_rate:  0.00558075747037758  || beta1:  0.05527036236172814  || beta2:  0.999  || epochs:  25  || batch_size:  52  || (1 - macro_F1):  0.2821799844116415
Iteration 6:
learning_rate:  0.07743990549741497  || beta1:  0.8740694835703358  || beta2:  0.9999  || epochs:  6  || batch



Iteration 21:
learning_rate:  4.576726164470858e-05  || beta1:  0.9  || beta2:  0.999  || epochs:  13  || batch_size:  56  || (1 - macro_F1):  0.891068611466175
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
4.576726164470858e-05,0.9,1,13,56,0.891068611466175,3441542

Starting optimization iteration 3
Iteration 22:
learning_rate:  0.008177578862257324  || beta1:  0.9  || beta2:  0.999  || epochs:  11  || batch_size:  55  || (1 - macro_F1):  0.3127991736953689
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.008177578862257324,0.9,1,11,55,0.3127991736953689,3553492

Starting optimization iteration 4
Iteration 23:
learning_rate:  0.046107375961732394  || beta1:  0.29377454961533545  || beta2:  0.999  || epochs:  18  || batch_size:  59  || (1 - macro_F1):  0.5051001015811815
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.046107375961732394,0.29377454961533545,1,18,59,0.5051001015811815,3728267

Starting optimization iteration 5
Iteration 24:
lea



Iteration 47:
learning_rate:  0.00558075747037758  || beta1:  0.0619190351170853  || beta2:  0.9999  || epochs:  25  || batch_size:  65  || (1 - macro_F1):  0.2705230435409328
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.00558075747037758,0.0619190351170853,2,25,65,0.2705230435409328,8500989

Starting optimization iteration 29
Iteration 48:
learning_rate:  0.003235276663308036  || beta1:  0.28347973510649993  || beta2:  0.9999  || epochs:  25  || batch_size:  63  || (1 - macro_F1):  0.26939808322612024
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.003235276663308036,0.28347973510649993,2,25,63,0.26939808322612024,8729155

Starting optimization iteration 30
Iteration 49:
learning_rate:  0.006134513804419533  || beta1:  0.9  || beta2:  0.999  || epochs:  11  || batch_size:  87  || (1 - macro_F1):  0.28523219503639174
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.006134513804419533,0.9,1,11,87,0.28523219503639174,8826945

Starting optimi



Iteration 88:
learning_rate:  9.691000755698013e-05  || beta1:  0.9  || beta2:  0.9999  || epochs:  27  || batch_size:  99  || (1 - macro_F1):  0.6728730415397818
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
9.691000755698013e-05,0.9,2,27,99,0.6728730415397818,17032772

Starting optimization iteration 70
Iteration 89:
learning_rate:  0.006217031802288273  || beta1:  0.10534794068424523  || beta2:  0.99  || epochs:  25  || batch_size:  94  || (1 - macro_F1):  0.27817253542901943
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.006217031802288273,0.10534794068424523,0,25,94,0.27817253542901943,17237296

Starting optimization iteration 71
Iteration 90:
learning_rate:  0.0005496039498673612  || beta1:  0.12381326553388651  || beta2:  0.99  || epochs:  27  || batch_size:  16  || (1 - macro_F1):  0.32271508921105496
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.0005496039498673612,0.12381326553388651,0,27,16,0.32271508921105496,17694515

Startin



Iteration 94:
learning_rate:  0.06364714947916167  || beta1:  0.28653184246859004  || beta2:  0.99  || epochs:  4  || batch_size:  47  || (1 - macro_F1):  0.6624985012576788
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.06364714947916167,0.28653184246859004,0,4,47,0.6624985012576788,18611960

Starting optimization iteration 76
Iteration 95:
learning_rate:  0.0006445484508916296  || beta1:  0.9  || beta2:  0.99  || epochs:  22  || batch_size:  60  || (1 - macro_F1):  0.2879151553389767
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.0006445484508916296,0.9,0,22,60,0.2879151553389767,18817929

Starting optimization iteration 77
Iteration 96:
learning_rate:  0.000687531594901688  || beta1:  0.6111916497487694  || beta2:  0.9999  || epochs:  25  || batch_size:  96  || (1 - macro_F1):  0.30082508481616865
learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
0.000687531594901688,0.6111916497487694,2,25,96,0.30082508481616865,19021076

Starting optimiza

In [22]:
optimization_results_minist = pd.read_csv("optimization/minist_cnn_validation_hypermapper_output.csv")
best_parameters_idx_minist = optimization_results_minist["1 - F1"].idxmin()
parameters_minist = pd.DataFrame(optimization_results_minist.iloc[best_parameters_idx_minist]).transpose()
parameters_minist

Unnamed: 0,learning_rate,beta1,beta2,epochs,batch_size,1 - F1,Timestamp
61,0.003483,0.238497,2.0,25.0,45.0,0.261838,11380021.0


In [23]:
validation_results_minist = pd.read_csv("optimization/validation_results/CNN_minist_iteration_"+str(best_parameters_idx_minist)+".csv", index_col=0)
validation_results_minist

Unnamed: 0,Saúde,Relações Exteriores,Meio ambiente,"Educação, cultura e esporte",Justiça e Segurança,Trabalho e Previdência,Transporte/transporte de mercadorias,"Agricultura, pecuária e pesca",Ciência e tecnologia,Social,Presidência,Economia e planejamento,"Indústria, comércio, obras públicas, turismo"
validation set 1 f1,0.722222,0.966216,0.719101,0.838323,0.758824,0.685185,0.576271,0.844444,0.737864,0.603448,0.493421,0.862097,0.703833
validation set 2 f1,0.636364,0.956081,0.764045,0.814371,0.788235,0.699029,0.763636,0.673913,0.697248,0.66087,0.501608,0.86749,0.694534
validation set 3 f1,0.685714,0.959184,0.8,0.775,0.797753,0.730435,0.655738,0.729167,0.757282,0.608,0.501742,0.890851,0.697819
validation set 4 f1,0.717949,0.95189,0.836735,0.826347,0.791667,0.789916,0.571429,0.75,0.78,0.615385,0.539007,0.881988,0.714777
validation sets mean f1,0.690562,0.958343,0.77997,0.81351,0.78412,0.726141,0.641768,0.749381,0.743098,0.621926,0.508945,0.875606,0.702741
validation std f1,0.039637,0.00604,0.050273,0.027473,0.017317,0.046541,0.089971,0.071043,0.035084,0.026424,0.020416,0.013184,0.0089
validation set 1 precision,0.8125,0.989619,0.8,0.921053,0.854305,0.74,0.708333,0.95,0.904762,0.813953,0.619835,0.925532,0.848739
validation set 2 precision,0.84,0.986063,0.894737,0.883117,0.853503,0.818182,0.875,0.861111,0.826087,0.844444,0.609375,0.931095,0.782609
validation set 3 precision,0.774194,0.989474,0.837209,0.885714,0.845238,0.792453,0.8,0.795455,0.906977,0.703704,0.685714,0.929648,0.777778
validation set 4 precision,0.8,0.989286,0.854167,0.884615,0.904762,0.839286,0.64,0.818182,0.906977,0.8,0.783505,0.894488,0.852459


## Final "referenda" evaluation on test data

In [22]:
beta_2_real_p = [0.99,0.999,0.9999]

Train_Y_minist = np.array([np.array(t) for t in Train_Y_minist])
Test_Y_minist = np.array([np.array(t) for t in Test_Y_minist])

model = CNN_model(len(Train_Y_minist[0]))

opt = Adam(lr = parameters_minist["learning_rate"].iloc[0], beta_1 = parameters_minist["beta1"].iloc[0], beta_2 = beta_2_real_p[int(parameters_minist["beta2"].iloc[0])])
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[get_f1])
history = model.fit(x=np.array(Train_X), y=np.array(Train_Y_minist), epochs=int(parameters_minist["epochs"].iloc[0]), validation_split=0.0, class_weight=classes_w_minist, batch_size=int(parameters_minist["batch_size"].iloc[0]), shuffle=False)
test_prediction = model.predict(np.array(Test_X))
print("Referenda - CNN tuned")
print(classification_report(y_true=np.array(Test_Y_minist), y_pred=predict_classes(test_prediction), digits=3, target_names=minist_classes_names))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Referenda - CNN tuned
                                              precision    recall  f1-score   support

                                       Saúde      0.806     0.690     0.744        42
                         Relações Exteriores      0.983     0.935     0.958       308
                               Meio ambiente      0.872     0.739     0.800        46
                 Educação, cultura e esporte      0.865     0.719     0.785        89
                         Justiça e Segurança      0.895     0.775     0.831       187
                      Trabalho e Previdência      0.750     0.508     0.606        59
        Transporte/transporte de mercadorias      0.840     0.700     0.764        3

In [25]:
#Saving final model
model.save('models/CNN_minist_model_final')
pd.DataFrame(predict_classes(test_prediction)).to_csv("test_results/CNN_minist.csv", index=None, header=None)

In [26]:
for i in range (len(Test_Y_minist[0])+1):
    print(str(i)+"/"+str(len(Test_Y_minist[0]))+" classes:", acerto_absoluto(Test_Y_minist, test_prediction, i))

0/13 classes: 0.0
1/13 classes: 0.0
2/13 classes: 0.0
3/13 classes: 0.0
4/13 classes: 0.07457121551081282
5/13 classes: 0.0
6/13 classes: 0.0
7/13 classes: 0.2982848620432513
8/13 classes: 0.22371364653243847
9/13 classes: 0.5219985085756897
10/13 classes: 1.6405667412378822
11/13 classes: 7.606263982102908
12/13 classes: 24.459358687546604
13/13 classes: 65.17524235645041
