In [1]:
#!pip install tensorflow-gpu
# Downgrade 
#smart_open to 1.10.0 -> https://github.com/RaRe-Technologies/smart_open/issues/475
# python -m pip install -U smart_open==1.10.0

In [2]:
import os
%load_ext tensorboard

In [3]:
###########################################
# How to make deterministic experiments?
###########################################

# Main Sources:
    # 1) https://github.com/NVIDIA/tensorflow-determinism
    # 2) https://pypi.org/project/tensorflow-determinism/#description
            # There are currently two main ways to access GPU-deterministic functionality in TensorFlow for most
            # deep learning applications. 
            # 2.1) The first way is to use an NVIDIA NGC TensorFlow container. - https://www.nvidia.com/en-us/gpu-cloud/containers/
            # 2.2. The second way is to use version 1.14, 1.15, or 2.0 of stock TensorFlow with GPU support, 
            #      plus the application of a patch supplied in this repo.

# # # Ensure Deterministic behaviour
import random
import numpy as np
import tensorflow as tf
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Now using tensorflow 2.1.0, so no need to patch
# from tfdeterminism import patch
#patch()

seed = 42
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
############################################

In [4]:
# utils
from distutils.version import LooseVersion
from tqdm import tqdm_notebook
from datetime import datetime
from time import time

import warnings
import pickle
import gc
import sys
from json import dumps
import itertools
import re

# Data
import pandas as pd
import spacy

# Viz
import matplotlib.pyplot as plt

# Machine Learning
import tensorflow.keras.backend as K
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Bidirectional, LSTM, Flatten, Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.callbacks.callbacks import EarlyStopping
#from keras.callbacks import EarlyStopping, TensorBoard
#from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from keras_self_attention import SeqSelfAttention
from keras_multi_head import MultiHead, MultiHeadAttention

from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NLP Models
from gensim.models import Word2Vec, KeyedVectors
#w2v_models_path = 'C:/Users/arthu/Desktop/22032020 - Experimentos/05. Organizado/02. Notebooks/models/'
w2v_models_path = 'D:/Mestrado/Dissertação/07 .Dissertação Final/02. Experimentos/02. Word Embbedings/'

Using TensorFlow backend.


In [5]:
# METRICS
# def f1_score(y_true, y_pred):

#     # Count positive samples.
#     c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

# #     # If there are no true samples, fix the F1 score at 0.
# #     if c3 == 0:
# #         return 0

#     # How many selected items are relevant?
#     precision = c1 / c2

#     # How many relevant items are selected?
#     recall = c1 / c3

#     # Calculate f1_score
#     f1_score = 2 * (precision * recall) / (precision + recall)
#     return f1_score

# def recall(y_true, y_pred):

#     # Count positive samples.
#     c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

# #     # If there are no true samples, fix the F1 score at 0.
# #     if c3 == 0:
# #         return 0

#     # How many relevant items are selected?
#     recall = c1 / c3
    
#     return recall


# def precision(y_true, y_pred):

#     # Count positive samples.
#     c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))

#     # How many selected items are relevant?
#     precision = c1 / c2

#     return precision

# https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [6]:
print('tf version: ' + tf.__version__)

tf version: 2.1.0-rc2


In [7]:
#Variables
current_exp = 'Atendimento-Balanced-Multiclass'
if 'Binary' in current_exp:
    binary = True
else:
    binary = False
    
base_path = 'C:/Users/arthu/Desktop/22032020 - Experimentos/05. Organizado/03. Datasets/' + current_exp
save_path = 'output'

sentence = 'req-text'
label = 'Atendimento'

x_train_file = 'X_train.csv'
y_train_file = 'y_train.csv'
x_test_file = 'X_test.csv'
y_test_file = 'y_test.csv'

#Load data
X_train = pd.read_csv(os.path.join(base_path, x_train_file), sep=';', encoding='utf-8')
y_train = pd.read_csv(os.path.join(base_path, y_train_file), sep=';', encoding='utf-8')
X_test = pd.read_csv(os.path.join(base_path, x_test_file), sep=';', encoding='utf-8')
y_test = pd.read_csv(os.path.join(base_path, y_test_file), sep=';', encoding='utf-8')

#Checking on data
print(X_train.columns)
print(X_train.shape)
print(y_train[label].value_counts())
print(y_test[label].value_counts())

Index(['pid', 'req-text', 'resp-text', '1funct-request', '2pronoun-request',
       '3ppron-request', '4i-request', '5we-request', '6you-request',
       '7shehe-request',
       ...
       '58home-response', '59money-response', '60relig-response',
       '61death-response', '62assent-response', '63nonfl-response',
       '64filler-response', 'Clareza', 'Atendimento', 'tempo_resposta'],
      dtype='object', length=134)
(6982, 134)
0    2367
1    2319
2    2296
Name: Atendimento, dtype: int64
2    1029
1    1006
0     958
Name: Atendimento, dtype: int64


In [8]:
# Keep only text columns
X_train.drop(columns=X_train.columns[2:], inplace=True)
X_test.drop(columns=X_train.columns[2:], inplace=True)

In [9]:
###################################################
X_train['sentence'] = X_train[sentence]
X_test['sentence'] = X_test[sentence]

y_train['label'] = y_train[label]
y_test['label'] = y_test[label]

##################################################################
# CUT DATAFRAME
# factor = 10000
# df = pd.concat([df[df.label=='1'][0:factor], df[df.label=='0'][0:factor]])
##################################################################

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(X_train.shape[0]))

# Report the classes balance.
print('Classes distribuition: \n')
print(y_train[label].value_counts())

# Display 10 random rows from the data.
X_train.sample(10)

Number of training sentences: 6,982

Classes distribuition: 

0    2367
1    2319
2    2296
Name: Atendimento, dtype: int64


Unnamed: 0,pid,req-text,sentence
4119,414762,Gostaria de receber todas as informações refer...,Gostaria de receber todas as informações refer...
3477,423065,"Prezados, boa tarde! Solicito a gentileza de i...","Prezados, boa tarde! Solicito a gentileza de i..."
4322,432042,1. PEDIDOS DE ACESSO À INFORMAÇÃO 1.1. PEDIDO ...,1. PEDIDOS DE ACESSO À INFORMAÇÃO 1.1. PEDIDO ...
3349,553831,Considerando o posicionamento formal da DIPAR ...,Considerando o posicionamento formal da DIPAR ...
469,546829,"Prezados senhores, solicito para fins acadêmic...","Prezados senhores, solicito para fins acadêmic..."
538,467278,NÚMEROS DE 2017 DE CARGOS VAGOS DE SERVIDORES ...,NÚMEROS DE 2017 DE CARGOS VAGOS DE SERVIDORES ...
6636,347965,Gostaria de saber quando serão convocados os a...,Gostaria de saber quando serão convocados os a...
5820,445187,O pedido se encontra no anexo abaixo.,O pedido se encontra no anexo abaixo.
1047,565317,1.\tA Instituição utiliza sistema de cotas? 1....,1.\tA Instituição utiliza sistema de cotas? 1....
1219,327624,Estou esperando a nomeação pelo concurso reali...,Estou esperando a nomeação pelo concurso reali...


In [10]:
# Looking lengths
lengths = [X_train.sentence.apply(lambda x: len(x.split(' ')))]
perc =[.25, .50, .75, .80, .85, .90, .91, .92, .93, .94, .95, .96, .97, .98, .99] 
lengths[0].describe(percentiles = perc)

count    6982.000000
mean       75.929390
std        65.344213
min         1.000000
25%        32.000000
50%        54.000000
75%        97.000000
80%       110.000000
85%       135.000000
90%       168.900000
91%       177.000000
92%       188.000000
93%       201.330000
94%       218.000000
95%       227.950000
96%       246.000000
97%       265.000000
98%       281.000000
99%       299.000000
max       359.000000
Name: sentence, dtype: float64

# w2v Model for Embedding Layer

In [11]:
#w2v_cbow_esic_model=KeyedVectors.load(os.path.join(w2v_models_path,'word2vec_sg_hs_DetalhamentoSolicitacao_all_sentences_128.model'))
w2v_cbow_nilc_model=KeyedVectors.load_word2vec_format(os.path.join(w2v_models_path,'cbow_s300.txt'))

In [12]:
pretrained_weights = w2v_cbow_nilc_model.wv.syn0
print(pretrained_weights.shape)
max_num_words = pretrained_weights.shape[0]
embed_size = pretrained_weights.shape[1]

(929606, 300)


  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


# Data Prep

In [13]:
max_length=128

# Define tokenizer and fit train data
t = Tokenizer(num_words=max_num_words)
t.fit_on_texts(X_train['sentence'].append(X_test['sentence']))
word_index = t.word_index
vocab_size = len(word_index) + 1
print('Found %s unique tokens.' % len(word_index))
    
def get_seqs(text):    
    sequences = t.texts_to_sequences(text)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences

Found 34690 unique tokens.


In [14]:
# prepare target
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    if binary:
        return y_train_enc, y_test_enc
    else:
        return pd.get_dummies(y_train_enc), pd.get_dummies(y_test_enc)

In [15]:
# X and Y
label_train, label_test = prepare_targets(y_train.label.values, y_test.label.values)
num_labels = len(set(label_train))
input_train = get_seqs(X_train.sentence)
input_test = get_seqs(X_test.sentence)

# Modeling

In [16]:
embedding_matrix = np.zeros((vocab_size, embed_size))
for word, i in t.word_index.items():
    try:
        embedding_vector = w2v_cbow_nilc_model.wv.__getitem__(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    # Words not in vocab -> Frequency less than 5 word
    except KeyError as e:
        print(e)

  after removing the cwd from sys.path.


"word '1' not in vocabulary"
"word 'à' not in vocabulary"
"word '2016' not in vocabulary"
"word '2015' not in vocabulary"
"word '2' not in vocabulary"
"word '3' not in vocabulary"
"word '2014' not in vocabulary"
"word '2017' not in vocabulary"
"word '–' not in vocabulary"
"word '12' not in vocabulary"
"word '4' not in vocabulary"
"word '5' not in vocabulary"
"word '10' not in vocabulary"
"word '2013' not in vocabulary"
"word '2011' not in vocabulary"
"word '11' not in vocabulary"
"word '2012' not in vocabulary"
"word '01' not in vocabulary"
"word '6' not in vocabulary"
"word '7' not in vocabulary"
"word '8' not in vocabulary"
"word 'às' not in vocabulary"
"word '06' not in vocabulary"
"word '05' not in vocabulary"
"word '2010' not in vocabulary"
"word '9' not in vocabulary"
"word '527' not in vocabulary"
"word '30' not in vocabulary"
"word '03' not in vocabulary"
"word '18' not in vocabulary"
"word '15' not in vocabulary"
"word '02' not in vocabulary"
"word '20' not in vocabulary"
"wor

"word '294' not in vocabulary"
"word '387' not in vocabulary"
"word '541' not in vocabulary"
"word '380' not in vocabulary"
"word '573' not in vocabulary"
"word '“sigiloso”' not in vocabulary"
"word 'www4' not in vocabulary"
"word 'cgpert' not in vocabulary"
"word '474' not in vocabulary"
"word 'pfaneel' not in vocabulary"
"word '293' not in vocabulary"
"word '512' not in vocabulary"
"word '726' not in vocabulary"
"word 'brasil”' not in vocabulary"
"word '227' not in vocabulary"
"word '23228' not in vocabulary"
"word '030' not in vocabulary"
"word '138' not in vocabulary"
"word '13º' not in vocabulary"
"word 'plhis' not in vocabulary"
"word 'mcidades' not in vocabulary"
"word 'asfoc' not in vocabulary"
"word '552' not in vocabulary"
"word '715' not in vocabulary"
"word '3515' not in vocabulary"
"word '459' not in vocabulary"
"word '376' not in vocabulary"
"word '655' not in vocabulary"
"word '526' not in vocabulary"
"word 'dgpm' not in vocabulary"
"word '881' not in vocabulary"
"word '

"word '“c”' not in vocabulary"
"word '558' not in vocabulary"
"word 'fopag' not in vocabulary"
"word '158267' not in vocabulary"
"word '070' not in vocabulary"
"word '355' not in vocabulary"
"word 'a1' not in vocabulary"
"word '183' not in vocabulary"
"word 'srrf10' not in vocabulary"
"word '367' not in vocabulary"
"word 'cpgf' not in vocabulary"
"word '3808' not in vocabulary"
"word '221' not in vocabulary"
"word '060' not in vocabulary"
"word '1065' not in vocabulary"
"word 'v1' not in vocabulary"
"word 'v2' not in vocabulary"
"word 'hctm' not in vocabulary"
"word '“que' not in vocabulary"
"word '611' not in vocabulary"
"word '04941' not in vocabulary"
"word '432' not in vocabulary"
"word '1330' not in vocabulary"
"word 'hnas' not in vocabulary"
"word '974' not in vocabulary"
"word '415' not in vocabulary"
"word 'siass' not in vocabulary"
"word '808' not in vocabulary"
"word '410' not in vocabulary"
"word '23034' not in vocabulary"
"word '412' not in vocabulary"
"word '490' not in vo

"word '830' not in vocabulary"
"word 'apostilamentos' not in vocabulary"
"word 'v2225' not in vocabulary"
"word 'missings' not in vocabulary"
"word 'gerec' not in vocabulary"
"word 'roldao' not in vocabulary"
"word '553' not in vocabulary"
"word 'ouvid' not in vocabulary"
"word '“diagnóstico' not in vocabulary"
"word 'pessoas”' not in vocabulary"
"word '748' not in vocabulary"
"word 'ltcat' not in vocabulary"
"word '1800' not in vocabulary"
"word 'ballande' not in vocabulary"
"word '03000001671201732' not in vocabulary"
"word '360770' not in vocabulary"
"word 'sgpr' not in vocabulary"
"word '23136' not in vocabulary"
"word 'registrato' not in vocabulary"
"word 'adaury' not in vocabulary"
"word 'certificaçao' not in vocabulary"
"word 'nº12' not in vocabulary"
"word '55000' not in vocabulary"
"word 'proae' not in vocabulary"
"word '689' not in vocabulary"
"word '000069' not in vocabulary"
"word 'gol9109' not in vocabulary"
"word 'sbfz' not in vocabulary"
"word '821' not in vocabulary"
"w

"word 'multifinalitários' not in vocabulary"
"word 'cobfinamento' not in vocabulary"
"word 'soliciito' not in vocabulary"
"word 'dtr2004' not in vocabulary"
"word 'sinanweb' not in vocabulary"
"word 'envenvondo' not in vocabulary"
"word 'mazali' not in vocabulary"
"word '99510625' not in vocabulary"
"word 'quaando' not in vocabulary"
"word 'aisipoa' not in vocabulary"
"word 'cgsiie' not in vocabulary"
"word '03950001024201520' not in vocabulary"
"word 'eletrônio' not in vocabulary"
"word 'aeroporta' not in vocabulary"
"word 'coopertáxi' not in vocabulary"
"word 'capistana' not in vocabulary"
"word 'echeli' not in vocabulary"
"word 'transacao' not in vocabulary"
"word 'nº181' not in vocabulary"
"word '0030684' not in vocabulary"
"word '33760398' not in vocabulary"
"word '003216' not in vocabulary"
"word 'alimentaçaõ' not in vocabulary"
"word 'acumulaçao' not in vocabulary"
"word '542' not in vocabulary"
"word 'e10' not in vocabulary"
"word 'e14' not in vocabulary"
"word 'i99' not in voc

"word '08505' not in vocabulary"
"word '21175' not in vocabulary"
"word 'sabwr' not in vocabulary"
"word 'gênero”' not in vocabulary"
"word 'multiinstitucional' not in vocabulary"
"word '00075000453201557' not in vocabulary"
"word 'mirkovski' not in vocabulary"
"word '2115220' not in vocabulary"
"word '741752' not in vocabulary"
"word '255' not in vocabulary"
"word '406' not in vocabulary"
"word '755' not in vocabulary"
"word '“guerrilha' not in vocabulary"
"word 'coopema' not in vocabulary"
"word 'ebbt' not in vocabulary"
"word 'siopi' not in vocabulary"
"word '1911' not in vocabulary"
"word '0001090' not in vocabulary"
"word 'agrosilvipastoril' not in vocabulary"
"word 'conhecidencia' not in vocabulary"
"word '08802' not in vocabulary"
"word '004908' not in vocabulary"
"word 'banjamim' not in vocabulary"
"word '000255' not in vocabulary"
"word '99455648' not in vocabulary"
"word 'reinilson' not in vocabulary"
"word 'entrvistou' not in vocabulary"
"word '99144953' not in vocabulary"
"

"word '794' not in vocabulary"
"word '16853007673201665' not in vocabulary"
"word 'desituação' not in vocabulary"
"word '920892' not in vocabulary"
"word 'b5b334898' not in vocabulary"
"word 'trucam' not in vocabulary"
"word '80810' not in vocabulary"
"word '0808' not in vocabulary"
"word 'conuni' not in vocabulary"
"word '9o' not in vocabulary"
"word '10o' not in vocabulary"
"word '46800' not in vocabulary"
"word 'tayaramelo' not in vocabulary"
"word '10986' not in vocabulary"
"word 'atenciosamnete' not in vocabulary"
"word 'mecabo' not in vocabulary"
"word '3268' not in vocabulary"
"word '7177' not in vocabulary"
"word 'ghtml' not in vocabulary"
"word '3288' not in vocabulary"
"word 'soccol' not in vocabulary"
"word '98179' not in vocabulary"
"word '3338' not in vocabulary"
"word 'canchos' not in vocabulary"
"word 'congelas' not in vocabulary"
"word 'frescais' not in vocabulary"
"word '033872' not in vocabulary"
"word 'inexitosos' not in vocabulary"
"word '006075' not in vocabulary"


"word 'silíciomanganês' not in vocabulary"
"word 'rgis' not in vocabulary"
"word '2376' not in vocabulary"
"word 'insrf' not in vocabulary"
"word '697' not in vocabulary"
"word 'cgpar' not in vocabulary"
"word '10h49' not in vocabulary"
"word '13h45' not in vocabulary"
"word 'paaron' not in vocabulary"
"word '23480013188201752' not in vocabulary"
"word '000857' not in vocabulary"
"word '008610' not in vocabulary"
"word 'resjustado' not in vocabulary"
"word '006107' not in vocabulary"
"word '91662543' not in vocabulary"
"word '96444775' not in vocabulary"
"word '26233' not in vocabulary"
"word '701043' not in vocabulary"
"word '0226156' not in vocabulary"
"word 'mauc' not in vocabulary"
"word 'sipaf' not in vocabulary"
"word '005160' not in vocabulary"
"word 'uperintendência' not in vocabulary"
"word '011081' not in vocabulary"
"word '4600508781' not in vocabulary"
"word '25820001978201551' not in vocabulary"
"word '54100' not in vocabulary"
"word '002189' not in vocabulary"
"word '5450

"word 'conosco”' not in vocabulary"
"word '“atendimento”' not in vocabulary"
"word '725' not in vocabulary"
"word '7474' not in vocabulary"
"word '18h' not in vocabulary"
"word 'milkitar' not in vocabulary"
"word 'expcex' not in vocabulary"
"word '03886112608' not in vocabulary"
"word 'cogtl' not in vocabulary"
"word '1795541' not in vocabulary"
"word '003621' not in vocabulary"
"word 'ilhotinha' not in vocabulary"
"word '016444' not in vocabulary"
"word '005775' not in vocabulary"
"word '002423' not in vocabulary"
"word '002661' not in vocabulary"
"word '000236' not in vocabulary"
"word '000121' not in vocabulary"
"word '000528' not in vocabulary"
"word '000381' not in vocabulary"
"word '000595' not in vocabulary"
"word '000009' not in vocabulary"
"word '003429' not in vocabulary"
"word 'burocratica' not in vocabulary"
"word 'atcta' not in vocabulary"
"word 'consultapublica' not in vocabulary"
"word '785' not in vocabulary"
"word 'falcização' not in vocabulary"
"word 'macroregião' not

"word '4530' not in vocabulary"
"word 'segundas´feiras' not in vocabulary"
"word 'recursosw' not in vocabulary"
"word '3223' not in vocabulary"
"word '1111' not in vocabulary"
"word 'br030' not in vocabulary"
"word 'br499' not in vocabulary"
"word 'eículo' not in vocabulary"
"word 'apólise' not in vocabulary"
"word 'precismo' not in vocabulary"
"word '60873' not in vocabulary"
"word '4527' not in vocabulary"
"word 'elieuda' not in vocabulary"
"word '8533' not in vocabulary"
"word '2146' not in vocabulary"
"word 'progest' not in vocabulary"
"word 'cgsis' not in vocabulary"
"word 'dominaconcursos' not in vocabulary"
"word 'ilucidem' not in vocabulary"
"word 'marliére' not in vocabulary"
"word 'visualizacv' not in vocabulary"
"word 'k4426448z6' not in vocabulary"
"word '07ª' not in vocabulary"
"word 'cesgrario' not in vocabulary"
"word '17°' not in vocabulary"
"word '0589' not in vocabulary"
"word 'kywal' not in vocabulary"
"word 'beledelli' not in vocabulary"
"word '1270311' not in vocab

"word '0033' not in vocabulary"
"word '0034' not in vocabulary"
"word 'ifbas' not in vocabulary"
"word 'leniencia' not in vocabulary"
"word 'eficiência”' not in vocabulary"
"word 'desarazoado' not in vocabulary"
"word '4600410815' not in vocabulary"
"word 'marianoi' not in vocabulary"
"word 'pelka' not in vocabulary"
"word '02680002098201629' not in vocabulary"
"word '714' not in vocabulary"
"word '301550' not in vocabulary"
"word 'motivio' not in vocabulary"
"word 'reticiente' not in vocabulary"
"word 'jullgou' not in vocabulary"
"word '03000001612201683' not in vocabulary"
"word 'desenvolvolvimento' not in vocabulary"
"word 'faeb' not in vocabulary"
"word '4600425298' not in vocabulary"
"word '20154' not in vocabulary"
"word '55h' not in vocabulary"
"word 'mfdv' not in vocabulary"
"word 'gujão' not in vocabulary"
"word 'avinor' not in vocabulary"
"word 'avigran' not in vocabulary"
"word 'naturaves' not in vocabulary"
"word 'avigro' not in vocabulary"
"word 'nº001' not in vocabulary"


"word '01250' not in vocabulary"
"word '022738' not in vocabulary"
"word '00009661' not in vocabulary"
"word '017810' not in vocabulary"
"word 'iluminaçao' not in vocabulary"
"word 'capacitacao' not in vocabulary"
"word 'gedep' not in vocabulary"
"word 'adional' not in vocabulary"
"word '3281' not in vocabulary"
"word '2109' not in vocabulary"
"word '50606002206' not in vocabulary"
"word '9823' not in vocabulary"
"word '99101' not in vocabulary"
"word '38025' not in vocabulary"
"word 'logíst' not in vocabulary"
"word 'geren' not in vocabulary"
"word 'bossardi' not in vocabulary"
"word '1923800998058989' not in vocabulary"
"word '423' not in vocabulary"
"word 'nº24' not in vocabulary"
"word '28043014' not in vocabulary"
"word '59626480' not in vocabulary"
"word '98748' not in vocabulary"
"word '3892' not in vocabulary"
"word '364bmg0350' not in vocabulary"
"word 'guarinhatã' not in vocabulary"
"word '61993128857' not in vocabulary"
"word '002880' not in vocabulary"
"word 'òbito' not in 

In [17]:
# Define model
def train_model(input_train, input_test, label_train, label_test,
                lstm_size=128, dropout=0.2, rec_dropout=0.2, lr=0.005, epochs=50, att_heads=4, max_length=128, 
                vocab_size=None, embed_size=None, emb_trainable=False, batch=128, early_stopping=5,
                save_dir="D:/resultados/checkpoins_solicitacao_keras_mh_att/", best_predefined_f1=0.390):

    # Time now
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Log
    log_dir = save_dir + now
    
    # Model Saver    
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    
    log_file = open(os.path.join(log_dir,"log.txt"), mode="a")
    
    # Save Params
    params = {
        'lstm_size': lstm_size,
        'dropout': dropout,
        'rec_dropout': rec_dropout,
        'lr': lr,
        'epochs': epochs,
        'att_heads': att_heads,
        'max_length': max_length,
        'vocab_size': vocab_size,
        'embed_size': embed_size,
        'emb_trainable': emb_trainable,
        'batch': batch,
        'early_stopping': early_stopping,
        'log_dir': log_dir,
        'best_predefined_f1': best_predefined_f1}
    
    # Saving Parameters
    with open(os.path.join(log_dir, 'params.txt'),'a') as f:
        f.write('\n\n' + ('#'*60))
        f.write('\nParameters:\n')
        f.write('now: ' + str(now))
        f.write('\n' + dumps(params) + '\n')
    
    # input
    inp = Input(shape=(max_length, ))

    # Embedding layer - https://keras.io/layers/embeddings/
    embedding_layer = Embedding(vocab_size,
                                embed_size,                                
                                weights=[embedding_matrix],
                                input_length=max_length,
                                trainable=emb_trainable,
                                name='Embedding')(inp)

    # Bidirectional Layer
    bilstm_layer = Bidirectional(LSTM(
                        units=lstm_size,
                        return_sequences=True,
                        dropout=dropout,
                        recurrent_dropout=rec_dropout,
                        name='LSTM'))(embedding_layer)    

    # MultiHead-Attention Layer
    #https://pypi.org/project/keras-multi-head/
    multiHead_att_layer = MultiHeadAttention(head_num=att_heads, name='Multi-Head-Attention')(bilstm_layer)

    dropout_intermed_layer = Dropout(0.5)(multiHead_att_layer)

    # # Flatten
    flatten_layer = Flatten(name='Flatten')(dropout_intermed_layer)

    dense_intermed_layer = Dense(128, activation='relu')(flatten_layer)
    dropout_intermed_2_layer = Dropout(dropout)(dense_intermed_layer)

    # # # # Dense Layer
    if binary:
        dense_layer = Dense(1, activation='sigmoid')(dropout_intermed_2_layer)    
    else:
        dense_layer = Dense(num_labels, activation='softmax')(dropout_intermed_2_layer)    
    
    model = Model(inputs=inp, outputs=dense_layer)
    # model.summary()
    
    # Compile
    model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', metrics=['accuracy', precision_m, recall_m, f1_m])
    
    # callbacks
    es_callback = EarlyStopping(monitor='val_loss', patience=early_stopping, verbose=1, mode='min')
    
    # Fitting Model
    model.fit(input_train,
              label_train,
              epochs=epochs,
              batch_size=batch,
              validation_data=(input_test, label_test),
              verbose=0,
              callbacks=[es_callback])
    
    # PLOT LOSS
    plt.title('Loss')
    plt.plot(model.history.history['loss'], label='train')
    plt.plot(model.history.history['val_loss'], label='test')
    plt.legend()
    #plt.show();
    plt.savefig(os.path.join(log_dir,'Loss.png'))
    plt.close()
    
    # Classification
    y_pred = model.predict(input_test, batch_size=batch, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    
    if not binary:
        label_test = np.argmax(label_test.values, axis=1)
    
    # Metrics
    f1 = f1_score(label_test, y_pred_bool, average='weighted')
    print(f"Best Test F1-Score: {f1:.3f}")
    
    print("#"*60 + '\n', file=log_file)
    print(classification_report(label_test, y_pred_bool), file=log_file)
    print("#"*60+ '\n', file=log_file)
    
    # Flush log file
    log_file.flush()
    log_file.close()
    
    # Save final result
    with open(os.path.join(log_dir[:-16], 'output.txt'),'a') as f:
        f.write('\n\n')
        f.write(log_dir)
        f.write('\n')
        f.write(f"Best Test F1-Score: {f1:.3f}")
        
    # save model and architecture to single file
    if f1 > best_predefined_f1:
        model.save(os.path.join(log_dir, "model.h5"))
        print("Saved model to disk")

In [None]:
# Model Params
lstm_size_list = [256, 512]
dropout_list = [0.25, 0.5]
rec_dropout_list = [0.1, 0.25, 0.5]
lr_list = [1e-3, 5e-4, 1e-4, 5e-5, 5e-6]

all_params = [lstm_size_list] + [dropout_list] + [rec_dropout_list] + [lr_list]

for each in itertools.product(*all_params):    
    lstm_size, dropout, rec_dropout, lr = each
    
    # Params
    print('lstm_size: ' + str(lstm_size))
    print('\tdropout: ' + str(dropout))
    print('\trec_dropout: ' + str(rec_dropout))
    print('\tlr: ' + str(lr))
    
    # train
    train_model(input_train, input_test, label_train, label_test, lstm_size, dropout, rec_dropout, lr, epochs=50,
                att_heads=4, max_length=max_length, vocab_size=vocab_size, embed_size=embed_size, emb_trainable=False, batch=128,
                early_stopping=5, save_dir="D:/Outputs_Mestrado/resultados_Atendimento/checkpoins_solicitacao_keras_mh_att/",
                best_predefined_f1=0.424)

lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 0.001
Epoch 00011: early stopping
Best Test F1-Score: 0.405
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 0.0005
Epoch 00012: early stopping
Best Test F1-Score: 0.401
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 0.0001
Epoch 00012: early stopping
Best Test F1-Score: 0.413
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 5e-05
Epoch 00022: early stopping
Best Test F1-Score: 0.428
Saved model to disk
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 5e-06
Epoch 00050: early stopping
Best Test F1-Score: 0.408
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
	lr: 0.001
Epoch 00011: early stopping
Best Test F1-Score: 0.411
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
	lr: 0.0005
Epoch 00013: early stopping
Best Test F1-Score: 0.424
Saved model to disk
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
	lr: 0.0001
Epoch 00015: early stopping
Best Test F1-Score: 0.407
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
