In [1]:
#!pip install tensorflow-gpu
# Downgrade 
#smart_open to 1.10.0 -> https://github.com/RaRe-Technologies/smart_open/issues/475
# python -m pip install -U smart_open==1.10.0

In [2]:
import os
%load_ext tensorboard

In [3]:
###########################################
# How to make deterministic experiments?
###########################################

# Main Sources:
    # 1) https://github.com/NVIDIA/tensorflow-determinism
    # 2) https://pypi.org/project/tensorflow-determinism/#description
            # There are currently two main ways to access GPU-deterministic functionality in TensorFlow for most
            # deep learning applications. 
            # 2.1) The first way is to use an NVIDIA NGC TensorFlow container. - https://www.nvidia.com/en-us/gpu-cloud/containers/
            # 2.2. The second way is to use version 1.14, 1.15, or 2.0 of stock TensorFlow with GPU support, 
            #      plus the application of a patch supplied in this repo.

# # # Ensure Deterministic behaviour
import random
import numpy as np
import tensorflow as tf
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Now using tensorflow 2.1.0, so no need to patch
# from tfdeterminism import patch
#patch()

seed = 42
os.environ['PYTHONHASHSEED']=str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
############################################

In [4]:
# utils
from distutils.version import LooseVersion
from tqdm import tqdm_notebook
from datetime import datetime
from time import time

import warnings
import pickle
import gc
import sys
from json import dumps
import itertools
import re

# Data
import pandas as pd
import spacy

# Viz
import matplotlib.pyplot as plt

# Machine Learning
import tensorflow.keras.backend as K
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Bidirectional, LSTM, Flatten, Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.callbacks.callbacks import EarlyStopping
#from keras.callbacks import EarlyStopping, TensorBoard
#from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from keras_self_attention import SeqSelfAttention
from keras_multi_head import MultiHead, MultiHeadAttention

from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NLP Models
from gensim.models import Word2Vec, KeyedVectors
#w2v_models_path = 'C:/Users/arthu/Desktop/22032020 - Experimentos/05. Organizado/02. Notebooks/models/'
w2v_models_path = 'D:/Mestrado/Dissertação/07 .Dissertação Final/02. Experimentos/02. Word Embbedings/'

Using TensorFlow backend.


In [5]:
# METRICS
# def f1_score(y_true, y_pred):

#     # Count positive samples.
#     c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

# #     # If there are no true samples, fix the F1 score at 0.
# #     if c3 == 0:
# #         return 0

#     # How many selected items are relevant?
#     precision = c1 / c2

#     # How many relevant items are selected?
#     recall = c1 / c3

#     # Calculate f1_score
#     f1_score = 2 * (precision * recall) / (precision + recall)
#     return f1_score

# def recall(y_true, y_pred):

#     # Count positive samples.
#     c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

# #     # If there are no true samples, fix the F1 score at 0.
# #     if c3 == 0:
# #         return 0

#     # How many relevant items are selected?
#     recall = c1 / c3
    
#     return recall


# def precision(y_true, y_pred):

#     # Count positive samples.
#     c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))

#     # How many selected items are relevant?
#     precision = c1 / c2

#     return precision

# https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [6]:
print('tf version: ' + tf.__version__)

tf version: 2.1.0-rc2


In [7]:
#Variables
current_exp = 'Atendimento-Balanced-Multiclass'
if 'Binary' in current_exp:
    binary = True
else:
    binary = False
    
base_path = 'C:/Users/arthu/Desktop/22032020 - Experimentos/05. Organizado/03. Datasets/' + current_exp
save_path = 'output'

sentence = 'resp-text'
label = 'Atendimento'

x_train_file = 'X_train.csv'
y_train_file = 'y_train.csv'
x_test_file = 'X_test.csv'
y_test_file = 'y_test.csv'

#Load data
X_train = pd.read_csv(os.path.join(base_path, x_train_file), sep=';', encoding='utf-8')
y_train = pd.read_csv(os.path.join(base_path, y_train_file), sep=';', encoding='utf-8')
X_test = pd.read_csv(os.path.join(base_path, x_test_file), sep=';', encoding='utf-8')
y_test = pd.read_csv(os.path.join(base_path, y_test_file), sep=';', encoding='utf-8')

#Checking on data
print(X_train.columns)
print(X_train.shape)
print(y_train[label].value_counts())
print(y_test[label].value_counts())

Index(['pid', 'req-text', 'resp-text', '1funct-request', '2pronoun-request',
       '3ppron-request', '4i-request', '5we-request', '6you-request',
       '7shehe-request',
       ...
       '58home-response', '59money-response', '60relig-response',
       '61death-response', '62assent-response', '63nonfl-response',
       '64filler-response', 'Clareza', 'Atendimento', 'tempo_resposta'],
      dtype='object', length=134)
(6982, 134)
0    2367
1    2319
2    2296
Name: Atendimento, dtype: int64
2    1029
1    1006
0     958
Name: Atendimento, dtype: int64


In [8]:
# Keep only text columns
X_train.drop(columns=X_train.columns[3:], inplace=True)
X_test.drop(columns=X_train.columns[3:], inplace=True)

In [9]:
###################################################
X_train['sentence'] = X_train[sentence]
X_test['sentence'] = X_test[sentence]

y_train['label'] = y_train[label]
y_test['label'] = y_test[label]

##################################################################
# CUT DATAFRAME
# factor = 10000
# df = pd.concat([df[df.label=='1'][0:factor], df[df.label=='0'][0:factor]])
##################################################################

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(X_train.shape[0]))

# Report the classes balance.
print('Classes distribuition: \n')
print(y_train[label].value_counts())

# Display 10 random rows from the data.
X_train.sample(10)

Number of training sentences: 6,982

Classes distribuition: 

0    2367
1    2319
2    2296
Name: Atendimento, dtype: int64


Unnamed: 0,pid,req-text,resp-text,sentence
4119,414762,Gostaria de receber todas as informações refer...,"Prezado a Cidadão ã , 1 . Conforme solicitação...","Prezado a Cidadão ã , 1 . Conforme solicitação..."
3477,423065,"Prezados, boa tarde! Solicito a gentileza de i...","Prezado Senhor , O processo mencionado nademan...","Prezado Senhor , O processo mencionado nademan..."
4322,432042,1. PEDIDOS DE ACESSO À INFORMAÇÃO 1.1. PEDIDO ...,Segue respostas deste Setor . RESPOSTA A PEDI...,Segue respostas deste Setor . RESPOSTA A PEDI...
3349,553831,Considerando o posicionamento formal da DIPAR ...,"Prezado Senhor Carlos , Permanece o mesmo ente...","Prezado Senhor Carlos , Permanece o mesmo ente..."
469,546829,"Prezados senhores, solicito para fins acadêmic...","Senhora Solicitante , Em atenção à solicitação...","Senhora Solicitante , Em atenção à solicitação..."
538,467278,NÚMEROS DE 2017 DE CARGOS VAGOS DE SERVIDORES ...,"Prezado Sr . Marques , Agradecemos seu contato...","Prezado Sr . Marques , Agradecemos seu contato..."
6636,347965,Gostaria de saber quando serão convocados os a...,Prezado Sr . Obrigado por utilizar esse canal ...,Prezado Sr . Obrigado por utilizar esse canal ...
5820,445187,O pedido se encontra no anexo abaixo.,"Prezado Senhor , Compete à Anatel a administra...","Prezado Senhor , Compete à Anatel a administra..."
1047,565317,1.\tA Instituição utiliza sistema de cotas? 1....,Prezada Aline . Consultamos documentos enviado...,Prezada Aline . Consultamos documentos enviado...
1219,327624,Estou esperando a nomeação pelo concurso reali...,"Prezada, Atualmente, o IF Baiano possui 27 có...","Prezada, Atualmente, o IF Baiano possui 27 có..."


In [10]:
# Looking lengths
lengths = [X_train.sentence.apply(lambda x: len(x.split(' ')))]
perc =[.25, .50, .75, .80, .85, .90, .91, .92, .93, .94, .95, .96, .97, .98, .99] 
lengths[0].describe(percentiles = perc)

count    6982.000000
mean      166.246777
std       163.302486
min         1.000000
25%        56.000000
50%       120.000000
75%       225.000000
80%       255.000000
85%       290.000000
90%       354.000000
91%       366.000000
92%       384.040000
93%       405.000000
94%       431.000000
95%       459.000000
96%       499.000000
97%       539.570000
98%       643.520000
99%       784.520000
max      1885.000000
Name: sentence, dtype: float64

# w2v Model for Embedding Layer

In [11]:
#w2v_cbow_esic_model=KeyedVectors.load(os.path.join(w2v_models_path,'word2vec_sg_hs_DetalhamentoSolicitacao_all_sentences_128.model'))
w2v_cbow_nilc_model=KeyedVectors.load_word2vec_format(os.path.join(w2v_models_path,'cbow_s300.txt'))

In [12]:
pretrained_weights = w2v_cbow_nilc_model.wv.syn0
print(pretrained_weights.shape)
max_num_words = pretrained_weights.shape[0]
embed_size = pretrained_weights.shape[1]

(929606, 300)


  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


# Data Prep

In [13]:
max_length=128

# Define tokenizer and fit train data
t = Tokenizer(num_words=max_num_words)
t.fit_on_texts(X_train['sentence'].append(X_test['sentence']))
word_index = t.word_index
vocab_size = len(word_index) + 1
print('Found %s unique tokens.' % len(word_index))
    
def get_seqs(text):    
    sequences = t.texts_to_sequences(text)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences

Found 36763 unique tokens.


In [14]:
# prepare target
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    if binary:
        return y_train_enc, y_test_enc
    else:
        return pd.get_dummies(y_train_enc), pd.get_dummies(y_test_enc)

In [15]:
# X and Y
label_train, label_test = prepare_targets(y_train.label.values, y_test.label.values)
num_labels = len(set(label_train))
input_train = get_seqs(X_train.sentence)
input_test = get_seqs(X_test.sentence)

# Modeling

In [16]:
embedding_matrix = np.zeros((vocab_size, embed_size))
for word, i in t.word_index.items():
    try:
        embedding_vector = w2v_cbow_nilc_model.wv.__getitem__(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    # Words not in vocab -> Frequency less than 5 word
    except KeyError as e:
        print(e)

  after removing the cwd from sys.path.


"word 'à' not in vocabulary"
"word '12' not in vocabulary"
"word '1' not in vocabulary"
"word '7' not in vocabulary"
"word '10' not in vocabulary"
"word '2' not in vocabulary"
"word '2012' not in vocabulary"
"word '724' not in vocabulary"
"word 'às' not in vocabulary"
"word '2011' not in vocabulary"
"word '527' not in vocabulary"
"word '3' not in vocabulary"
"word '2016' not in vocabulary"
"word '2015' not in vocabulary"
"word '4' not in vocabulary"
"word '21' not in vocabulary"
"word '2017' not in vocabulary"
"word '5' not in vocabulary"
"word '0800' not in vocabulary"
"word '2014' not in vocabulary"
"word '15' not in vocabulary"
"word '–' not in vocabulary"
"word '11' not in vocabulary"
"word '18' not in vocabulary"
"word 'acessoainformacao' not in vocabulary"
"word '1ª' not in vocabulary"
"word '61' not in vocabulary"
"word '13' not in vocabulary"
"word '16' not in vocabulary"
"word '08' not in vocabulary"
"word '07' not in vocabulary"
"word '01' not in vocabulary"
"word '2013' not 

"word '465' not in vocabulary"
"word 'dcomp' not in vocabulary"
"word 'cgesp' not in vocabulary"
"word '3232' not in vocabulary"
"word '510' not in vocabulary"
"word '917' not in vocabulary"
"word '603' not in vocabulary"
"word 'nacentral' not in vocabulary"
"word '2566' not in vocabulary"
"word '2019' not in vocabulary"
"word 'portal2' not in vocabulary"
"word '534' not in vocabulary"
"word '1968' not in vocabulary"
"word '1978' not in vocabulary"
"word '1969' not in vocabulary"
"word 'finbra' not in vocabulary"
"word '208' not in vocabulary"
"word '118' not in vocabulary"
"word '4623' not in vocabulary"
"word '6333' not in vocabulary"
"word '6o' not in vocabulary"
"word '141' not in vocabulary"
"word 'wwwanp' not in vocabulary"
"word '742' not in vocabulary"
"word '303' not in vocabulary"
"word '20091' not in vocabulary"
"word 'seps' not in vocabulary"
"word '713' not in vocabulary"
"word '153' not in vocabulary"
"word '6ª' not in vocabulary"
"word '0207' not in vocabulary"
"word '00

"word '1977' not in vocabulary"
"word '2027' not in vocabulary"
"word '9039' not in vocabulary"
"word 'menumercado' not in vocabulary"
"word 'cadastrocorretores' not in vocabulary"
"word 'acessoconsultacorretores' not in vocabulary"
"word '30mb' not in vocabulary"
"word 'demonstracoes' not in vocabulary"
"word 'internacedidos' not in vocabulary"
"word '861' not in vocabulary"
"word '017' not in vocabulary"
"word 'autorizacoes' not in vocabulary"
"word '528' not in vocabulary"
"word 'nº25820' not in vocabulary"
"word '266' not in vocabulary"
"word '977' not in vocabulary"
"word '841' not in vocabulary"
"word '274' not in vocabulary"
"word 'sfg' not in vocabulary"
"word 'vantajosidade' not in vocabulary"
"word '2498' not in vocabulary"
"word '018' not in vocabulary"
"word 'scmed' not in vocabulary"
"word 'naobtenção' not in vocabulary"
"word 'cmar' not in vocabulary"
"word '060' not in vocabulary"
"word 'naloa' not in vocabulary"
"word 'nusel' not in vocabulary"
"word 'mdcr' not in vocab

"word '029' not in vocabulary"
"word '20289580' not in vocabulary"
"word '002260' not in vocabulary"
"word 'degevs' not in vocabulary"
"word '679' not in vocabulary"
"word 'naespecialidade' not in vocabulary"
"word '2222' not in vocabulary"
"word '273' not in vocabulary"
"word 'governoeletronico' not in vocabulary"
"word 'contratacoes' not in vocabulary"
"word 'dicor' not in vocabulary"
"word '2333' not in vocabulary"
"word 'petrolinape' not in vocabulary"
"word 'josianabinda' not in vocabulary"
"word '3583302' not in vocabulary"
"word '157' not in vocabulary"
"word 'narodovia' not in vocabulary"
"word '12ª' not in vocabulary"
"word 'naposição' not in vocabulary"
"word 'paseb' not in vocabulary"
"word 'webarquivos' not in vocabulary"
"word 'corif' not in vocabulary"
"word '780' not in vocabulary"
"word '70058' not in vocabulary"
"word 'nacelebração' not in vocabulary"
"word 'juliananeuenschwander' not in vocabulary"
"word 'magalgães' not in vocabulary"
"word '995' not in vocabulary"
"w

"word 'naaprovação' not in vocabulary"
"word 'cppad' not in vocabulary"
"word 'tcif' not in vocabulary"
"word '000577' not in vocabulary"
"word '23464' not in vocabulary"
"word 'páginainstitucional' not in vocabulary"
"word 'gvims' not in vocabulary"
"word '48580' not in vocabulary"
"word '013273' not in vocabulary"
"word '000365' not in vocabulary"
"word '000620' not in vocabulary"
"word '001258' not in vocabulary"
"word '23546' not in vocabulary"
"word 'pacielo' not in vocabulary"
"word '2681' not in vocabulary"
"word '799' not in vocabulary"
"word 'naoperação' not in vocabulary"
"word 'naproporção' not in vocabulary"
"word '522' not in vocabulary"
"word 'sucumbenciais' not in vocabulary"
"word 'frmaiempregados' not in vocabulary"
"word '000364' not in vocabulary"
"word 'prre' not in vocabulary"
"word '53500' not in vocabulary"
"word 'rcrc' not in vocabulary"
"word '7778' not in vocabulary"
"word 'indigenaatenciosamente' not in vocabulary"
"word 'naedição' not in vocabulary"
"word 'e

"word 'acessoainformação' not in vocabulary"
"word '000641' not in vocabulary"
"word '31327353' not in vocabulary"
"word 'canaparro' not in vocabulary"
"word 'orcamentaria' not in vocabulary"
"word 'crsfn' not in vocabulary"
"word 'superintendencias' not in vocabulary"
"word '000830' not in vocabulary"
"word 'atcta' not in vocabulary"
"word 'cgcap' not in vocabulary"
"word 'emti' not in vocabulary"
"word 'nafila' not in vocabulary"
"word '759' not in vocabulary"
"word 'sgpti' not in vocabulary"
"word '16756' not in vocabulary"
"word '1449821' not in vocabulary"
"word '84b3331c' not in vocabulary"
"word 'b462' not in vocabulary"
"word '4ff8' not in vocabulary"
"word '8ad1' not in vocabulary"
"word 'e2aaa8c1719d' not in vocabulary"
"word 'csbe' not in vocabulary"
"word '748' not in vocabulary"
"word '4650' not in vocabulary"
"word '1893' not in vocabulary"
"word '9145' not in vocabulary"
"word '1120' not in vocabulary"
"word 'institutolula' not in vocabulary"
"word 'vanarousseff' not in 

"word '09200000428201646' not in vocabulary"
"word 'ifsudestemg' not in vocabulary"
"word '002029' not in vocabulary"
"word 'seamp' not in vocabulary"
"word '23480015572201528' not in vocabulary"
"word 'cgpncd' not in vocabulary"
"word '0324772' not in vocabulary"
"word 'n°18' not in vocabulary"
"word '72550000150201673' not in vocabulary"
"word '022157' not in vocabulary"
"word '50650001373201778' not in vocabulary"
"word 'cocqg' not in vocabulary"
"word 'tarjou' not in vocabulary"
"word '23480006529201652' not in vocabulary"
"word 'naconcessão' not in vocabulary"
"word 'naalteração' not in vocabulary"
"word '181do' not in vocabulary"
"word '23480011262201534' not in vocabulary"
"word 'marinaem' not in vocabulary"
"word 'americanade' not in vocabulary"
"word '2575' not in vocabulary"
"word 'incido' not in vocabulary"
"word '04096431000154' not in vocabulary"
"word 'manoelcsg' not in vocabulary"
"word 'norking' not in vocabulary"
"word '993' not in vocabulary"
"word '09200000694201750'

"word 'zechlinski' not in vocabulary"
"word '000407' not in vocabulary"
"word '1728' not in vocabulary"
"word 'eneyas' not in vocabulary"
"word '00700000250201628' not in vocabulary"
"word 'ecompensacao' not in vocabulary"
"word 'raskopf' not in vocabulary"
"word 'schwaizer' not in vocabulary"
"word 'ambientebrasil' not in vocabulary"
"word '0200' not in vocabulary"
"word '610021' not in vocabulary"
"word '99901001216201591' not in vocabulary"
"word '“procedimento' not in vocabulary"
"word '22460' not in vocabulary"
"word '4500' not in vocabulary"
"word '51020' not in vocabulary"
"word '3198' not in vocabulary"
"word '1280' not in vocabulary"
"word '99905000127201597' not in vocabulary"
"word '1598' not in vocabulary"
"word 'naexposição' not in vocabulary"
"word '23480017389201648' not in vocabulary"
"word 'faufconcursos' not in vocabulary"
"word '5391' not in vocabulary"
"word '23480019579201608' not in vocabulary"
"word '1208' not in vocabulary"
"word '897' not in vocabulary"
"word '

"word 'guichevirtual' not in vocabulary"
"word 'naeventual' not in vocabulary"
"word 'capec' not in vocabulary"
"word 'ppgdh' not in vocabulary"
"word 'terlúcia' not in vocabulary"
"word 'ppgcj' not in vocabulary"
"word '001139' not in vocabulary"
"word 'deedu' not in vocabulary"
"word 'erisvaldosanto' not in vocabulary"
"word '3557' not in vocabulary"
"word '9400' not in vocabulary"
"word 'espelhogrupo' not in vocabulary"
"word '5256428053644095' not in vocabulary"
"word '16853007895201688' not in vocabulary"
"word '99903000602201715' not in vocabulary"
"word 'marisson' not in vocabulary"
"word '3747' not in vocabulary"
"word '7501' not in vocabulary"
"word '662' not in vocabulary"
"word 'tatianamingote' not in vocabulary"
"word 'dnn' not in vocabulary"
"word 'dnn12959' not in vocabulary"
"word 'nasof' not in vocabulary"
"word 'naseges' not in vocabulary"
"word 'nº158' not in vocabulary"
"word 'cgsh' not in vocabulary"
"word '001405' not in vocabulary"
"word '23480021704201631' not in

"word '6288' not in vocabulary"
"word '1804' not in vocabulary"
"word '1807' not in vocabulary"
"word '1809' not in vocabulary"
"word '000605' not in vocabulary"
"word '23480013477201590' not in vocabulary"
"word '000892' not in vocabulary"
"word '“ufabc”' not in vocabulary"
"word '000227' not in vocabulary"
"word 'joiciane' not in vocabulary"
"word '09200000664201581' not in vocabulary"
"word '23480019052201675' not in vocabulary"
"word 'na3ª' not in vocabulary"
"word 'dijlc' not in vocabulary"
"word 'prfn' not in vocabulary"
"word 'icene' not in vocabulary"
"word 'icte' not in vocabulary"
"word 'ielachs' not in vocabulary"
"word '010521' not in vocabulary"
"word 'boletimdesempenh' not in vocabulary"
"word 'clislenio' not in vocabulary"
"word 'lucianamartha' not in vocabulary"
"word '7156' not in vocabulary"
"word '7170' not in vocabulary"
"word '08850003140201614' not in vocabulary"
"word '701400' not in vocabulary"
"word 'luzivan' not in vocabulary"
"word '53850000999201537' not in 

"word 'tássila' not in vocabulary"
"word 'segundosolicitado' not in vocabulary"
"word 'naproppg' not in vocabulary"
"word '53850001696201531' not in vocabulary"
"word 'radiozum' not in vocabulary"
"word '2131' not in vocabulary"
"word '3561' not in vocabulary"
"word '1508' not in vocabulary"
"word 'internadisciplinando' not in vocabulary"
"word '00700000353201698' not in vocabulary"
"word '23480000400201631' not in vocabulary"
"word '1950' not in vocabulary"
"word 'coreg' not in vocabulary"
"word '999280003022017' not in vocabulary"
"word '60025' not in vocabulary"
"word '360770' not in vocabulary"
"word '58750000052201730' not in vocabulary"
"word '001238' not in vocabulary"
"word '003216' not in vocabulary"
"word '99923000522201632' not in vocabulary"
"word 'depan' not in vocabulary"
"word '001164' not in vocabulary"
"word 'falepr2' not in vocabulary"
"word 'suerlene' not in vocabulary"
"word '58750000086201724' not in vocabulary"
"word 'cbclubes' not in vocabulary"
"word '0139000033

"word '1ªrm' not in vocabulary"
"word '20080' not in vocabulary"
"word 'analucia' not in vocabulary"
"word 'diurnade' not in vocabulary"
"word 'cumulos' not in vocabulary"
"word 'dapex' not in vocabulary"
"word '23480017818201504' not in vocabulary"
"word '23205' not in vocabulary"
"word '8º”' not in vocabulary"
"word '005156' not in vocabulary"
"word 'canaranamt' not in vocabulary"
"word '00066' not in vocabulary"
"word '3478' not in vocabulary"
"word '2431' not in vocabulary"
"word '2346' not in vocabulary"
"word '1871' not in vocabulary"
"word 'gilcemir' not in vocabulary"
"word 'internani' not in vocabulary"
"word '002604' not in vocabulary"
"word '006806' not in vocabulary"
"word '000786' not in vocabulary"
"word '000971' not in vocabulary"
"word '22º' not in vocabulary"
"word 'processosseletivos' not in vocabulary"
"word '85244' not in vocabulary"
"word '730a236dcb' not in vocabulary"
"word '26420' not in vocabulary"
"word '158127' not in vocabulary"
"word 'naug' not in vocabular

"word '11h00min' not in vocabulary"
"word '15h00min' not in vocabulary"
"word 'diplad' not in vocabulary"
"word '6730' not in vocabulary"
"word '23480007225201793' not in vocabulary"
"word '000138' not in vocabulary"
"word 'gestaopessoas' not in vocabulary"
"word 'sitepages' not in vocabulary"
"word 'actnacional' not in vocabulary"
"word 'djpp' not in vocabulary"
"word 'djp' not in vocabulary"
"word '1540' not in vocabulary"
"word '2848' not in vocabulary"
"word 'medicinamúsica' not in vocabulary"
"word 'pmbqbm' not in vocabulary"
"word 'entednder' not in vocabulary"
"word 'pdido' not in vocabulary"
"word 'inexitoso' not in vocabulary"
"word '6139' not in vocabulary"
"word '6138' not in vocabulary"
"word 'gereh' not in vocabulary"
"word 'censoagro' not in vocabulary"
"word '001271' not in vocabulary"
"word 'concernentemente' not in vocabulary"
"word 'artigos15' not in vocabulary"
"word 'napetrobras' not in vocabulary"
"word '969377' not in vocabulary"
"word '969411' not in vocabulary"


"word 'portconjuntarfbpgfn17512014' not in vocabulary"
"word 'transbrasilianaencontra' not in vocabulary"
"word '9277' not in vocabulary"
"word 'giustinagerente' not in vocabulary"
"word 'internanaagência' not in vocabulary"
"word '\xa0acerca' not in vocabulary"
"word 'piscinanão' not in vocabulary"
"word 'piscinapiso' not in vocabulary"
"word 'piscinaesta' not in vocabulary"
"word 'couni' not in vocabulary"
"word '08850002556201526' not in vocabulary"
"word 'relatorioufv' not in vocabulary"
"word 'tefefones' not in vocabulary"
"word '1183' not in vocabulary"
"word 'acordialmente' not in vocabulary"
"word '60502001652201799' not in vocabulary"
"word '16853001438201761' not in vocabulary"
"word '00123000015' not in vocabulary"
"word 'aidf' not in vocabulary"
"word '09200000339201608' not in vocabulary"
"word 'nº119' not in vocabulary"
"word 'nº139' not in vocabulary"
"word '675' not in vocabulary"
"word 'tcis' not in vocabulary"
"word '011056' not in vocabulary"
"word '011057' not in vo

"word '3338' not in vocabulary"
"word '6327' not in vocabulary"
"word '23480012096201755' not in vocabulary"
"word 'cgpla' not in vocabulary"
"word '99903000278201727' not in vocabulary"
"word '23480014359201507' not in vocabulary"
"word 'neldson' not in vocabulary"
"word '004730' not in vocabulary"
"word '004731' not in vocabulary"
"word '004843' not in vocabulary"
"word '5356' not in vocabulary"
"word '004526' not in vocabulary"
"word '003694' not in vocabulary"
"word '005413' not in vocabulary"
"word 'napáginadas' not in vocabulary"
"word 'ainformacao' not in vocabulary"
"word 'anaorgaosconselhos' not in vocabulary"
"word '03000003113' not in vocabulary"
"word 'catarinaleilão' not in vocabulary"
"word 'sfi0026' not in vocabulary"
"word '0026' not in vocabulary"
"word '1ºleilão' not in vocabulary"
"word '001227' not in vocabulary"
"word 'formulariologin' not in vocabulary"
"word '99925000008201504' not in vocabulary"
"word 'rj001160' not in vocabulary"
"word '33m²' not in vocabulary"

"word 'intranetsistemas' not in vocabulary"
"word 'cad01' not in vocabulary"
"word '58750000271201638' not in vocabulary"
"word 'gtsa' not in vocabulary"
"word 'refabricadas' not in vocabulary"
"word '02680000012201895' not in vocabulary"
"word '000009' not in vocabulary"
"word 'catarinacom' not in vocabulary"
"word 's52' not in vocabulary"
"word 'm51' not in vocabulary"
"word 'intervert' not in vocabulary"
"word 'k40' not in vocabulary"
"word 'f31' not in vocabulary"
"word 'ciatica' not in vocabulary"
"word 'm79' not in vocabulary"
"word 's92' not in vocabulary"
"word 's42' not in vocabulary"
"word 'k80' not in vocabulary"
"word 'colelitiase' not in vocabulary"
"word 'psicot' not in vocabulary"
"word '0027669' not in vocabulary"
"word '0027672' not in vocabulary"
"word 'dambrósio' not in vocabulary"
"word 'completo”' not in vocabulary"
"word 'cpgs' not in vocabulary"
"word 'mês”' not in vocabulary"
"word 'nacoordenadoria' not in vocabulary"
"word '20h30min' not in vocabulary"
"word '0

"word 'naago' not in vocabulary"
"word '7448' not in vocabulary"
"word '1701629998' not in vocabulary"
"word '00700000520201609' not in vocabulary"
"word '0826' not in vocabulary"
"word 'hiscreweb' not in vocabulary"
"word '20ocupados' not in vocabulary"
"word '20vagos' not in vocabulary"
"word '476279adi' not in vocabulary"
"word '3599' not in vocabulary"
"word '25483' not in vocabulary"
"word '1863' not in vocabulary"
"word '3768' not in vocabulary"
"word '3104' not in vocabulary"
"word '3437' not in vocabulary"
"word '3964' not in vocabulary"
"word '3510' not in vocabulary"
"word '3330' not in vocabulary"
"word '3772' not in vocabulary"
"word '4048' not in vocabulary"
"word '570177' not in vocabulary"
"word '434059' not in vocabulary"
"word '2649' not in vocabulary"
"word '3388' not in vocabulary"
"word '4033' not in vocabulary"
"word '27516' not in vocabulary"
"word '4086' not in vocabulary"
"word '3999' not in vocabulary"
"word '543974' not in vocabulary"
"word '578543' not in voc

In [17]:
# Define model
def train_model(input_train, input_test, label_train, label_test,
                lstm_size=128, dropout=0.2, rec_dropout=0.2, lr=0.005, epochs=50, att_heads=4, max_length=128, 
                vocab_size=None, embed_size=None, emb_trainable=False, batch=128, early_stopping=5,
                save_dir="D:/resultados/checkpoins_solicitacao_keras_mh_att/", best_predefined_f1=0.390):

    # Time now
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Log
    log_dir = save_dir + now
    
    # Model Saver    
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    
    log_file = open(os.path.join(log_dir,"log.txt"), mode="a")
    
    # Save Params
    params = {
        'lstm_size': lstm_size,
        'dropout': dropout,
        'rec_dropout': rec_dropout,
        'lr': lr,
        'epochs': epochs,
        'att_heads': att_heads,
        'max_length': max_length,
        'vocab_size': vocab_size,
        'embed_size': embed_size,
        'emb_trainable': emb_trainable,
        'batch': batch,
        'early_stopping': early_stopping,
        'log_dir': log_dir,
        'best_predefined_f1': best_predefined_f1}
    
    # Saving Parameters
    with open(os.path.join(log_dir, 'params.txt'),'a') as f:
        f.write('\n\n' + ('#'*60))
        f.write('\nParameters:\n')
        f.write('now: ' + str(now))
        f.write('\n' + dumps(params) + '\n')
    
    # input
    inp = Input(shape=(max_length, ))

    # Embedding layer - https://keras.io/layers/embeddings/
    embedding_layer = Embedding(vocab_size,
                                embed_size,                                
                                weights=[embedding_matrix],
                                input_length=max_length,
                                trainable=emb_trainable,
                                name='Embedding')(inp)

    # Bidirectional Layer
    bilstm_layer = Bidirectional(LSTM(
                        units=lstm_size,
                        return_sequences=True,
                        dropout=dropout,
                        recurrent_dropout=rec_dropout,
                        name='LSTM'))(embedding_layer)    

    # MultiHead-Attention Layer
    #https://pypi.org/project/keras-multi-head/
    multiHead_att_layer = MultiHeadAttention(head_num=att_heads, name='Multi-Head-Attention')(bilstm_layer)

    dropout_intermed_layer = Dropout(0.5)(multiHead_att_layer)

    # # Flatten
    flatten_layer = Flatten(name='Flatten')(dropout_intermed_layer)

    dense_intermed_layer = Dense(128, activation='relu')(flatten_layer)
    dropout_intermed_2_layer = Dropout(dropout)(dense_intermed_layer)

    # # # # Dense Layer
    if binary:
        dense_layer = Dense(1, activation='sigmoid')(dense_intermed_layer)    
    else:
        dense_layer = Dense(num_labels, activation='softmax')(dense_intermed_layer)    
    
    model = Model(inputs=inp, outputs=dense_layer)
    # model.summary()
    
    # Compile
    model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy', metrics=['accuracy', precision_m, recall_m, f1_m])
    
    # callbacks
    es_callback = EarlyStopping(monitor='val_loss', patience=early_stopping, verbose=1, mode='min')
    
    # Fitting Model
    model.fit(input_train,
              label_train,
              epochs=epochs,
              batch_size=batch,
              validation_data=(input_test, label_test),
              verbose=0,
              callbacks=[es_callback])
    
    # PLOT LOSS
    plt.title('Loss')
    plt.plot(model.history.history['loss'], label='train')
    plt.plot(model.history.history['val_loss'], label='test')
    plt.legend()
    #plt.show();
    plt.savefig(os.path.join(log_dir,'Loss.png'))
    plt.close()
    
    # Classification
    y_pred = model.predict(input_test, batch_size=batch, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    
    if not binary:
        label_test = np.argmax(label_test.values, axis=1)
    
    # Metrics
    f1 = f1_score(label_test, y_pred_bool, average='weighted')
    print(f"Best Test F1-Score: {f1:.3f}")
    
    print("#"*60 + '\n', file=log_file)
    print(classification_report(label_test, y_pred_bool), file=log_file)
    print("#"*60+ '\n', file=log_file)
    
    # Flush log file
    log_file.flush()
    log_file.close()
    
    # Save final result
    with open(os.path.join(log_dir[:-16], 'output.txt'),'a') as f:
        f.write('\n\n')
        f.write(log_dir)
        f.write('\n')
        f.write(f"Best Test F1-Score: {f1:.3f}")
        
    # save model and architecture to single file
    if f1 > best_predefined_f1:
        model.save(os.path.join(log_dir, "model.h5"))
        print("Saved model to disk")

In [18]:
# Model Params
lstm_size_list = [256, 512]
dropout_list = [0.25, 0.5]
rec_dropout_list = [0.1, 0.25, 0.5]
lr_list = [1e-3, 5e-4, 1e-4, 5e-5, 5e-6]

all_params = [lstm_size_list] + [dropout_list] + [rec_dropout_list] + [lr_list]

for each in itertools.product(*all_params):    
    lstm_size, dropout, rec_dropout, lr = each
    
    # Params
    print('lstm_size: ' + str(lstm_size))
    print('\tdropout: ' + str(dropout))
    print('\trec_dropout: ' + str(rec_dropout))
    print('\tlr: ' + str(lr))
    
    # train
    train_model(input_train, input_test, label_train, label_test, lstm_size, dropout, rec_dropout, lr, epochs=50,
                att_heads=8, max_length=max_length, vocab_size=vocab_size, embed_size=embed_size, emb_trainable=False, batch=128,
                early_stopping=5, save_dir="D:/Outputs_Mestrado/resultados_Atendimento/checkpoins_resposta_keras_mh_att/",
                best_predefined_f1=0.42)

lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 0.001
Epoch 00009: early stopping
Best Test F1-Score: 0.357
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 0.0005
Epoch 00013: early stopping
Best Test F1-Score: 0.393
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 0.0001
Epoch 00010: early stopping
Best Test F1-Score: 0.371
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 5e-05
Epoch 00014: early stopping
Best Test F1-Score: 0.365
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.1
	lr: 5e-06
Epoch 00019: early stopping
Best Test F1-Score: 0.372
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
	lr: 0.001
Epoch 00016: early stopping
Best Test F1-Score: 0.389
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
	lr: 0.0005
Epoch 00013: early stopping
Best Test F1-Score: 0.411
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
	lr: 0.0001
Epoch 00014: early stopping
Best Test F1-Score: 0.392
lstm_size: 256
	dropout: 0.25
	rec_dropout: 0.25
	lr: 5e-05
Epoch 00015: early stopping
B

ResourceExhaustedError:  OOM when allocating tensor with shape[65536,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradients_11/dense_23/MatMul_grad/MatMul_1 (defined at C:\Users\arthu\Anaconda3\envs\keras_env\lib\site-packages\keras\backend\tensorflow_backend.py:3009) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_keras_scratch_graph_193698]

Function call stack:
keras_scratch_graph
