## Imports

In [1]:
## Imports
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
plt.style.use("ggplot")

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

from gensim.models import Word2Vec

from keras.models import Sequential
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Embedding, TimeDistributed, Bidirectional,GlobalMaxPooling1D


from tcdf_text_classification.iob_transformer import iob_transformer


from gensim.models import Word2Vec


from livelossplot.tf_keras import PlotLossesCallback


from seqeval.metrics import f1_score, classification_report, precision_score, recall_score

from plot_keras_history import plot_history

from tf2crf import CRF, ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from keras.optimizers import Adam

from sklearn.model_selection import KFold

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

[nltk_data] Downloading package punkt to /home/alilim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2022-11-13 22:57:15.822639: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-13 22:57:16.104699: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-13 22:57:16.182120: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-13 22:57:17.5409

## Dados

In [2]:
import pandas as pd

df = pd.read_csv('DODFCorpus_contratos_licitacoes_v2.csv')

df = df.drop(['Unnamed: 0','Unnamed: 0.1'], axis =1)

#regex pra resolver esse tipo de problema -> Processo:0...
df['texto'] = df['texto'].str.replace(r'([A-Za-z]:)[0-9]', r'\1 ', regex=True)

In [17]:
data = df.query("tipo_rel == 'REL_ANUL_REVOG_LICITACAO'")

In [18]:
data.tipo_rel.unique()

array(['REL_ANUL_REVOG_LICITACAO'], dtype=object)

In [5]:
max_length = 400

### IOB, transformação dos dados

In [6]:
def remove_wrong_tags(label_list):
  for label in label_list:
    for idx,w in enumerate(label):
      if w in ['B-11','B-12','B-50']:
        label[idx] = 'O'


def get_uniquev(acts,labels):

  #salvando todas as palavras do corpus sem repetição
  words = set()

  for act in acts:
    for word in act:
      words.add(word)
  #convertendo o set em uma lista
  words = list(words)

  words.append("ENDPAD")
  words.append("UNK")

  words_amt = len(words)

  tags = set()

  for label in labels:
    for tag in label:
      tags.add(tag)

  tags = list(tags)
  tags_amt = len(tags)

  return words, tags, words_amt, tags_amt


def get_dicts(words, tags):

  lab_enc = LabelEncoder()

  lab_enc.fit(words)
  words_i = dict(zip(lab_enc.classes_, lab_enc.transform(lab_enc.classes_)))

  i_words = {}

  for key in words_i:
    i_words[words_i[key]] = key

  lab_enc = LabelEncoder()

  lab_enc.fit(tags)
  tags_i = dict(zip(lab_enc.classes_, lab_enc.transform(lab_enc.classes_)))


  i_tags = {}

  for key in tags_i:
    i_tags[tags_i[key]] = key


  return words_i, i_words, tags_i, i_tags


def transform_data(x, y, tags_i, words_i):
  X,Y = [],[]

  for act in x:
    aux = []
    for word in act:
      aux.append(words_i[word])
    X.append(aux)

  for label in y:
    aux = []
    for word in label:
      aux.append(tags_i[word])
    Y.append(aux)

  return X,Y



def handle_data(dataf):

  data_info = {}

  iob = iob_transformer('id_ato', 'texto', 'tipo_ent', keep_punctuation=True, return_df=False)
  iob_data = iob_transformer('id_ato', 'texto', 'tipo_ent', keep_punctuation=True, return_df=True)

  acts, labels = iob.transform(data)

  remove_wrong_tags(labels)
  
  df_iob = iob_data.transform(dataf)


  df_iob.loc[df_iob.Word == 'B-11','Tag']='O'
  df_iob.loc[df_iob.Word == 'B-12','Tag']='O'
  df_iob.loc[df_iob.Word == 'B-50','Tag']='O'

  
  data_info['words'], data_info['tags'], data_info['words_amt'], data_info['tags_amt'] = get_uniquev(acts, labels)


  vocab = {}

  for i in range(0,len(acts)):
    for word in acts[i]:
      if word.lower() not in vocab:
        vocab[word.lower()] = 1
      else:
        vocab[word.lower()]+=1

  data_info['words_i'], data_info['i_words'], data_info['tags_i'], data_info['i_tags'] = get_dicts(data_info['words'],data_info['tags'])

  inputs,targets = transform_data(acts,labels, data_info['tags_i'], data_info['words_i'])

  inputs = pad_sequences(maxlen=max_length, sequences=inputs, padding="post", value=data_info['words_i']['ENDPAD'])
  targets = pad_sequences(maxlen=max_length, sequences=targets, padding="post", value=data_info['tags_i']["O"])


  return inputs, targets, data_info
  

In [7]:
def convert_values(index_array,y_test, i_tags):
  pred_tags = []
  real_tags = []

  for act in index_array:
    act_tags = []
    for w in act:
      act_tags.append(i_tags[w])
    pred_tags.append(act_tags)

  for ato in y_test:
    tags_ato = []
    for palavra in ato:
      tags_ato.append(i_tags[palavra])
    real_tags.append(tags_ato)

  return real_tags, pred_tags

## Funções auxiliares

In [8]:
## Funções auxiliares
def calc_f1(data_info):    

    def get_f1(y_true, y_pred):
        y_pred = np.argmax(y_pred, axis=-1)
        real_tags,pred_tags=convert_values(y_pred,y_true.numpy(),data_info['i_tags'])

        f1 = f1_score(real_tags,pred_tags)

        return f1

    return get_f1

def plotting(h,name):
  plt.plot(h.history['get_f1'])
  plt.plot(h.history['val_get_f1'])
  plt.title('model f1-scores')
  plt.ylabel('f1-score')
  plt.xlabel('epoch')
  plt.legend(['training set','validation set'], loc='upper left')

  plt.savefig('grafs/'+ name + '.png', bbox_inches='tight')

## Kfold

In [8]:
kfold = KFold(n_splits=5, shuffle=True)

## Word2vec-LSTM

### Word2vec

In [9]:
def word2vec(data_info):

    df2 =  pd.read_csv('dodf_atos_pessoal_final_version.csv')


    l = [df2["texto_rel"]]
    headers = ["texto"]
    df_emb = pd.concat(l, axis=1, keys=headers)

    df_emb.drop_duplicates(subset=['texto'],inplace=True)

    t_aux = list(df_emb["texto"])
    text = []

    for sent in t_aux:
        ap = sent.lower()
        ap = word_tokenize(ap)
        text.append(ap)

    
    model_emb = Word2Vec(min_count=1, window=5)
    model_emb.build_vocab(text)  
    model_emb.train(text, total_examples=model_emb.corpus_count, epochs=model_emb.epochs)

    word_vectors = model_emb.wv
    emb_dim = len(word_vectors[0])

    emb_mtx = np.zeros((data_info['words_amt'], emb_dim))


    for word, i in data_info['words_i'].items():
        if word in word_vectors:
            emb_vec = word_vectors[word]
            emb_mtx[i] = emb_vec
        else:
            emb_mtx[i] = np.random.normal(0,1,emb_dim)

    
    import keras
    embedding_layer = Embedding(data_info['words_amt'],
                            emb_dim,
                            embeddings_initializer=keras.initializers.Constant(emb_mtx),
                            input_length=max_length,
                            trainable=False)


    return embedding_layer

### LSTM

In [10]:
def lstm(inputs, targets, data_info):

    embedding_layer = word2vec(data_info)

    acc = []
    loss = []
    f1 = []
    reports = []

    fold_no = 1


    for train, test in kfold.split(inputs, targets):

        #Modelo
        w2v_lstm  = Sequential()
        w2v_lstm.add(embedding_layer)
        w2v_lstm.add(LSTM(100, return_sequences=True))
        w2v_lstm.add(Dropout(0.5))
        w2v_lstm.add(Dense(data_info['tags_amt'], activation="softmax"))
        w2v_lstm.summary()

        adam = Adam(learning_rate=0.0095)

        w2v_lstm.compile(optimizer=adam,loss="sparse_categorical_crossentropy",metrics=["accuracy"])

        early_stopping = EarlyStopping(monitor='accuracy', min_delta=0, patience=5, verbose=0, mode='max', baseline=None, restore_best_weights=False)
        callbacks = [early_stopping]


        history = w2v_lstm.fit(inputs[train],targets[train],batch_size=12, epochs=25, callbacks=callbacks)


        scores = w2v_lstm.evaluate(inputs[test], targets[test], verbose=0)
        print(f'Score for fold {fold_no}: {w2v_lstm.metrics_names[0]} of {scores[0]}; {w2v_lstm.metrics_names[1]} of {scores[1]*100}%')
        acc.append(scores[1] * 100)
        loss.append(scores[0])


        predictions = w2v_lstm.predict(inputs[test], verbose=0)
        predictions = np.argmax(predictions, axis=-1)
        real_tags, pred_tags = convert_values(predictions,targets[test], data_info['i_tags'])
        f1.append(f1_score(real_tags, pred_tags))
        
        reports.append(classification_report(real_tags,pred_tags))

        model_name = 'models/lstm_f' + str(fold_no) + '.h5'

        w2v_lstm.save(model_name)

        fold_no = fold_no + 1

    return acc, loss, f1, reports

## CNN-LSTM

In [19]:
def cnnlstm(inputs,targets,data_info):    
    
    acc = []
    loss = []
    f1 = []
    reports = []

    fold_no = 1

    for train, test in kfold.split(inputs, targets):

        #Modelo
        cnn_lstm = Sequential()
        cnn_lstm.add(Embedding(input_dim=data_info['words_amt']+1, output_dim=50, input_length=max_length))
        cnn_lstm.add(Conv1D(filters=data_info['tags_amt'], kernel_size=3, padding='same', activation='relu'))
        cnn_lstm.add(LSTM(100, return_sequences=True))
        cnn_lstm.add(Dropout(0.5))
        cnn_lstm.add(Dense(data_info['tags_amt'], activation='sigmoid'))

        adam = Adam(learning_rate=0.009)

        cnn_lstm.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])

        early_stopping = EarlyStopping(monitor='accuracy', min_delta=0, patience=5, verbose=0, mode='max', baseline=None, restore_best_weights=False)
        callbacks = [early_stopping]

        history = cnn_lstm.fit(inputs[train],targets[train],batch_size=40, epochs=20, callbacks=callbacks)


        scores = cnn_lstm.evaluate(inputs[test], targets[test], verbose=0)
        print(f'Score for fold {fold_no}: {cnn_lstm.metrics_names[0]} of {scores[0]}; {cnn_lstm.metrics_names[1]} of {scores[1]*100}%')
        acc.append(scores[1] * 100)
        loss.append(scores[0])


        predictions = cnn_lstm.predict(inputs[test], verbose=0)
        predictions = np.argmax(predictions, axis=-1)
        real_tags, pred_tags = convert_values(predictions,targets[test],data_info['i_tags'])
        f1.append(f1_score(real_tags, pred_tags))
        
        reports.append(classification_report(real_tags,pred_tags))

        model_name = 'models/cnnlstm_f' + str(fold_no) + '.h5'

        cnn_lstm.save(model_name)

        fold_no = fold_no + 1

    return acc, loss, f1, reports

## CNN-BiLSTM

In [20]:
def cnnbilstm(inputs,targets,data_info):    
    
    acc = []
    loss = []
    f1 = []
    reports = []

    fold_no = 1

    for train, test in kfold.split(inputs, targets):

        #Modelo
        cnn_bilstm = Sequential()
        cnn_bilstm.add(Embedding(input_dim=data_info['words_amt']+1, output_dim=50, input_length=max_length))
        cnn_bilstm.add(Conv1D(filters=data_info['tags_amt'], kernel_size=3, padding='same', activation='relu'))
        cnn_bilstm.add(Bidirectional(LSTM(100, return_sequences=True)))
        cnn_bilstm.add(Dropout(0.5))
        cnn_bilstm.add(Dense(data_info['tags_amt'], activation='sigmoid'))

        adam = Adam(learning_rate=0.009)

        cnn_bilstm.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])

        early_stopping = EarlyStopping(monitor='accuracy', min_delta=0, patience=5, verbose=0, mode='max', baseline=None, restore_best_weights=False)
        callbacks = [early_stopping]

        history = cnn_bilstm.fit(inputs[train],targets[train],batch_size=30, epochs=20, callbacks=callbacks)


        scores = cnn_bilstm.evaluate(inputs[test], targets[test], verbose=0)
        print(f'Score for fold {fold_no}: {cnn_bilstm.metrics_names[0]} of {scores[0]}; {cnn_bilstm.metrics_names[1]} of {scores[1]*100}%')
        acc.append(scores[1] * 100)
        loss.append(scores[0])


        predictions = cnn_bilstm.predict(inputs[test], verbose=0)
        predictions = np.argmax(predictions, axis=-1)
        real_tags, pred_tags = convert_values(predictions,targets[test],data_info['i_tags'])
        f1.append(f1_score(real_tags, pred_tags))
        
        reports.append(classification_report(real_tags,pred_tags))


        model_name = 'models/cnnbilstm_f' + str(fold_no) + '.h5'


        cnn_bilstm.save(model_name)

        fold_no = fold_no + 1

    return acc, loss, f1, reports

## Rodando o experimento para cada tipo de ato

In [11]:
x, y, d = handle_data(data)

In [19]:
lstm_acc, lstm_loss, lstm_f1, lstm_reports = lstm(x,y,d)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 400, 100)          2423800   
                                                                 
 lstm_5 (LSTM)               (None, 400, 100)          80400     
                                                                 
 dropout_5 (Dropout)         (None, 400, 100)          0         
                                                                 
 dense_5 (Dense)             (None, 400, 35)           3535      
                                                                 
Total params: 2,507,735
Trainable params: 83,935
Non-trainable params: 2,423,800
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25


In [20]:
lstm_f1

[0.7355920363255328,
 0.7483482091109308,
 0.7875824425744826,
 0.7920129270544782,
 0.8211873444720938]

In [23]:
np.mean(lstm_f1)

0.7769445919075035

In [24]:
np.std(lstm_f1)

0.031065822724061762

In [19]:
cnnlstm_acc, cnnlstm_loss, cnnlstm_f1, cnnlstm_reports = cnnlstm(x,y,d)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 1: loss of 0.22698235511779785; accuracy of 95.44146060943604%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 2: loss of 0.21112366020679474; accuracy of 95.62520384788513%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 3: loss of 0.2217930257320404; accuracy of 95.81219553947449%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/

In [26]:
np.mean(cnnlstm_f1)

NameError: name 'cnnlstm_f1' is not defined

In [28]:
np.std(cnnlstm_f1)

0.01345543566426264

In [20]:
cnnlstm_f1

[0.5937359777438751,
 0.566768039811999,
 0.5619954648526078,
 0.5679884120948759,
 0.5916870415647922]

In [21]:
cnnbilstm_acc, cnnbilstm_loss, cnnbilstm_f1, cnnbilstm_reports = cnnbilstm(x,y,d)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 1: loss of 0.20901209115982056; accuracy of 95.92927098274231%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 2: loss of 0.13833437860012054; accuracy of 97.00610041618347%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Score for fold 3: loss of 0.19301056861877441; accuracy of 96.32073044776917%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7

In [22]:
cnnbilstm_f1

[0.7226158968667293,
 0.7424922333448394,
 0.7192607170360229,
 0.6994845360824743,
 0.7120596205962059]

In [26]:
np.mean(cnnbilstm_f1)

0.7191826007852544

In [29]:
np.std(cnnbilstm_f1)

0.014098816929111881