In [1]:
# Mount Google Drive into the Colab filesystem; the data files, the fine-tuned
# model directory and the output pickles used by the cells below are all
# addressed through paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Split number: it selects which fine-tuned model directory is loaded and is
# appended to the file names of the embedding pickles written by this notebook.
n_split = 4

In [3]:
import pickle

In [4]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-processed train/test inputs and the word -> index vocabulary.
X_train_balanced = _load_pickle('/content/drive/My Drive/Data Master/X_train_final')
X_test = _load_pickle('/content/drive/My Drive/Data Master/X_test_final')
word_index = _load_pickle('/content/drive/My Drive/Data Master/word_index_final')

In [5]:
# Reverse lookup (index -> word), built by flipping every (word, index) pair.
inv_word_index = dict((index, word) for word, index in word_index.items())

In [6]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 8.8MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 49.6MB/s 
Collecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 54.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |███

In [7]:
import pandas as pd
import numpy as np
import itertools

import torch
import torch.nn as nn
import transformers
import torch.utils.data as tdata
import torch.optim as optim

import tqdm

In [8]:
import transformers

In [9]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [10]:
# Directory of the fine-tuned BERT model for split `n_split`; the tokenizer,
# config and weights are all loaded from it with `from_pretrained`.
fine_tuned_model_dir = '/content/drive/My Drive/Colab Notebooks/Torch/_FineTuningModels/SentenceClassification/model_save/bert_model_sentence_classification_tail_tuned_split_' + str(n_split) + '/'

In [11]:
# Tokenizer saved alongside the fine-tuned model.
bert_tokenizer = transformers.BertTokenizer.from_pretrained(fine_tuned_model_dir)

# Ask the model to also return its hidden states; the embedding cells read
# them as the third model output.
bert_config = transformers.BertConfig.from_pretrained(
    fine_tuned_model_dir,
    output_hidden_states=True,
)

# Load the encoder weights, then move the module to the selected device.
bert_model = transformers.BertModel.from_pretrained(fine_tuned_model_dir, config=bert_config)
bert_model = bert_model.to(device)

In [12]:
def spilt_text(sent, sent_size = 350, overlapping_size = 100):
  """Split a sequence into overlapping windows.

  Window number i starts at position i * sent_size and holds at most
  sent_size + overlapping_size items, so consecutive windows share up to
  overlapping_size items.

  Args:
    sent: Sliceable sequence (e.g. a list of words) to split.
    sent_size: Distance between the starts of consecutive windows; must be
      positive.
    overlapping_size: Number of extra items each window takes beyond
      sent_size.

  Returns:
    A list of slices of `sent`. An empty `sent` yields a single empty chunk.
    Otherwise every chunk is non-empty: no empty trailing chunk is emitted
    when len(sent) is an exact multiple of sent_size.
  """
  window = sent_size + overlapping_size
  # Enumerate only start offsets that lie inside the sequence. The previous
  # `len(sent) // sent_size + 1` count produced one extra, empty chunk whenever
  # the length was an exact multiple of sent_size.
  res = [sent[start : start + window] for start in range(0, len(sent), sent_size)]
  # Keep the historical result for empty input: one (empty) chunk.
  return res if res else [sent[0 : window]]

In [13]:
def _chunk_documents(encoded_docs):
  """Decode each document to words and split it into overlapping chunks.

  Returns a list of (position, chunks) pairs, where position is the
  document's position in `encoded_docs`.
  """
  chunked = []
  for doc_pos, _ in enumerate(encoded_docs):
    # Map every stored index back to its word before chunking.
    words = [inv_word_index[ix] for ix in encoded_docs[doc_pos]]
    chunked.append((doc_pos, spilt_text(words)))
  return chunked


data_train_balanced_chunked = _chunk_documents(X_train_balanced)
data_test_chunked = _chunk_documents(X_test)

In [14]:
def _join_chunks(chunked_docs):
  """Turn every chunk (a list of words) into one space-separated string."""
  joined = []
  for doc_id, word_chunks in chunked_docs:
    joined.append((doc_id, [" ".join(words) for words in word_chunks]))
  return joined


data_train_balanced_chunked_splitted = _join_chunks(data_train_balanced_chunked)
data_test_chunked_splitted = _join_chunks(data_test_chunked)

In [15]:
def _chunks_to_frame(docs_with_chunks):
  """Flatten (doc, chunks) pairs into a frame with one row per chunk."""
  rows = [[doc, chunk] for doc, chunks in docs_with_chunks for chunk in chunks]
  return pd.DataFrame(rows, columns=['doc', 'chunk'])


df_train_balanced_chunked_splitted = _chunks_to_frame(data_train_balanced_chunked_splitted)

df_test_chunked_splitted = _chunks_to_frame(data_test_chunked_splitted)

In [16]:
# Pull the document ids and the chunk texts out of each frame as arrays.
docs_train, sentences_train = (
    df_train_balanced_chunked_splitted[column].values for column in ('doc', 'chunk')
)

docs_test, sentences_test = (
    df_test_chunked_splitted[column].values for column in ('doc', 'chunk')
)

In [17]:
# Tokenize all of the training chunks and map the tokens to their word IDs.
input_ids_train = []
attention_masks_train = []

# For every chunk...
for sent in tqdm.notebook.tqdm(sentences_train):
    # `encode_plus` will:
    #   (1) Tokenize the chunk.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the chunk to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = bert_tokenizer.encode_plus(
                        sent,                           # Chunk to encode.
                        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'.
                        max_length = 512,               # Pad & truncate all chunks.
                        padding = 'max_length',         # Replaces the deprecated `pad_to_max_length=True`.
                        truncation = 'longest_first',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',          # Return pytorch tensors.
                   )

    # Add the encoded chunk to the list.
    input_ids_train.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_train.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_train = torch.cat(input_ids_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)

# Print one example chunk (index 10, or the last chunk when there are fewer
# than 11), first as text and then as a list of IDs.
sample_ix = min(10, len(sentences_train) - 1)
print('Original: ', sentences_train[sample_ix])
print('Token IDs:', input_ids_train[sample_ix])

HBox(children=(FloatProgress(value=0.0, max=33528.0), HTML(value='')))


Original:  foram transferidas para gss em 07 de fevereiro de 2014 a operação não foi sujeita à aprovação do cade em razão de não terem sido atingidos os critérios de faturamento previstos na lei no 12 529 2011 ato de concentração nº 08700 001301 2016 41 envolvendo serrana águas ltda serrana e aegea saneamento e participações s a a operação diz respeito à aquisição pela aegea de participação de 49 do capital social da empresa águas de penha saneamento spe ltda águas de penha anteriormente detida pela serrana operação aprovada sem restrições em 11 03 2016 11 8 informe todas as atividades econômicas desempenhadas pelas partes diretamente envolvidas na operação no brasil indicando o faturamento bruto obtido com cada uma das atividades no ano fiscal anterior ao da apresentação da notificação classifique as segundo a cnae 2 0 a 7 dígitos ou versão mais atual segue abaixo a lista das atividades econômicas desempenhadas pela aegea no brasil em 2015 com os faturamentos brutos obtidos com c

In [18]:
# Tokenize all of the test chunks and map the tokens to their word IDs.
input_ids_test = []
attention_masks_test = []

# For every chunk...
for sent in tqdm.notebook.tqdm(sentences_test):
    # `encode_plus` will:
    #   (1) Tokenize the chunk.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the chunk to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = bert_tokenizer.encode_plus(
                        sent,                           # Chunk to encode.
                        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'.
                        max_length = 512,               # Pad & truncate all chunks.
                        padding = 'max_length',         # Replaces the deprecated `pad_to_max_length=True`.
                        truncation = 'longest_first',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',          # Return pytorch tensors.
                   )

    # Add the encoded chunk to the list.
    input_ids_test.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_test.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_test = torch.cat(input_ids_test, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)

# Print one example chunk (index 10, or the last chunk when there are fewer
# than 11), first as text and then as a list of IDs.
sample_ix = min(10, len(sentences_test) - 1)
print('Original: ', sentences_test[sample_ix])
print('Token IDs:', input_ids_test[sample_ix])

HBox(children=(FloatProgress(value=0.0, max=14130.0), HTML(value='')))


Original:  0 a 7 dígitos ou versão mais atual que sejam horizontal ou verticalmente relacionadas às atividades objeto da operação nas quais pelo menos um dos integrantes do grupo detenha participação igual ou superior a 10 no capital social ou votante na resposta apresente também o organograma da estrutura societária das em oresas ue se em uadram nesse critério grupo micro focus em 2015 as seguintes empresas do grupo micro focus obtiveram receitas no brasil confidencial grupo hpe as empresas registradas no brasil do grupo hpe são as seguintes confidencial 11 11 no que diz respeito às empresas referidas nas respostas aos itens 11 5 e 11 10 forneça uma lista dos membros dos seus órgãos de gestão que sejam igualmente membros dos órgãos de gestão ou de fiscalização de quaisquer outras empresas atuantes nas mesmas atividades econômicas conforme cnae 2 0 a 7 dígitos indicando tais em resas 10 106 versão pública de acordo com as melhores informações disponíveis para as requerentes não existe

In [19]:
doc_chunk_dict_train = {}

# Group the encoded chunks by the document they came from, keeping the
# chunks of each document in their original order.
for doc_id, chunk_ids in zip(docs_train, input_ids_train):
  doc_chunk_dict_train.setdefault(doc_id, []).append(chunk_ids)

In [20]:
doc_chunk_dict_test = {}

# Group the encoded chunks by the document they came from, keeping the
# chunks of each document in their original order.
for doc_id, chunk_ids in zip(docs_test, input_ids_test):
  doc_chunk_dict_test.setdefault(doc_id, []).append(chunk_ids)

In [21]:
# Drop the names of the intermediate objects built above so their memory can
# be reclaimed before the embedding pass.

# Vocabulary lookups.
del word_index, inv_word_index

# Chunked documents (word lists, then joined strings).
del data_train_balanced_chunked, data_test_chunked
del data_train_balanced_chunked_splitted, data_test_chunked_splitted

# Per-chunk data frames.
del df_train_balanced_chunked_splitted, df_test_chunked_splitted

# Raw encoded inputs.
del X_train_balanced, X_test

# Document-id and chunk-text arrays.
del docs_train, sentences_train
del docs_test, sentences_test

In [22]:
import gc

In [23]:
# Run a full garbage collection now that the intermediates have been deleted;
# the displayed value is the number of unreachable objects the collector found.
gc.collect()

0

In [24]:
# Per-document embeddings, accumulated over all chunks of a document:
#   train_last_layer_embeddings: sum over tokens and chunks of the first model output.
#   train_all_layers_embeddings: the same sum for every entry of the third output.
#   train_cls_token_embeddings:  list with the second model output of each chunk.
train_last_layer_embeddings = {}
train_all_layers_embeddings = {}
train_cls_token_embeddings = {}

# doc_chunk_dict_train stores only the input IDs, so the padding positions are
# recovered from the IDs themselves.
pad_token_id = bert_tokenizer.pad_token_id

bert_model.eval()
with torch.no_grad():
  for doc, sents in tqdm.notebook.tqdm(doc_chunk_dict_train.items()):
    for x in sents:
      # `x` is already a tensor, so add the batch dimension directly; wrapping it
      # in torch.tensor() made a needless copy and raised a UserWarning.
      input_ids = x.unsqueeze(0).to(device)

      # Without an attention mask the model attends to the [PAD] tokens added
      # by the tokenizer; mask them out.
      attention_mask = (input_ids != pad_token_id).long()
      r = bert_model(input_ids, attention_mask=attention_mask)

      # Also keep [PAD] positions out of the token sums.
      # NOTE(review): this changes the stored values relative to pickles
      # produced by earlier runs of this cell; shapes are unchanged.
      keep = attention_mask.unsqueeze(-1).to(r[0].dtype)
      last_layer_sum = torch.sum(r[0] * keep, dim=1).cpu()
      all_layers_sum = torch.sum(torch.stack(r[2]) * keep, dim=2).cpu()
      pooled_output = r[1].cpu()

      if doc not in train_last_layer_embeddings:
        train_last_layer_embeddings[doc] = last_layer_sum
        train_cls_token_embeddings[doc] = [pooled_output]
        train_all_layers_embeddings[doc] = all_layers_sum
      else:
        train_last_layer_embeddings[doc] += last_layer_sum
        train_cls_token_embeddings[doc].append(pooled_output)
        train_all_layers_embeddings[doc] += all_layers_sum

HBox(children=(FloatProgress(value=0.0, max=888.0), HTML(value='')))

  if __name__ == '__main__':





In [25]:
def _save_pickle(obj, path):
    """Serialize `obj` with pickle into the file at `path`."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# One output file per embedding type, tagged with the split number.
_save_pickle(train_cls_token_embeddings,
             '/content/drive/My Drive/Data Master/train_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl')

_save_pickle(train_last_layer_embeddings,
             '/content/drive/My Drive/Data Master/train_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl')

_save_pickle(train_all_layers_embeddings,
             '/content/drive/My Drive/Data Master/train_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl')



In [26]:
# Per-document embeddings, accumulated over all chunks of a document:
#   test_last_layer_embeddings: sum over tokens and chunks of the first model output.
#   test_all_layers_embeddings: the same sum for every entry of the third output.
#   test_cls_token_embeddings:  list with the second model output of each chunk.
test_last_layer_embeddings = {}
test_all_layers_embeddings = {}
test_cls_token_embeddings = {}

# doc_chunk_dict_test stores only the input IDs, so the padding positions are
# recovered from the IDs themselves.
pad_token_id = bert_tokenizer.pad_token_id

bert_model.eval()
with torch.no_grad():
  for doc, sents in tqdm.notebook.tqdm(doc_chunk_dict_test.items()):
    for x in sents:
      # `x` is already a tensor, so add the batch dimension directly; wrapping it
      # in torch.tensor() made a needless copy and raised a UserWarning.
      input_ids = x.unsqueeze(0).to(device)

      # Without an attention mask the model attends to the [PAD] tokens added
      # by the tokenizer; mask them out.
      attention_mask = (input_ids != pad_token_id).long()
      r = bert_model(input_ids, attention_mask=attention_mask)

      # Also keep [PAD] positions out of the token sums.
      # NOTE(review): this changes the stored values relative to pickles
      # produced by earlier runs of this cell; shapes are unchanged.
      keep = attention_mask.unsqueeze(-1).to(r[0].dtype)
      last_layer_sum = torch.sum(r[0] * keep, dim=1).cpu()
      all_layers_sum = torch.sum(torch.stack(r[2]) * keep, dim=2).cpu()
      pooled_output = r[1].cpu()

      if doc not in test_last_layer_embeddings:
        test_last_layer_embeddings[doc] = last_layer_sum
        test_cls_token_embeddings[doc] = [pooled_output]
        test_all_layers_embeddings[doc] = all_layers_sum
      else:
        test_last_layer_embeddings[doc] += last_layer_sum
        test_cls_token_embeddings[doc].append(pooled_output)
        test_all_layers_embeddings[doc] += all_layers_sum

HBox(children=(FloatProgress(value=0.0, max=381.0), HTML(value='')))

  if __name__ == '__main__':





In [27]:
def _save_pickle(obj, path):
    """Serialize `obj` with pickle into the file at `path`."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# One output file per embedding type, tagged with the split number.
_save_pickle(test_cls_token_embeddings,
             '/content/drive/My Drive/Data Master/test_cls_token_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl')

_save_pickle(test_last_layer_embeddings,
             '/content/drive/My Drive/Data Master/test_last_layer_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl')

_save_pickle(test_all_layers_embeddings,
             '/content/drive/My Drive/Data Master/test_all_layers_embeddings_tail_fine_tuned_split' + str(n_split) + '.pkl')



In [28]:
# TODO: test without fine-tuning