In [1]:
import os
import time
import datetime
import argparse
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

import parallel

Using TensorFlow backend.


In [2]:
def gather_sentence_outputs(outputs):
    """
    'outputs' is a list of 'output' of each GPU. As a reminder, each 'output' is a 3-tuple where:
        - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
        - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
        - output[2] are all hidden_states, i.e. a 13-tuple of torch tensors of shape (batch_size, sequence_length, hidden_size): 12 encoders-outputs + initial embedding outputs.
    """
    # Extract the last_hidden_state in each GPU output ('gathered' is a list of nb_gpu x torch tensors)
    gathered = [output[0] for output in outputs]
    
    # Concatenate the samples for that batch.
    gathered = torch.cat(gathered, dim=0)
    
    # Detach output and return.
    return gathered


In [3]:
sentences = ['Sentence A',
             'Sentence B',
             'Sentence C',
             'Sentence D',
             'Sentence E',
             'Sentence F',
             'Sentence G',
             'Sentence H',
             'Sentence I',
             'Sentence J',
             'Sentence K',
             'Sentence L',
             'Sentence M',
             'Sentence N',
             'Sentence O',
             'Sentence P',
             'Sentence Q',
             'Sentence R',
             'Sentence S',
             'Sentence T',
             'Sentence V',
             'Sentence W',
             'Sentence X',
             'Sentence Y',
             'Sentence AA',
             'Sentence AB',
             'Sentence AC',
             'Sentence AD',
             'Sentence AE',
             'Sentence AF',
             'Sentence AG',
             'Sentence AH'
]

In [4]:
model_name_or_path = '/raid/antoloui/Master-thesis/Code/_models/netbert/checkpoint-1027000/'
cache_dir = '/raid/antoloui/Master-thesis/Code/_cache'
batch_size = 16

In [6]:
# Create dataframe for storing embeddings.
cols = ['feat'+str(i+1) for i in range(768)]
para_df = pd.DataFrame(columns=cols)
para_df['Sentence'] = None

print("   Loading pretrained model/tokenizer...")
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True, cache_dir=cache_dir) # Will output all hidden_states.

print("   Setting up CUDA & GPU...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if n_gpu > 1:
    parallel_model = parallel.DataParallelModel(model)
parallel_model.to(device)

print("   Encoding sentences...")
all_embeddings = []
iterator = range(0, len(sentences), batch_size)
for batch_idx in tqdm(iterator, desc="Batches"):

    # Get the batch.
    batch_start = batch_idx
    batch_end = min(batch_start + batch_size, len(sentences))
    batch_sentences = sentences[batch_start:batch_end]

    # Tokenize each sentence of the batch.
    tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_sentences]

    # Pad/Truncate sentences to max_len or 512.
    lengths = [len(i) for i in tokenized]
    max_len = max(lengths) if max(lengths) <= 512 else 512
    padded = pad_sequences(tokenized, maxlen=max_len, dtype="long", 
                      value=0, truncating="post", padding="post")

    # Create attention masks.
    attention_mask = np.where(padded != 0, 1, 0)  #returns ndarray which is 1 if padded != 0 is True and 0 if False.

    # Convert inputs to torch tensors.
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    # Push inputs to GPUs.
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Encode batch.
    parallel_model.eval()
    with torch.no_grad():
        # output is a 3-tuple where:
        #  - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
        #  - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
        #  - output[2] are all hidden_states, i.e. a 13-tuple of torch tensors of shape (batch_size, sequence_length, hidden_size): 12 encoders-outputs + initial embedding outputs.
        outputs = parallel_model(input_ids, attention_mask=attention_mask)
    
    # Gather outputs from the different GPUs.
    last_hidden_states = gather_sentence_outputs(outputs)
    
    # For each sentence, take the embeddings of its word from the last layer and represent that sentence by their average.
    sentence_embeddings = [torch.mean(embeddings, dim=0).detach().cpu().numpy() for embeddings in last_hidden_states]
    all_embeddings.extend(sentence_embeddings)

# Create dataframe for storing embeddings.
all_embeddings = np.array(all_embeddings)
cols = ['feat'+str(i+1) for i in range(all_embeddings.shape[1])]
df = pd.DataFrame(data=all_embeddings[:,:], columns=cols)
df['Sentence'] = sentences
    
df

   Loading pretrained model/tokenizer...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

   Setting up CUDA & GPU...
   Encoding sentences...


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.84it/s]


Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat760,feat761,feat762,feat763,feat764,feat765,feat766,feat767,feat768,Sentence
0,0.389149,-0.467823,0.077645,0.141954,0.224419,0.139879,-0.076673,-0.060516,0.062125,0.022871,...,-0.646416,0.227659,-0.490203,0.2459,-0.254128,-0.329543,-0.198598,0.068783,-0.538478,Sentence A
1,0.60545,-0.416389,0.035739,0.161286,0.079505,0.15349,-0.089173,-0.087885,0.130674,0.088896,...,-0.69629,0.22214,-0.246897,0.239287,-0.223381,-0.527204,-0.495953,-0.030531,-0.367584,Sentence B
2,0.495418,-0.219121,0.252079,0.078404,0.184926,0.197358,-0.071906,-0.12263,0.19881,0.001121,...,-0.275513,0.32725,-0.280197,0.047378,-0.292781,-0.559186,-0.083412,0.075407,-0.064976,Sentence C
3,0.509843,-0.313011,0.216288,-0.072503,0.211217,0.248954,-0.00374,-0.154323,0.109088,-0.049836,...,-0.351768,0.257652,-0.344908,0.03457,-0.268906,-0.442429,0.050891,0.123586,0.056281,Sentence D
4,0.498513,-0.35673,0.376235,0.077485,0.127476,0.130756,0.080312,-0.396394,0.41066,-0.138164,...,-0.473753,0.276787,-0.270568,0.048371,-0.110058,-0.536626,-0.281386,-0.054994,-0.140219,Sentence E
5,0.426011,-0.306839,0.262155,0.094843,0.363476,0.275041,-0.149319,-0.352509,0.165517,0.018678,...,-0.211299,0.349573,-0.263002,0.36064,-0.148951,-0.521233,-0.054455,0.024065,-0.16857,Sentence F
6,0.490857,-0.439864,0.281246,-0.016329,-0.237987,0.22547,-0.048092,-0.124546,0.447295,-0.270104,...,-0.53663,0.0962,-0.393495,0.390552,-0.208142,-0.342205,-0.138173,-0.15496,-0.198558,Sentence G
7,0.358441,-0.316574,0.176692,0.010867,-0.169676,0.226439,-0.039903,-0.204183,0.305389,-0.19704,...,-0.393786,0.279771,-0.441534,0.397837,-0.12032,-0.213153,-0.161117,-0.053637,-0.215076,Sentence H
8,0.847991,-0.315709,0.263017,-0.042864,0.136913,0.118904,0.132895,-0.154171,0.3668,-0.022292,...,-0.377282,0.213572,-0.114491,0.035627,-0.094321,-0.286962,-0.265997,-0.140985,-0.032772,Sentence I
9,0.377682,-0.327995,0.158554,-0.052308,0.18125,0.204611,-0.282909,-0.200156,0.116396,-0.062238,...,-0.438661,0.270239,-0.27651,0.196788,0.008928,-0.32595,0.03837,-0.069071,-0.079267,Sentence J


In [None]:
    
    
    
    
    # Append batch dataframe to full dataframe.
    batch_df = pd.DataFrame(data=sentence_embeddings[:,:], columns=cols)
    batch_df['Sentence'] = batch_sentences
    para_df = pd.concat([para_df, batch_df], axis=0, ignore_index=True)
   
para_df

In [100]:
print("   Loading pretrained model/tokenizer...")
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True, cache_dir ='../_cache') # Will output all hidden_states.

print("   Tokenizing sentences...")
tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

lengths = [len(i) for i in tokenized]
max_len = max(lengths) if max(lengths) <= 512 else 512

print("   Padding/Truncating sentences to {} tokens...".format(max_len))
padded = pad_sequences(tokenized, maxlen=max_len, dtype="long", 
                      value=0, truncating="post", padding="post")

print("   Creating attention masks...")
attention_mask = np.where(padded != 0, 1, 0)  #returns ndarray which is 1 if padded != 0 is True and 0 if False.

print("   Converting inputs to torch tensors...")
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

print("   Encoding sentences...")
with torch.no_grad():
    # output is a 2-tuple where:
    #  - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
    #  - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
    #  - output[2] are all hidden_states, i.e. a 13-tuple of torch tensors of shape (batch_size, sequence_length, hidden_size): 12 encoders-outputs + initial embedding outputs.
    output = model(input_ids, attention_mask=attention_mask)

# Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor.
hidden_states = torch.stack(output[2], dim=0)

# Switch around the “layers” and “tokens” dimensions with permute.
hidden_states = hidden_states.permute(1,2,0,3)

# For each sentence, take the embeddings of its word from the last layer and represent that sentence by their average.
last_hidden_states = output[0]
sentence_embeddings = [torch.mean(embeddings, dim=0).numpy() for embeddings in last_hidden_states]
sentence_embeddings = np.array(sentence_embeddings)

# Create pandas dataframe.
cols = ['feat'+str(i+1) for i in range(sentence_embeddings.shape[1])]
df = pd.DataFrame(data=sentence_embeddings[:,:], columns=cols)
df['Sentence'] = sentences


df

   Loading pretrained model/tokenizer...
   Tokenizing sentences...
   Padding/Truncating sentences to 7 tokens...
   Creating attention masks...
   Converting inputs to torch tensors...
   Encoding sentences...


Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat760,feat761,feat762,feat763,feat764,feat765,feat766,feat767,feat768,Sentence
0,0.389301,-0.436019,0.104365,0.121289,0.188905,0.170679,-0.089029,-0.070443,0.005561,0.061138,...,-0.557408,0.190559,-0.479769,0.216893,-0.220323,-0.265203,-0.177401,0.01937,-0.490847,Sentence A
1,0.588807,-0.397312,0.088026,0.154662,0.063029,0.215281,-0.110269,-0.116416,0.056288,0.096898,...,-0.621771,0.212321,-0.279812,0.194988,-0.226817,-0.457181,-0.445315,-0.081701,-0.308892,Sentence B
2,0.508777,-0.204457,0.25098,0.104119,0.210347,0.219988,-0.099427,-0.123625,0.159593,0.016007,...,-0.216444,0.282336,-0.317339,0.028035,-0.286026,-0.541283,-0.099515,0.059224,-0.05674,Sentence C
3,0.506406,-0.291443,0.21587,-0.048629,0.197171,0.278655,-0.020547,-0.167075,0.068368,-0.027406,...,-0.305267,0.247168,-0.345752,0.02769,-0.257821,-0.409944,0.021371,0.103835,0.057571,Sentence D
4,0.51814,-0.304466,0.414437,0.076771,0.136925,0.208264,0.030396,-0.406043,0.318225,-0.093752,...,-0.423262,0.276826,-0.296738,0.020474,-0.194528,-0.507969,-0.228096,-0.107263,-0.110033,Sentence E
5,0.424719,-0.26692,0.30175,0.05919,0.363266,0.313698,-0.127692,-0.353682,0.09461,0.043129,...,-0.181358,0.321124,-0.276037,0.309137,-0.190172,-0.514129,-0.035646,-0.009217,-0.151043,Sentence F
6,0.503946,-0.405612,0.294951,-0.023809,-0.186182,0.278337,-0.064375,-0.14296,0.379293,-0.206508,...,-0.469901,0.100055,-0.385203,0.334581,-0.222636,-0.335565,-0.130577,-0.163709,-0.161335,Sentence G
7,0.358563,-0.253686,0.205862,-0.00646,-0.124346,0.256896,-0.038116,-0.217732,0.233963,-0.149307,...,-0.333689,0.255893,-0.411833,0.366815,-0.141005,-0.225257,-0.153168,-0.073373,-0.191348,Sentence H
8,0.831699,-0.258146,0.28773,-0.032586,0.12524,0.186883,0.081654,-0.177236,0.27584,0.009605,...,-0.339251,0.22708,-0.163213,0.026457,-0.144556,-0.266636,-0.225908,-0.173875,-0.013827,Sentence I
9,0.392798,-0.291454,0.167077,-0.031606,0.165943,0.255301,-0.281478,-0.209891,0.053821,-0.047746,...,-0.372261,0.246777,-0.311192,0.170602,-0.015931,-0.29762,0.005153,-0.085357,-0.06328,Sentence J
