In [71]:
import os
import glob
import logging
import warnings
import random

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

In [76]:
def load_sentences(filepath, max_chars=1300, encoding="utf-8"):
    """
    Given a file of raw sentences, return the list of these sentences.

    Each line of the file is treated as one sentence. Lines are returned as they
    were read, i.e. they keep their trailing newline character.

    Args:
        filepath: path of the text file to read (one sentence per line).
        max_chars: maximum number of characters (trailing newline included) a
            sentence may have to be kept. Defaults to 1300.
        encoding: text encoding used to decode the file. Defaults to "utf-8" so
            that the result does not depend on the locale of the machine.

    Returns:
        The list of unique sentences, in order of first appearance, without the
        lines that only consist of a newline and without the sentences longer
        than `max_chars` characters.
    """
    # Load sentences from file, skipping the lines that only hold a newline.
    with open(filepath, encoding=encoding) as myfile:
        sentences = [line for line in myfile if line != '\n']

    # Only keep unique sentences (a dict keeps its keys in order of first insertion).
    sentences = list(dict.fromkeys(sentences))

    # Only keep sentences of at most `max_chars` characters (bigger sentences are messed up).
    return [sent for sent in sentences if len(sent) <= max_chars]


def tokenize(tokenizer, sentences):
    """
    Encode every sentence of a list with the given tokenizer.

    Each sentence is passed to `tokenizer.encode` with the special tokens enabled.
    The encoded sentences are returned in a list, in the same order as the input.
    """
    # Silence the tokenizer warnings on sample size (long samples are dealt with later on).
    tokenization_logger = logging.getLogger("transformers.tokenization_utils")
    tokenization_logger.setLevel(logging.ERROR)

    indexed_tokens = []
    for sent in sentences:
        indexed_tokens.append(tokenizer.encode(sent, add_special_tokens=True))
    return indexed_tokens


def create_chunks(sentences, indexed_tokens, max_len=512):
    """
    Given the tokenized sentences, return chunks of multiple sentences under `max_len` tokens.

    Consecutive sentences are greedily packed together as long as the total number
    of tokens of the chunk does not exceed `max_len`. A sentence that is longer than
    `max_len` tokens on its own forms a chunk by itself (that chunk is expected to be
    truncated later on).

    The input lists are only read: they are left untouched, so the function can be
    called again on the same data.

    Args:
        sentences: list of raw sentences, aligned with `indexed_tokens`.
        indexed_tokens: list holding the sequence of token ids of each sentence.
        max_len: maximum number of tokens of a chunk. Defaults to 512.

    Returns:
        A tuple `(token_chunks, sentence_chunks)` where `token_chunks[i]` is the
        concatenation of the token ids of the sentences of chunk `i`, and
        `sentence_chunks[i]` is the text of these sentences joined by a space, with
        every newline replaced by a space.
    """
    token_chunks = []
    sentence_chunks = []

    n_sentences = len(indexed_tokens)
    pos = 0  # Index of the next sentence that has not been put in a chunk yet.

    # While there are still sentences to put in a chunk...
    while pos < n_sentences:

        # Create the new chunk.
        chunk_tok = []
        chunk_sent = []

        # If the next sentence is bigger than `max_len` tokens on its own, it becomes
        # the only sentence of this chunk (this chunk will be truncated later).
        if len(indexed_tokens[pos]) > max_len:
            print("LONG SENTENCE")
            chunk_tok.extend(indexed_tokens[pos])
            chunk_sent.append(sentences[pos])
            pos += 1

        # While there are sentences left and the next one still fits in the chunk,
        # append it to the chunk.
        while pos < n_sentences and (len(chunk_tok) + len(indexed_tokens[pos])) <= max_len:
            chunk_tok.extend(indexed_tokens[pos])
            chunk_sent.append(sentences[pos])
            pos += 1

        # Concat all sentences of the chunk, remove \n, and store the chunk.
        token_chunks.append(chunk_tok)
        sentence_chunks.append(' '.join(chunk_sent).replace('\n', ' '))

    return token_chunks, sentence_chunks


def pad_and_truncate_chunks(token_chunks, max_len=512):
    """
    Given a list of tokenized chunks, pad/truncate them so that they all have the same length.

    The common length is the length of the longest chunk, capped at `max_len`.
    Longer chunks lose their last tokens and shorter chunks are filled with 0 at
    the end.

    Args:
        token_chunks: list of chunks, each chunk being a sequence of token ids.
        max_len: upper bound on the common length. Defaults to 512.

    Returns:
        A numpy array of int64 of shape (number of chunks, common length). An empty
        list of chunks gives an empty array of shape (0, 0).
    """
    # No chunk at all: return an empty array instead of failing on max() of an empty list.
    if len(token_chunks) == 0:
        return np.zeros((0, 0), dtype=np.int64)

    # Length of the longest tokenized chunk, capped at `max_len`.
    target_len = min(max(len(chunk) for chunk in token_chunks), max_len)

    # Start from an array full of the padding value (0) and copy the kept tokens
    # of each chunk at the beginning of its row.
    padded_chunks = np.zeros((len(token_chunks), target_len), dtype=np.int64)
    for row, chunk in enumerate(token_chunks):
        kept = chunk[:target_len]
        if len(kept):
            padded_chunks[row, :len(kept)] = kept
    return padded_chunks


def create_attention_masks(padded_chunks):
    """
    Build the attention masks matching an array of padded tokenized chunks.

    A position gets 1 when its token id is different from 0 and gets 0 otherwise.
    """
    # Boolean array telling which positions hold something else than the value 0.
    is_not_padding = padded_chunks != 0
    attention_masks = np.where(is_not_padding, 1, 0)
    return attention_masks

In [77]:
# Input file of raw sentences and directory of the pretrained checkpoint.
# NOTE(review): both are absolute, machine-specific paths — adapt them before running elsewhere.
filepath = '/raid/antoloui/Master-thesis/Data/Cleaned/dev.raw'
model_name_or_path = '/raid/antoloui/Master-thesis/Code/_models/netbert-830000/'


# Load the tokenizer and the model from the same checkpoint directory.
# NOTE(review): encode_chunks only reads outputs[0], so output_hidden_states=True
# may not be needed — confirm before removing.
print("Loading pretrained model/tokenizer...")
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True) # Will output all hidden_states.
print("   Loaded checkpoint '{}'.".format(model_name_or_path))

# Read the sentences of the input file (the count printed is the full number of loaded sentences).
print("Loading sentences from {}...".format(filepath))
sentences = load_sentences(filepath)
print("   {} sentences loaded.".format(len(sentences)))

# Only the first 5000 loaded sentences are kept and tokenized; everything below works on that subset.
print("Tokenizing sentences...")
sentences = sentences[:5000]
indexed_tokens = tokenize(tokenizer, sentences)
print("   {} sentences tokenized.".format(len(indexed_tokens)))

# Group the tokenized sentences into chunks.
print("Creating chunks of max 512 tokens...")
token_chunks, sentence_chunks = create_chunks(sentences, indexed_tokens)
print("   {} chunks created.".format(len(token_chunks)))

# Bring all the chunks to the same length.
print("Padding/truncating the chunks...")
padded_chunks = pad_and_truncate_chunks(token_chunks)
print("   {} chunks padded/truncated.".format(len(padded_chunks)))

# Build the masks from the padded chunks.
print("Creating attention masks...")
attention_masks = create_attention_masks(padded_chunks)
print("   {} attention masks created.".format(len(attention_masks)))



Loading pretrained model/tokenizer...
   Loaded checkpoint '/raid/antoloui/Master-thesis/Code/_models/netbert-830000/'.
Loading sentences from /raid/antoloui/Master-thesis/Data/Cleaned/dev.raw...
   3753239 sentences loaded.
Tokenizing sentences...
   5000 sentences tokenized.
Creating chunks of max 512 tokens...
   395 chunks created.
Padding/truncating the chunks...
   395 chunks padded/truncated.
Creating attention masks...
   395 attention masks created.


In [78]:
def encode_chunks(model, device, sentence_chunks, padded_chunks, attention_masks, batch_size):
    """
    Encode chunks of text with the model and return one averaged embedding per chunk.

    The chunks are fed to the model by batches of `batch_size`. Each chunk is then
    represented by the average, over its first k positions, of the vectors found in
    the first element of the model outputs, where k is the number of positions of
    the chunk whose attention mask is 1.

    Args:
        model: the model used for encoding. It is switched to evaluation mode and
            called as `model(input_ids, attention_mask=...)`.
        device: torch device on which the input tensors are placed. The model itself
            is not moved by this function.
        sentence_chunks: list of the text of each chunk; its length defines the
            number of chunks that are encoded.
        padded_chunks: array of padded token ids, one row per chunk.
        attention_masks: array of attention masks, one row per chunk.
        batch_size: number of chunks encoded per forward pass.

    Returns:
        A pandas DataFrame with one row per chunk: columns 'feat1', 'feat2', ... hold
        the averaged embedding and column 'Chunk' holds the text of the chunk. When
        `sentence_chunks` is empty, the dataframe only has an empty 'Chunk' column.
    """
    n_chunks = len(sentence_chunks)

    # Nothing to encode: return an empty dataframe instead of failing when building the columns.
    if n_chunks == 0:
        return pd.DataFrame({'Chunk': []})

    # Evaluation mode only needs to be set once, not at every batch.
    model.eval()

    all_embeddings = []
    for batch_start in tqdm(range(0, n_chunks, batch_size), desc="   Batches"):
        # Never go past the last chunk described by `sentence_chunks`.
        batch_end = min(batch_start + batch_size, n_chunks)

        # Convert the model inputs of the batch to torch tensors and push them to the device.
        # The dtype is forced to long so that it does not depend on the dtype of the numpy arrays.
        batch_input_ids = torch.tensor(padded_chunks[batch_start:batch_end], dtype=torch.long).to(device)
        batch_attention_masks = torch.tensor(attention_masks[batch_start:batch_end], dtype=torch.long).to(device)

        # Encode the batch without tracking gradients.
        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_masks)

        # Only the first output of the model is used (for a BertModel, the last hidden
        # states of shape (batch_size, sequence_length, hidden_size)).
        last_hidden_states = outputs[0]

        # Represent each chunk by the average of the embeddings of its first k positions,
        # k being the number of positions whose mask is 1.
        for embeddings, masks in zip(last_hidden_states, batch_attention_masks):
            n_tokens = int((masks == 1).sum())
            all_embeddings.append(torch.mean(embeddings[:n_tokens], dim=0).to('cpu').numpy())

    # Create dataframe for storing embeddings.
    all_embeddings = np.array(all_embeddings)
    cols = ['feat'+str(i+1) for i in range(all_embeddings.shape[1])]
    df = pd.DataFrame(data=all_embeddings, columns=cols)
    df['Chunk'] = sentence_chunks
    return df



# Run the encoding on CPU: the model is moved to the same device as the one given to encode_chunks.
device = torch.device("cpu")
model.to(device)





print("   Encoding chunks...")
# Encode all the chunks by batches of 10 and collect the embeddings in a dataframe.
df = encode_chunks(model, device, sentence_chunks, padded_chunks, attention_masks, 10)
print("   Chunks embedded.")

   Batches:   0%|          | 0/40 [00:00<?, ?it/s]

   Encoding chunks...


   Batches: 100%|██████████| 40/40 [01:28<00:00,  2.21s/it]

   Chunks embedded.





In [79]:
# Display the dataframe returned by encode_chunks (feature columns plus the 'Chunk' text).
df

Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat760,feat761,feat762,feat763,feat764,feat765,feat766,feat767,feat768,Chunk
0,0.189363,-0.130048,-0.112261,0.030019,0.173977,0.029032,0.001942,0.033509,-0.021282,0.019294,...,-0.083900,-0.147276,-0.184282,0.047551,-0.018213,0.207356,0.142576,-0.043626,0.071253,"MediaSense Terminology • Playback, page 1 • Bl..."
1,0.102125,-0.168929,-0.130700,0.106593,0.133147,0.025501,-0.109736,0.001165,0.026999,0.031063,...,-0.075209,-0.152484,-0.166840,0.042063,0.035189,0.107075,0.162909,-0.121656,0.085183,For other sessions in Cisco MediaSense User Gu...
2,0.141492,-0.083923,-0.099955,0.138810,0.105259,-0.122188,-0.147850,0.074596,0.038700,0.043998,...,-0.059514,-0.250270,-0.222964,0.072229,-0.070169,0.114454,0.193807,-0.145606,-0.053496,A session can be live (active) or recorded (co...
3,0.129332,-0.059174,-0.092327,0.133588,0.121212,-0.090347,-0.137022,0.052293,0.016732,0.024999,...,-0.046281,-0.223892,-0.290861,0.114154,-0.072899,0.082204,0.179868,-0.128829,-0.028522,Each instance corresponds directly to one inst...
4,0.069944,-0.123942,-0.151792,0.072241,0.116116,-0.056365,-0.179424,0.034179,-0.043523,0.097471,...,-0.045592,-0.224050,-0.186254,0.059414,-0.038229,0.051380,0.153363,-0.141080,-0.024632,The data is load balanced between both servers...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0.144772,-0.075381,-0.106285,0.138673,-0.012263,0.119993,-0.005872,-0.054749,0.172315,0.092370,...,-0.009337,-0.123435,-0.210681,0.011843,-0.149880,-0.058068,0.211883,-0.161663,0.063657,serial number (Optional) Displays information ...
391,0.235946,-0.121318,-0.150308,0.183412,-0.015245,0.063203,0.160002,0.066461,0.153953,-0.161887,...,0.052945,0.063293,-0.397960,0.056324,-0.165852,-0.091495,0.309324,-0.170412,0.033740,Because a specific bundle or bundle link is no...
392,0.160374,-0.069715,0.030420,0.188667,0.031155,0.043820,0.146876,0.003039,0.073386,-0.022806,...,0.026316,-0.032932,-0.384746,-0.117773,-0.137363,-0.209062,0.351914,-0.035939,0.085186,"The example shows a bundle link in the ""idle"" ..."
393,0.150990,-0.096720,-0.082435,0.216195,0.045443,0.042745,0.136720,0.010465,0.056302,0.075066,...,0.027702,-0.071019,-0.340958,-0.040792,-0.112066,-0.176844,0.282540,-0.118444,0.023574,"The example shows a bundle link in the ""up"" st..."


### Gather all .h5 files

In [3]:
# Directory holding the stored embeddings, and the file to inspect.
dirpath = '/raid/antoloui/Master-thesis/Data/Embeddings/'
filename = 'dev00.h5'

# Read the file into a dataframe and display it.
df = pd.read_hdf(os.path.join(dirpath, filename))
df

Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat760,feat761,feat762,feat763,feat764,feat765,feat766,feat767,feat768,Sentence
0,-0.280264,-0.099757,-0.038239,-0.098673,0.169518,-0.063823,0.735144,-0.032743,0.358951,0.097830,...,-0.058485,-0.256885,-0.130044,0.059638,-0.421342,-0.622948,-0.036204,0.172045,0.383999,"G0, March 2007 Contents Features and Enhanceme..."
1,-0.366134,-0.208020,0.016205,-0.186076,0.144419,-0.004993,0.786685,-0.101411,0.297584,0.073194,...,-0.136041,-0.165522,-0.103805,0.129898,-0.474089,-0.556440,0.010095,0.145695,0.362004,p.#3# Contents Contents#.........................
2,-0.002002,-0.106297,-0.087648,0.122653,0.069144,-0.000705,0.156554,0.290996,-0.123224,0.104411,...,-0.222313,0.156713,-0.232677,0.059451,-0.381720,-0.044734,0.488280,-0.029846,0.007652,Home Skip to content Skip to footer Worldwide ...
3,-0.062134,0.096518,-0.215182,0.196923,0.215821,-0.026429,0.116073,0.154525,0.014295,0.008637,...,-0.110915,-0.049860,-0.352348,0.200215,-0.231690,-0.041897,0.323917,0.086202,0.141788,</p> <p> The LocationStatus notification is ge...
4,-0.011293,0.080487,-0.148380,0.105190,0.015003,-0.095529,0.086479,0.201846,0.065185,0.065424,...,-0.223511,-0.035504,-0.419971,0.071184,-0.241609,0.055943,0.262380,0.064603,0.171011,"<?xml version=""1.0"" encoding=""UTF-8""?> <comman..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,-0.193245,0.434848,0.811894,-0.410603,-0.315942,-0.935870,0.051716,-0.511334,0.347976,-1.574084,...,0.662778,0.453332,-0.555245,0.254290,0.050753,-0.323471,-0.621548,-0.529996,-0.624799,\n
999996,-0.193245,0.434848,0.811894,-0.410603,-0.315942,-0.935870,0.051716,-0.511334,0.347976,-1.574084,...,0.662778,0.453332,-0.555245,0.254290,0.050753,-0.323471,-0.621548,-0.529996,-0.624799,\n
999997,-0.193245,0.434848,0.811894,-0.410603,-0.315942,-0.935870,0.051716,-0.511334,0.347976,-1.574084,...,0.662778,0.453332,-0.555245,0.254290,0.050753,-0.323471,-0.621548,-0.529996,-0.624799,\n
999998,-0.193245,0.434848,0.811894,-0.410603,-0.315942,-0.935870,0.051716,-0.511334,0.347976,-1.574084,...,0.662778,0.453332,-0.555245,0.254290,0.050753,-0.323471,-0.621548,-0.529996,-0.624799,\n


In [11]:
# Count the rows whose 'Sentence' value already appeared in an earlier row.
n_duplicated_sentences = df.duplicated(['Sentence']).sum()
print("\nNumber of duplicated rows: {}. Only keeping one sample for each duplicate...".format(n_duplicated_sentences))

# Count the rows that are identical, on all their columns, to an earlier row.
n_duplicated_rows = df.duplicated().sum()
print("\nNumber of duplicated rows: {}. Only keeping one sample for each duplicate...".format(n_duplicated_rows))

# NOTE(review): nothing is dropped in this cell although the messages say so; the
# removal below is disabled and refers to `original_df` rather than `df` — confirm intent.
#original_df.drop_duplicates(subset=['Sentence'], keep='first', inplace=True)


Number of duplicated rows: 304792. Only keeping one sample for each duplicate...

Number of duplicated rows: 237604. Only keeping one sample for each duplicate...
