# Visualize query embeddings.
Encode all queries that were correctly predicted by NetBERT and plot them with t-SNE.

In [7]:
import os
import json

import pandas as pd
import numpy as np

### Load queries correctly classified by NetBERT

In [11]:
def load_sentences(dirpath):
    """
    Load the queries stored in 'preds_right.csv' and attach their class names.

    Reads two files from `dirpath`:
      - 'map_classes.json': mapping from class id (as a string key) to class name.
      - 'preds_right.csv': table whose first column is used as the index and
        which contains at least the columns 'Class_id' and 'Prediction_id'.

    Args:
        dirpath: Directory containing both files.

    Returns:
        A dataframe with the columns of the CSV minus 'Prediction_id', plus a
        'Class' column holding the class name looked up for each 'Class_id'.

    Raises:
        KeyError: If a 'Class_id' value has no entry in the class mapping, or
            if the CSV lacks a 'Class_id' or 'Prediction_id' column.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(os.path.join(dirpath, 'map_classes.json'), encoding='utf-8') as f:
        class_mappings = json.load(f)

    # Create dataframe.
    df = pd.read_csv(os.path.join(dirpath, 'preds_right.csv'), index_col=0)

    # Create column with class names. The JSON keys are strings, hence the str()
    # conversion. A per-value lookup (instead of a row-wise `apply`) avoids
    # building a Series per row and also works when the table has no rows.
    df['Class'] = [class_mappings[str(class_id)] for class_id in df['Class_id']]

    # Drop prediction.
    return df.drop(columns=['Prediction_id'])


# Load the queries table and keep the raw query strings for encoding.
df = load_sentences(dirpath='./output/netbert-1880000/')
sentences = df['Sentence'].values
df  # Last expression of the cell: displays the dataframe.

Unnamed: 0,Sentence,Class_id,Class
0,Steps in using cisco wsa,4,End User Guides
1,compatible version between ASA 5520 and ASDM,1,Install & Upgrade Guides
2,nexus 5000 copp,3,Release Notes
3,4500-X netflow multiple exporter,0,"Configuration (Guides, Examples & TechNotes)"
4,CISCO WSA AsyncOS API,4,End User Guides
...,...,...,...
769,DNAC App Policy,4,End User Guides
770,Catalyst 2960-X 48 GigE PoE 740W 4 x 1G SFP LA...,2,Data Sheets
771,NXOS train tracker,3,Release Notes
772,nxos n5k HA support,3,Release Notes


### Encode queries with NetBERT

In [None]:
def encode_sentences(model_name_or_path, batch_size, sentences):
    """
    Encode a corpus of sentences with CPU or GPU(s), one embedding per sentence.

    Each sentence is represented by the average of the last-layer hidden states
    of its own tokens. Padding positions are excluded from the average, so an
    embedding does not depend on which other sentences share its batch.

    Args:
        model_name_or_path: Name or path passed to `BertTokenizer.from_pretrained`
            and `BertModel.from_pretrained`.
        batch_size: Number of sentences encoded per forward pass (must be >= 1).
        sentences: Iterable of strings to encode.

    Returns:
        A dataframe with one row per sentence: columns 'feat1'..'featN' hold the
        embedding (N being the model's hidden size) and 'Sentence' holds the
        text. An empty input yields an empty dataframe with the same columns.

    Raises:
        ValueError: If `batch_size` is smaller than 1.

    Notes from the original author on multi-GPU use:
        Multi-GPU encoding is quite imbalanced due to the use of
        'torch.nn.DataParallel', which loads the model on one GPU and gathers
        the outputs of all GPUs on a single device (here the last GPU, see
        `output_device` below). The author observed one GPU being about three
        times more loaded than the others, which prevents increasing the batch
        size beyond 128 without an 'out of memory' error, and a volatile GPU
        utilisation that never reaches 100%.

    NOTE(review): `torch`, `tqdm`, `BertTokenizer`, `BertModel` and
    `pad_sequences` are not imported in the visible cells; they must be
    imported before this function is called.
    """
    # Fail fast, before the (expensive) model loading.
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))

    # Materialise once so that slicing and the final column assignment are
    # purely positional, whatever container was passed in.
    sentences = list(sentences)

    print("   Loading pretrained model/tokenizer...")
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True) # Will output all hidden_states.
    # Read the embedding width before the model is possibly wrapped below.
    hidden_size = model.config.hidden_size

    print("   Setting up CUDA & GPU...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        gpu_ids = list(range(0, n_gpu))
        model = torch.nn.DataParallel(model, device_ids=gpu_ids, output_device=gpu_ids[-1])
    model.to(device)
    # Inference mode (disables dropout); only needs to be set once.
    model.eval()

    print("   Encoding sentences...")
    batch_embeddings = []
    for batch_start in tqdm(range(0, len(sentences), batch_size), desc="Batches"):

        # Get the batch (slicing past the end is safe for the last batch).
        batch_sentences = sentences[batch_start:batch_start + batch_size]

        # Tokenize each sentence of the batch.
        tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_sentences]

        # Pad/Truncate sentences to the longest one in the batch, capped at 512.
        # NOTE(review): "post" truncation presumably cuts the trailing special
        # token of over-long inputs -- confirm whether that matters here.
        max_len = min(max(len(ids) for ids in tokenized), 512)
        padded = pad_sequences(tokenized, maxlen=max_len, dtype="long",
                               value=0, truncating="post", padding="post")

        # Create attention masks: 1 for real tokens, 0 for padding (id 0).
        attention_mask = np.where(padded != 0, 1, 0)

        # Convert inputs to torch tensors and push them to the device.
        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)

        # Encode batch.
        with torch.no_grad():
            # output is a tuple where:
            #  - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
            #  - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
            #  - output[2] are all hidden_states, i.e. a tuple of torch tensors of shape (batch_size, sequence_length, hidden_size): encoders-outputs + initial embedding outputs.
            output = model(input_ids, attention_mask=attention_mask)

        # For each sentence, average the last-layer embeddings of its real
        # tokens only. The mask is moved to the output's device because with
        # DataParallel the output may live on a different GPU than the inputs.
        last_hidden_states = output[0]
        mask = attention_mask.unsqueeze(-1).to(device=last_hidden_states.device,
                                               dtype=last_hidden_states.dtype)
        token_sum = (last_hidden_states * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)  # Guard against division by zero.
        batch_embeddings.append((token_sum / token_count).to('cpu').numpy())

    # Create dataframe for storing embeddings.
    if batch_embeddings:
        all_embeddings = np.concatenate(batch_embeddings, axis=0)
    else:
        # No sentences: keep the same columns with zero rows.
        all_embeddings = np.empty((0, hidden_size), dtype=np.float32)
    cols = ['feat'+str(i+1) for i in range(all_embeddings.shape[1])]
    df = pd.DataFrame(data=all_embeddings, columns=cols)
    df['Sentence'] = sentences
    return df
    


# Encode the loaded queries with NetBERT, 128 sentences per batch.
df_embeddings = encode_sentences(
    model_name_or_path='./output/netbert-1880000/',
    batch_size=128,
    sentences=sentences,
)