# Visualize queries embeddings.
Encode all queries that were correclty predicted with NetBERT and plot them with tsne.

In [1]:
import os
import json

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


### Load queries correctly classified by NetBERT

In [2]:
def load_sentences(dirpath):
    """
    """
    with open(os.path.join(dirpath, 'map_classes.json')) as f:
        class_mappings = json.load(f)
    
    # Create dataframe.
    df = pd.read_csv(os.path.join(dirpath, 'preds_right.csv'), index_col=0)

    # Create columns with classes.
    df['Class'] = df.apply(lambda row: class_mappings[str(row.Class_id)], axis=1)

    # Drop prediction.
    df.drop(['Prediction_id'], axis=1, inplace=True)
    return df


df = load_sentences('./output/netbert-1880000/')
df

Unnamed: 0,Sentence,Class_id,Class
0,Steps in using cisco wsa,4,End User Guides
1,compatible version between ASA 5520 and ASDM,1,Install & Upgrade Guides
2,nexus 5000 copp,3,Release Notes
3,4500-X netflow multiple exporter,0,"Configuration (Guides, Examples & TechNotes)"
4,CISCO WSA AsyncOS API,4,End User Guides
...,...,...,...
761,catalyst 9500 license activation,2,Data Sheets
762,iosxe release schedule,2,Data Sheets
763,NXOS train tracker,3,Release Notes
764,nxos n5k HA support,3,Release Notes


### Encode queries with NetBERT

In [5]:
def encode_sentences(model_name_or_path, batch_size, df):
    """
    Encode corpus of sentences with CPU or GPU(s).
    
    Note that multi-GPU encoding is quite imbalanced due to the use
    of 'torch.nn.DataParallel' which loads the model in the main GPU
    but also gathers the output of all other GPUs back to the main one.
    As a result, the main GPU is three times more loaded than the others.
    Although the other GPUs are not fully loaded, one can not increase the
    batch size as it would result in an 'out of memory' error in the main GPU.
    Here, the max batch size is 128.
    
    Note that the GPU utilisation with 'torch.nn.DataParallel' is very volatile.
    GPUs are never running at 100%, which slows the process.
    """
    print("   Loading pretrained model/tokenizer...")
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True) # Will output all hidden_states.
    
    print("   Setting up CUDA & GPU...")
    device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    print("   Encoding sentences...")
    all_embeddings = []
    sentences = df.Sentence.values
    iterator = range(0, len(sentences), batch_size)
    for batch_idx in tqdm(iterator, desc="Batches"):
        
        # Get the batch.
        batch_start = batch_idx
        batch_end = min(batch_start + batch_size, len(sentences))
        batch_sentences = sentences[batch_start:batch_end]
        
        # Tokenize each sentence of the batch.
        tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_sentences]
        
        # Pad/Truncate sentences to max_len or 512.
        lengths = [len(i) for i in tokenized]
        max_len = max(lengths) if max(lengths) <= 512 else 512
        padded = pad_sequences(tokenized, maxlen=max_len, dtype="long", 
                          value=0, truncating="post", padding="post")
        
        # Create attention masks.
        attention_mask = np.where(padded != 0, 1, 0)  #returns ndarray which is 1 if padded != 0 is True and 0 if False.
        
        # Convert inputs to torch tensors.
        input_ids = torch.tensor(padded)
        attention_mask = torch.tensor(attention_mask)
        
        # Push inputs to GPUs.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        # Encode batch.
        model.eval()
        with torch.no_grad():
            # output is a 2-tuple where:
            #  - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
            #  - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
            #  - output[2] are all hidden_states, i.e. a 13-tuple of torch tensors of shape (batch_size, sequence_length, hidden_size): 12 encoders-outputs + initial embedding outputs.
            output = model(input_ids, attention_mask=attention_mask)
        
        # For each sentence, take the embeddings of its word from the last layer and represent that sentence by their average.
        last_hidden_states = output[0]
        sentence_embeddings = [torch.mean(embeddings, dim=0).to('cpu').numpy() for embeddings in last_hidden_states]
        all_embeddings.extend(sentence_embeddings)
    
    # Create dataframe for storing embeddings.
    all_embeddings = np.array(all_embeddings)
    cols = ['feat'+str(i+1) for i in range(all_embeddings.shape[1])]
    df_embeddings = pd.DataFrame(data=all_embeddings[:,:], columns=cols)
    df_embeddings['Sentence'] = sentences
    df_embeddings['Class'] = df.Class.values
    df_embeddings['Class_id'] = df.Class_id.values
    return df_embeddings
    


df_embeddings = encode_sentences(model_name_or_path='./output/netbert-1880000/', 
                                      batch_size=128,
                                      df=df)
df_embeddings

   Loading pretrained model/tokenizer...
   Setting up CUDA & GPU...


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

   Encoding sentences...


Batches: 100%|██████████| 6/6 [00:01<00:00,  4.45it/s]


Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat762,feat763,feat764,feat765,feat766,feat767,feat768,Sentence,Class,Class_id
0,0.576330,-0.351257,-0.091710,-0.219648,0.378146,0.527342,0.220837,-0.089846,0.260360,1.182676,...,-0.528551,0.518903,0.022337,-0.044169,-0.128965,0.233406,-0.205971,Steps in using cisco wsa,End User Guides,4
1,0.561319,0.177696,-0.112206,0.425161,-0.429158,-0.379727,-0.160919,-0.510925,0.219502,-0.550554,...,-0.276077,0.200169,0.092698,0.021721,-0.168628,0.137702,0.262467,compatible version between ASA 5520 and ASDM,Install & Upgrade Guides,1
2,0.245346,0.781597,-0.295042,0.247582,-0.829814,0.068325,-0.188615,-0.174411,0.638154,0.089101,...,-0.346690,-0.284770,-0.120352,0.265443,0.500597,-0.070193,0.230085,nexus 5000 copp,Release Notes,3
3,0.011531,-0.584435,-0.188880,-0.016323,-0.007238,0.227317,0.222591,0.078012,-0.181979,-0.256893,...,-0.791832,-0.321341,-0.560362,0.317248,0.337226,-0.209507,-0.044955,4500-X netflow multiple exporter,"Configuration (Guides, Examples & TechNotes)",0
4,0.574695,-0.102464,-0.120590,-0.103518,0.178213,-0.287618,0.067027,-0.195530,-0.165063,0.318972,...,-1.020288,0.104891,-0.374797,0.100937,-0.207902,0.119452,0.252381,CISCO WSA AsyncOS API,End User Guides,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
761,-0.242718,0.145647,0.081098,0.155190,-0.630289,0.192384,0.104437,-0.038133,0.070794,-0.729530,...,-0.170812,-0.075165,-0.176934,0.097652,0.163313,-0.227132,-0.000500,catalyst 9500 license activation,Data Sheets,2
762,0.456007,-0.375779,-0.824089,-0.057569,0.065809,-0.103723,0.310248,-0.334891,0.172832,-0.107732,...,-0.174445,-0.284098,-0.526760,0.292875,0.640222,-0.315411,0.579054,iosxe release schedule,Data Sheets,2
763,0.108068,0.107516,-0.058394,0.006578,-0.250544,-0.020994,0.134816,-0.244861,0.609560,0.166933,...,-0.365316,-0.102242,-0.758688,0.182289,0.322008,0.081552,0.456008,NXOS train tracker,Release Notes,3
764,0.036145,0.697235,0.045144,0.386749,-0.598107,-0.470866,-0.239238,-0.307143,0.801774,-0.151745,...,0.173098,-0.070351,-0.302839,-0.092270,0.094244,0.265572,0.152955,nxos n5k HA support,Release Notes,3


### Perform tsne

In [None]:
tsne = TSNE(n_components=3, perplexity=5, n_iter=2000, learning_rate = 0.01, random_state = 42, verbose=1)
tsne_results = tsne.fit_transform(sentence_embeddings)

tsne_df = df.copy(deep=True)
tsne_df['tsne-one'] = tsne_results[:,0]
tsne_df['tsne-two'] = tsne_results[:,1]
tsne_df['tsne-three'] = tsne_results[:,2]

# PCA in 2D
fig = plt.figure(figsize=(25,8))
ax1 = fig.add_subplot(1, 2, 1)
sns.scatterplot(
    x = tsne_df['tsne-one'], 
    y = tsne_df['tsne-two'],
    hue = tsne_df["Label"],
    palette = "coolwarm")
ax1.set_title("2D t-SNE")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles=handles[1:], labels=labels[1:], loc='upper right', title="")

# PCA in 3D
ax2 = fig.add_subplot(1, 2, 2, projection='3d')
scatter = ax2.scatter(
    xs=tsne_df["tsne-one"], 
    ys=tsne_df["tsne-two"], 
    zs=tsne_df["tsne-three"],
    c=pca_df["Label_id"],
    cmap='coolwarm')
handles,_ = scatter.legend_elements(num=len(classes))
ax2.legend(handles, classes, loc="center left", title="", bbox_to_anchor=(1, 0.68))
ax2.set_title("3D t-SNE")
ax2.set_xlabel('tsne-one')
ax2.set_ylabel('tsne-two')
ax2.set_zlabel('tsne-three')
plt.show()