# FAISS
The Facebook AI Similarity Search (FAISS) library has an excellent GPU implementation of "brute-force" kNN search (i.e., exact search that uses no approximation techniques that would compromise the accuracy of the results).

In [153]:
import sys
import time
import re
import string

import pandas as pd
import numpy as np

import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

import faiss
from nltk.corpus import stopwords


def load_chunks(filepath):
    """
    Read the stored chunk table and split it into texts and vectors.

    The HDF file at `filepath` is loaded as a dataframe. Its last column is
    taken as the chunks and all the other columns as the embedding values.

    Returns a tuple `(chunks, embeddings)` where `embeddings` is a
    C-contiguous float32 array.
    """
    table = pd.read_hdf(filepath)

    # Every column but the last one goes into the embedding matrix, stored
    # contiguously as float32 (necessary for FAISS indexing afterwards).
    vectors = np.ascontiguousarray(table.iloc[:, :-1].values, dtype=np.float32)

    # The last column carries the chunks themselves.
    texts = table.iloc[:, -1].values

    return texts, vectors


def create_gpu_index(vecs, n_gpus, method):
    """
    Create FAISS index on GPU(s).

    To create a GPU index with FAISS, one first needs to create it on CPU
    then copy it on GPU. Note that a "flat" index means that it is
    brute-force, with no approximation techniques.

    Args:
        vecs: 2-D array of vectors to index; `vecs.shape[1]` is used as the
            index dimension.
        n_gpus: Number of GPUs passed to FAISS when cloning the CPU index.
        method: 'l2' for exact L2 search or 'ip' for exact inner-product
            search (also usable for cosine similarity if the vectors are
            normalized beforehand).

    Returns:
        The GPU index with `vecs` already added to it.

    Raises:
        ValueError: If `method` is neither 'l2' nor 'ip'.
    """
    # Reject an unknown method up front with a real error instead of exiting
    # the interpreter with a "success" status.
    if method not in ('l2', 'ip'):
        raise ValueError(
            "Please choose between L2 distance ('l2') or Inner Product ('ip') "
            "as brute-force method for exact search (got {!r}).".format(method)
        )

    # Build flat CPU index.
    dim = vecs.shape[1]
    if method == 'l2':
        cpu_index = faiss.IndexFlatL2(dim)  # Exact search for L2.
    else:
        cpu_index = faiss.IndexFlatIP(dim)  # Exact search for inner product.

    # Convert to flat GPU index.
    co = faiss.GpuMultipleClonerOptions()
    # If using multiple GPUs, enable sharding so that the dataset is divided
    # across the GPUs rather than replicated.
    co.shard = True
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co, ngpu=n_gpus)

    # Add vectors to GPU index.
    gpu_index.add(vecs)

    return gpu_index


def load_questions(filepath, questions_type):
    """
    Read question/answer pairs from a semicolon-separated CSV file.

    When `questions_type` is None every row is kept; otherwise only the rows
    whose 'QuestionType' column equals `questions_type` are kept.

    Returns a tuple `(questions, answers)` of arrays taken from the
    'Question' and 'Answer' columns.
    """
    table = pd.read_csv(filepath, sep=";")

    # Narrow the table to the requested question type, if one was given.
    if questions_type is not None:
        table = table[table['QuestionType'] == questions_type]

    return table.Question.values, table.Answer.values


def encode_sentences(model_name_or_path, cache, sentences):
    """
    Embed each sentence as the mean of its last-layer token vectors.

    A BERT tokenizer and model are loaded from `model_name_or_path` (the
    model uses `cache` as its cache directory). The sentences are tokenized
    with special tokens added, padded/truncated to a common length of at most
    512 tokens, and passed through the model in one batch.

    Returns a numpy array holding one embedding per sentence.
    """
    print("   Loading pretrained model/tokenizer...")
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    # output_hidden_states=True makes the model output all hidden states too.
    model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True, cache_dir=cache)

    print("   Tokenizing sentences...")
    token_ids = [tokenizer.encode(sentence, add_special_tokens=True) for sentence in sentences]

    # Common sequence length: the longest tokenized sentence, capped at 512.
    max_len = min(max([len(ids) for ids in token_ids]), 512)

    print("   Padding/Truncating sentences to {} tokens...".format(max_len))
    padded = pad_sequences(
        token_ids,
        maxlen=max_len,
        dtype="long",
        value=0,
        truncating="post",
        padding="post",
    )

    print("   Creating attention masks...")
    # 1 wherever the token id is non-zero, 0 on the zero-valued padding.
    mask_array = np.where(padded != 0, 1, 0)

    print("   Converting inputs to torch tensors...")
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(mask_array)

    print("   Encoding sentences...")
    with torch.no_grad():
        # The model output is indexed as follows:
        #  - output[0]: last_hidden_state, shape (batch_size, sequence_length, hidden_size).
        #  - output[1]: pooler_output, shape (batch_size, hidden_size).
        #  - output[2]: all hidden_states (initial embedding output + one per encoder layer).
        output = model(input_ids, attention_mask=attention_mask)

    # Only the last layer is used: each sentence is represented by the
    # average of its first `n_real` token vectors, where `n_real` is the
    # number of mask entries equal to 1 for that sentence.
    last_hidden_states = output[0]
    sentence_embeddings = []
    for token_vectors, mask in zip(last_hidden_states, attention_mask):
        n_real = int((mask == 1).sum())
        sentence_embeddings.append(token_vectors[:n_real].mean(dim=0).numpy())

    return np.array(sentence_embeddings)


def remove_punctuation(text):
    """
    Return `text` with every occurrence of the characters . , : ; ! ? { } ( )
    deleted; all other characters are left untouched.
    """
    punctuation_pattern = '([.,:;!?{}()])'
    stripped = re.sub(punctuation_pattern, '', text)
    return stripped


def process_text(text):
    """
    Normalize a text into a list of lowercase, stopword-free words.

    The text is stripped of common punctuation (see `remove_punctuation`),
    lowercased, split on whitespace, and filtered against NLTK's English
    stopword list.

    Args:
        text: The string to process.

    Returns:
        The remaining words as a list, in their original order.
    """
    # Fetch the stopword list once per call (not once per word) and keep it
    # in a set so that each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))

    # Remove punctuations, then lower text.
    processed = remove_punctuation(text).lower()

    # Remove stopwords.
    return [w for w in processed.split() if w not in english_stopwords]


def compute_score(chunk, answer):
    """
    Score a chunk by the share of the answer's words that it contains.

    Args:
        chunk: Collection of words from the chunk; membership is tested with
            the `in` operator.
        answer: Sequence of words from the answer.

    Returns:
        The percentage (0 to 100) of words from `answer` that are present in
        `chunk`. An empty `answer` scores 0.0 rather than raising a
        ZeroDivisionError.
    """
    n_answer_words = len(answer)

    # An answer with no words left (e.g. after stopword removal) cannot match.
    if n_answer_words == 0:
        return 0.0

    # Count the words from the answer that are present in the chunk.
    n_matched = sum(1 for w in answer if w in chunk)

    # Define the score as the percentage of words from answer present in the chunk.
    return (n_matched / n_answer_words) * 100


def run(model_name_or_path, embeddings_filepath, questions_filepath, questions_type=None,
        cache='/raid/antoloui/Master-thesis/Code/_cache/', method='l2', n_gpus=1, topk=10):
    """
    Evaluate FAISS retrieval of text chunks on question/answer pairs.

    The chunk embeddings are indexed with FAISS, every question is encoded
    with the given model, and its `topk` nearest chunks are retrieved. Each
    retrieved chunk is scored by the percentage of answer words it contains;
    when an answer holds several alternatives separated by ';', the best
    alternative counts. The score of a question is the best score among its
    retrieved chunks.

    Args:
        model_name_or_path: Model identifier or path given to `encode_sentences`.
        embeddings_filepath: File with the chunks and their embeddings
            (see `load_chunks`).
        questions_filepath: File with the questions and answers
            (see `load_questions`).
        questions_type: Optional question type to restrict the evaluation to;
            None keeps all questions.
        cache: Cache directory given to `encode_sentences`.
        method: FAISS search method, 'l2' or 'ip' (see `create_gpu_index`).
        n_gpus: Number of GPUs used for the index.
        topk: Number of chunks retrieved per question.

    Returns:
        A dataframe with one row per question and the columns 'Question',
        'Answer', 'Search_score', 'Top_result_idx' (1-based rank of the best
        scoring chunk), 'result1'..'result<topk>' (retrieved chunks) and
        'result1_L2dist'..'result<topk>_L2dist' (the distances returned by
        the search; the column names keep the '_L2dist' suffix whatever
        `method` is).
    """
    print("\nLoad chunks and their embeddings...")
    chunks, embeddings = load_chunks(embeddings_filepath)

    print("\nCreate FAISS (GPU) index...")
    index = create_gpu_index(vecs=embeddings,
                             n_gpus=n_gpus,
                             method=method)

    print("\nLoad questions...")
    questions, answers = load_questions(questions_filepath, questions_type)

    print("\nEncode questions...")
    quest_embeddings = encode_sentences(model_name_or_path=model_name_or_path,
                                        cache=cache,
                                        sentences=questions)

    print("\nPerform evaluation...")
    # Column layout of the results dataframe.
    cols = ['Question', 'Answer', 'Search_score', 'Top_result_idx']
    cols.extend('result' + str(i + 1) for i in range(topk))
    cols.extend('result' + str(i + 1) + '_L2dist' for i in range(topk))

    # Rows are collected in a list and turned into a dataframe once at the
    # end: DataFrame.append copied the whole frame on every call and no
    # longer exists in pandas >= 2.0.
    rows = []

    # For each question-answer pair, compute a score according to the results of FAISS search.
    for i, (Q, E, A) in enumerate(zip(questions, quest_embeddings, answers)):

        print("\nQUESTION {}: '{}'".format(i, Q))
        print("ANSWER: '{}'".format(A))

        # Process each of the possible answers (separated by ';').
        processed_ans = [process_text(ans) for ans in A.split(';')]

        # Find topk chunks with FAISS search. The query is reshaped to a
        # single-row matrix whatever the embedding size is.
        result_dist, result_idx = index.search(E.reshape(1, -1), k=topk)

        # Get these chunks and their distances.
        result_chunks = [chunks[idx] for idx in result_idx[0]]
        result_distances = result_dist[0]

        # For each result chunk, compute a score.
        # The score is defined as the percentage of words in the answer that appear in that chunk.
        # If multiple answers are possible, the max score is taken.
        scores = []
        for chunk in result_chunks:
            processed_chunk = process_text(chunk)
            scores.append(max(compute_score(processed_chunk, ans) for ans in processed_ans))

        # Get the final score as the max score among all chunk results, and
        # the (0-based) position of the first chunk reaching it.
        max_score = max(scores)
        top_chunk = scores.index(max_score)
        print("  SCORE: '{:.2f}'  -  Chunk: {}".format(max_score, top_chunk))

        # Build the row; the stored rank is 1-based to match the 'resultN' columns.
        new_row = [Q, A, max_score, top_chunk + 1]
        new_row.extend(result_chunks)
        new_row.extend(result_distances)
        rows.append(new_row)

    df = pd.DataFrame(rows, columns=cols)

    # NOTE(review): no plotting is performed here; the message is kept so the
    # printed output stays the same.
    print("\nPlot results...")
    print("\nDone.")  # Previously placed after the return and never reached.
    return df

## NetBERT

In [154]:
# Run the evaluation with the model stored under '_models/netbert-830000/' and
# the matching chunk embeddings, restricted to questions of type 'Knowledge'.
# The remaining arguments of run() keep their default values.
df = run(
    model_name_or_path='/raid/antoloui/Master-thesis/Code/_models/netbert-830000/',
    embeddings_filepath='/raid/antoloui/Master-thesis/Data/QA/embeddings/netbert_embeddings.h5',
    questions_filepath='/raid/antoloui/Master-thesis/Data/QA/questions.csv', 
    questions_type='Knowledge'
)


Load chunks and their embeddings...

Create FAISS (GPU) index...

Load questions...

Encode questions...
   Loading pretrained model/tokenizer...
   Tokenizing sentences...
   Padding/Truncating sentences to 70 tokens...
   Creating attention masks...
   Converting inputs to torch tensors...
   Encoding sentences...

Perform evaluation...

QUESTION 0: 'Which protocols are examples of TCP/IP transport layer protocols? '
ANSWER: 'UDP;TCP'
  SCORE: '100.00'  -  Chunk: 1

QUESTION 1: 'Which protocols are examples of TCP/IP data link layer protocols? '
ANSWER: 'Ethernet;PPP'
  SCORE: '100.00'  -  Chunk: 1

QUESTION 2: 'The process of HTTP asking TCP to send some data and making sure that it is received correctly is an example of what? '
ANSWER: 'Adjacent-layer interaction'
  SCORE: '0.00'  -  Chunk: 0

QUESTION 3: 'The process of TCP on one computer marking a TCP segment as segment 1, and the receiving computer then acknowledging the receipt of TCP segment 1 is an example of what? '
ANSWER

In [155]:
# Display the results dataframe returned by run().
df

Unnamed: 0,Question,Answer,Search_score,Top_result_idx,result1,result2,result3,result4,result5,result6,...,result1_L2dist,result2_L2dist,result3_L2dist,result4_L2dist,result5_L2dist,result6_L2dist,result7_L2dist,result8_L2dist,result9_L2dist,result10_L2dist
0,Which protocols are examples of TCP/IP transpo...,UDP;TCP,100.0,2,Chapter 1: to TCP/IP Networking 33 1 Link App...,The key difference between TCP and UDP is that...,Example 17-3 Verifying IP Addresses on Cisco R...,Chapter 1: to TCP/IP Networking 21 1 To help ...,"They can be used to query, compile, store, and...",The link layer includes wide-area network 30 ...,...,19.925415,20.970161,21.385132,21.745728,21.897537,21.912086,21.933212,21.979645,21.985237,21.993469
1,Which protocols are examples of TCP/IP data li...,Ethernet;PPP,100.0,2,Chapter 1: to TCP/IP Networking 33 1 Link App...,Table 3-3 Comparing HDLC Header Fields to Ethe...,So the comparison of OSI layers to other proto...,Example 17-3 Verifying IP Addresses on Cisco R...,Chapter 1: to TCP/IP Networking 21 1 To help ...,The link layer includes wide-area network 30 ...,...,18.387703,19.333382,19.536171,19.566399,19.644218,19.686020,19.714355,20.142326,20.228050,20.335007
2,The process of HTTP asking TCP to send some da...,Adjacent-layer interaction,0.0,1,"On the exam, you should focus on isolating the...","Note that, even though it isn’t shown in the f...",NOTE The full version of most web addresses—al...,ACK=1000 Window=3000 SEQ=1000 SEQ=2000 SEQ=300...,Which of my Applications gets the data in each...,The first part gives advice about common probl...,...,16.544914,16.702370,16.829109,17.010269,17.101677,17.213348,17.221924,17.240387,17.250977,17.488907
3,The process of TCP on one computer marking a T...,Same-layer interaction,100.0,1,Note that this figure shows the same HTTP head...,"SYN, DPORT=80, SPORT=1027 Web Browser Web Serv...",Many protocols operate under these same concep...,"Note that, even though it isn’t shown in the f...",ACK=1000 Window=3000 SEQ=1000 SEQ=2000 SEQ=300...,"Clearly, the server did not receive the bytes ...",...,11.734894,12.190300,12.211540,12.456306,12.488312,12.570358,12.715797,12.788750,13.078743,13.101845
4,The process of a web server adding a TCP heade...,Data encapsulation,100.0,4,"A web browser, which is software installed on ...","Note that, even though it isn’t shown in the f...",NOTE The full version of most web addresses—al...,TCP/IP Original Link Application Transport Int...,The pop-up window should display the URI to wh...,"DNS defines protocols, as well as standards fo...",...,13.825691,14.119911,14.523613,14.598099,14.653305,14.660065,14.830582,14.879692,14.893784,14.966202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,What type of router memory is used to store th...,RAM,100.0,1,"Conversely, the speed command is an interface ...",Cisco loads flash memory with a single IOS whe...,3297 bytes copied in 0.492 secs (6701 bytes/se...,"For example, the console runs at a speed of 96...",Example 35-2 copy tftp flash Command Copies th...,The router will now go through the boot sequen...,...,17.283913,18.642555,18.826500,19.314056,19.606171,19.835243,19.986214,20.011642,20.181213,20.466949
86,What is the name of the new Cisco IOS image fi...,Universal,100.0,4,verify /md5 filesystem:name [MD5-hash] Perform...,While the IOS for each type of device has some...,Cisco loads flash memory with a single IOS whe...,Around the late 2000s Cisco introduced a new p...,Example 35-6 SCP Client IOS Copy from a Mac to...,"IOS Images per Model, Series, and per Software...",...,15.929375,16.520851,16.831612,17.419342,17.424011,17.614838,17.630470,17.792625,17.942276,17.984657
87,What command enables you to show the UDI of yo...,show license udi,100.0,1,Table 36-3 Chapter Review Tracking Review Elem...,The real world uses paper receipts to show tha...,With access to the router console and the abil...,verify /md5 filesystem:name [MD5-hash] Perform...,ROMMON contains a small and different set of C...,Table 34-7 Device Hardening Configuration Comm...,...,17.079720,17.845657,17.868271,18.071480,18.151299,18.165085,18.321045,18.425659,18.536514,18.550331
88,What is a CLI command on a router that is usef...,license install url,100.0,3,verify /md5 filesystem:name [MD5-hash] Perform...,For example: ■ show running-config command: Re...,Table 36-3 Chapter Review Tracking Review Elem...,Around the late 2000s Cisco introduced a new p...,Cisco Product License Registration Portal Rout...,"Later, the customer could choose to use softwa...",...,13.113739,13.703079,13.826294,13.887352,13.909370,13.977081,14.274277,14.320496,14.408119,14.497597


## BERT

In [None]:
# Same evaluation with the 'bert-base-cased' model and its own chunk embeddings.
# questions_type is left at its default (None), so all questions are used.
run(model_name_or_path='bert-base-cased',
    embeddings_filepath='/raid/antoloui/Master-thesis/Data/QA/embeddings/bert_embeddings.h5',
    questions_filepath='/raid/antoloui/Master-thesis/Data/QA/questions.csv')