# FAISS
The Facebook AI Similarity Search (FAISS) library has an excellent GPU implementation of "brute-force" kNN search (meaning that no approximation techniques that would compromise the accuracy of the search are used).

In [None]:
import sys
import time
import re
import string

import pandas as pd
import numpy as np

import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

import faiss
from nltk.corpus import stopwords


def load_chunks(filepath):
    """
    Read the HDF file at `filepath` and split it into chunks and embeddings.

    The last column of the stored dataframe is returned as the chunks; all
    preceding columns are returned as the embedding matrix.

    Returns a `(chunks, embeddings)` tuple, where `embeddings` is a
    C-contiguous float32 array.
    """
    frame = pd.read_hdf(filepath)

    # Last column -> chunks, every other column -> embedding values.
    chunks = frame.iloc[:, -1].values
    vectors = frame.iloc[:, :-1].values

    # A contiguous float32 layout is required for the FAISS indexing done later.
    return chunks, np.ascontiguousarray(vectors, dtype=np.float32)


def create_gpu_index(vecs, n_gpus, method):
    """
    Create a flat (brute-force) FAISS index on GPU(s) and add `vecs` to it.

    To create a GPU index with FAISS, one first needs to create it on CPU then
    copy it on GPU. A "flat" index means that the search is exact, with no
    approximation techniques.

    Parameters:
        vecs: 2-D array of vectors to index; `vecs.shape[1]` is used as the
            index dimension.
        n_gpus: number of GPUs passed to `faiss.index_cpu_to_all_gpus`.
        method: 'l2' for L2 distance or 'ip' for inner product (also usable
            for cosine similarity if the vectors are normalized beforehand).

    Returns:
        The GPU index containing `vecs`.

    Raises:
        SystemExit: with a non-zero status if `method` is neither 'l2' nor 'ip'.
    """
    # Validate first so that nothing is allocated for an unsupported method.
    if method not in ('l2', 'ip'):
        # Passing the message to sys.exit() writes it to stderr and sets the exit
        # status to 1, so the failure is not reported as a success.
        sys.exit("Error: Please choose between L2 distance ('l2') or Inner Product ('ip') as brute-force method for exact search. Exiting...")

    # Build flat CPU index.
    dim = vecs.shape[1]
    if method == 'l2':
        cpu_index = faiss.IndexFlatL2(dim)  # Exact search for L2.
    else:
        cpu_index = faiss.IndexFlatIP(dim)  # Exact search for inner product.

    # Convert to flat GPU index.
    # If using multiple GPUs, sharding divides the dataset across the GPUs rather than replicating it.
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co, ngpu=n_gpus)

    # Add vectors to GPU index.
    gpu_index.add(vecs)

    return gpu_index


def load_questions(filepath, questions_type):
    """
    Read question/answer pairs from a ';'-separated CSV file.

    If `questions_type` is None every row is kept; otherwise only the rows
    whose 'QuestionType' column equals `questions_type` are kept.

    Returns a `(questions, answers)` tuple of arrays taken from the
    'Question' and 'Answer' columns.
    """
    table = pd.read_csv(filepath, sep=";")

    # Restrict to the requested question type, if one was given.
    if questions_type is not None:
        table = table[table['QuestionType'] == questions_type]

    return table.Question.values, table.Answer.values


def encode_sentences(model_name_or_path, cache, sentences):
    """
    Embed each sentence as the average of its last-layer token embeddings.

    Parameters:
        model_name_or_path: name or path given to `BertTokenizer.from_pretrained`
            and `BertModel.from_pretrained`.
        cache: directory passed as `cache_dir` when loading the model.
        sentences: iterable of sentences to encode.

    Returns:
        A numpy array holding one embedding per sentence.
    """
    print("   Loading pretrained model/tokenizer...")
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    # output_hidden_states=True makes the model return all hidden states as well.
    model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True, cache_dir=cache)

    print("   Tokenizing sentences...")
    token_ids = []
    for sentence in sentences:
        token_ids.append(tokenizer.encode(sentence, add_special_tokens=True))

    # Pad to the longest sentence, but never beyond 512 tokens.
    max_len = min(max(len(ids) for ids in token_ids), 512)

    print("   Padding/Truncating sentences to {} tokens...".format(max_len))
    padded = pad_sequences(
        token_ids,
        maxlen=max_len,
        dtype="long",
        value=0,
        truncating="post",
        padding="post",
    )

    print("   Creating attention masks...")
    # 1 wherever the token id is non-zero, 0 on the padding positions.
    mask_array = np.where(padded != 0, 1, 0)

    print("   Converting inputs to torch tensors...")
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(mask_array)

    print("   Encoding sentences...")
    with torch.no_grad():
        # The output is indexed as follows:
        #  - output[0]: last_hidden_state, shape (batch_size, sequence_length, hidden_size).
        #  - output[1]: pooler_output, shape (batch_size, hidden_size).
        #  - output[2]: all hidden states (initial embedding output + one per encoder layer).
        output = model(input_ids, attention_mask=attention_mask)

    # Average, per sentence, the last-layer embeddings of its non-padding tokens.
    token_embeddings = output[0]
    pooled = []
    for sent_embeddings, sent_mask in zip(token_embeddings, attention_mask):
        n_real_tokens = int((sent_mask == 1).sum())
        pooled.append(torch.mean(sent_embeddings[:n_real_tokens], dim=0).numpy())

    return np.array(pooled)


def remove_punctuation(text):
    """
    Return `text` with the characters . , : ; ! ? { } ( ) stripped out.
    """
    punctuation = re.compile('([.,:;!?{}()])')
    return punctuation.sub(r'', text)


def process_text(text):
    """
    Normalize `text` into a list of lowercase tokens without English stopwords.

    The text is stripped of common punctuation, lowercased, split on
    whitespace, and filtered against NLTK's English stopword list.

    Returns:
        The list of remaining tokens, in their original order.
    """
    # Remove punctuations, then lower the text.
    processed = remove_punctuation(text).lower()

    # Load the stopword list once per call instead of once per token, and use a
    # set so that each membership test is a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))

    # Remove stopwords.
    return [w for w in processed.split() if w not in english_stopwords]


def compute_score(chunk, answer):
    """
    Score a chunk as the fraction of answer words that appear in it.

    Parameters:
        chunk: collection of words from the chunk (tested with `in`).
        answer: sequence of words from the answer.

    Returns:
        `len(matching words) / len(answer)`, or 0.0 when `answer` is empty.
    """
    # An empty answer (e.g. one made only of stopwords) cannot match anything;
    # returning 0.0 avoids a division by zero.
    if len(answer) == 0:
        return 0.0

    # Get all words from answer present in the chunk.
    matched = [w for w in answer if w in chunk]

    # Define the score as the percentage of words from answer present in the chunk.
    return len(matched) / len(answer)


def run(model_name_or_path, embeddings_filepath, questions_filepath, questions_type=None,
        cache='/raid/antoloui/Master-thesis/Code/_cache/', method='l2', n_gpus=1, topk=10):
    """
    Evaluate retrieval of answer-bearing chunks for a set of questions.

    The chunk embeddings are indexed with FAISS, each question is encoded with
    the given model, and the `topk` nearest chunks are scored by the fraction
    of answer words they contain. The max and mean score over the retrieved
    chunks are printed for every question.

    Parameters:
        model_name_or_path: model used to encode the questions.
        embeddings_filepath: HDF file holding the chunks and their embeddings.
        questions_filepath: ';'-separated CSV file with the questions and answers.
        questions_type: if not None, only questions of this type are evaluated.
        cache: cache directory used when loading the model.
        method: 'l2' or 'ip', the exact-search metric of the FAISS index.
        n_gpus: number of GPUs used for the index.
        topk: number of chunks retrieved per question.
    """
    print("\nLoad chunks and their embeddings...") 
    chunks, embeddings = load_chunks(embeddings_filepath)
    
    print("\nCreate FAISS (GPU) index...")
    index = create_gpu_index(vecs=embeddings, 
                             n_gpus=n_gpus, 
                             method=method)
    
    print("\nLoad questions...")
    questions, answers = load_questions(questions_filepath, questions_type)
    
    print("\nEncode questions...")
    quest_embeddings = encode_sentences(model_name_or_path=model_name_or_path, 
                                        cache=cache,
                                        sentences=questions)
    
    print("\nPerform evaluation...")
    # For each question-answer pair...
    for i, (Q, E, A) in enumerate(zip(questions, quest_embeddings, answers)):
        
        print("\nQUESTION {}: '{}'".format(i, Q))
        print("ANSWER: '{}'".format(A))
        
        # Process the answer; several acceptable answers may be separated by ';'.
        processed_ans = [process_text(ans) for ans in A.split(';')]

        # Find topk chunks with FAISS search.
        # reshape(1, -1) builds a single-row query without hard-coding the embedding size.
        _, result_idx = index.search(E.reshape(1, -1), k=topk)

        # For each result chunk, compute a score.
        # The score is defined as the percentage of words in the answer that appear in that chunk.
        # If multiple answers are possible, the max score is taken.
        scores = []
        for idx in result_idx[0]:
            # FAISS fills missing results with -1 when fewer than k vectors are
            # available; such an index must not be used to look up a chunk.
            if idx < 0:
                continue

            # Get the retrieved chunk (by its FAISS result index) and process it.
            processed_chunk = process_text(chunks[idx])

            # Score the chunk against each answer and keep the best one.
            scores.append(max(compute_score(processed_chunk, ans) for ans in processed_ans))

        # Get the final scores as (1) the max score, (2) the mean score.
        max_score = max(scores, default=0.0)
        mean_score = sum(scores)/len(scores) if scores else 0.0
        print("  MAX SCORE: '{}'".format(max_score))
        print("  MEAN SCORE: '{}'".format(mean_score))
        
    
    print("\nDone.")

## NetBERT

In [None]:
# Evaluate the NetBERT checkpoint on the questions of type 'Knowledge' only.
run(model_name_or_path='/raid/antoloui/Master-thesis/Code/_models/netbert-830000/',
    embeddings_filepath='/raid/antoloui/Master-thesis/Data/QA/embeddings/netbert_embeddings.h5',
    questions_filepath='/raid/antoloui/Master-thesis/Data/QA/questions.csv',
    questions_type='Knowledge')

## BERT

In [None]:
# Evaluate the 'bert-base-cased' model on every question (no question-type filter).
run(
    model_name_or_path='bert-base-cased',
    embeddings_filepath='/raid/antoloui/Master-thesis/Data/QA/embeddings/bert_embeddings.h5',
    questions_filepath='/raid/antoloui/Master-thesis/Data/QA/questions.csv',
)