# FAISS
This notebook uses the Facebook AI Similarity Search (FAISS) library, which has an excellent GPU implementation of "brute-force" kNN (meaning that no approximation techniques are used that would compromise the accuracy of the search).

In [1]:
import sys
import time

import pandas as pd
import numpy as np

import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

import faiss

Using TensorFlow backend.


## 1. FAISS index

### 1.1. Load CCNA chunks

### 1.2. Create FAISS index

To create a GPU index with FAISS, one first needs to create it on the CPU and then copy it to the GPU. Note that a "flat" index means that the search is brute-force, with no approximation techniques.

In [5]:
def create_gpu_index(vecs, n_gpus, method='l2'):
    """
    Create an exact ("flat") FAISS index on GPU(s) and add the given vectors to it.

    Args:
        vecs: 2-D array-like of shape (n_vectors, dim). It is converted to a
            C-contiguous float32 numpy array before being added to the index.
        n_gpus: number of GPUs to use; forwarded as `ngpu` to
            `faiss.index_cpu_to_all_gpus`.
        method: 'l2' for L2 distance or 'ip' for inner product (also usable for
            cosine similarity if the vectors are normalized beforehand).

    Returns:
        The GPU index returned by `faiss.index_cpu_to_all_gpus`, with `vecs` added.

    Raises:
        ValueError: if `method` is neither 'l2' nor 'ip', or if `vecs` is not 2-D.
    """
    # Validate the arguments before touching any GPU resources. A bad argument is
    # reported with an exception rather than sys.exit(0), which would signal
    # "success" and does not stop a notebook cleanly.
    if method not in ('l2', 'ip'):
        raise ValueError(
            "Please choose between L2 distance ('l2') or Inner Product ('ip') "
            "as brute-force method for exact search (got {!r}).".format(method)
        )

    # FAISS' Python wrappers expect C-contiguous float32 matrices.
    # This is a no-op (no copy) when `vecs` already has that layout.
    vecs = np.ascontiguousarray(vecs, dtype=np.float32)
    if vecs.ndim != 2:
        raise ValueError(
            "`vecs` must be a 2-D array of shape (n_vectors, dim), got shape {}.".format(vecs.shape)
        )
    dim = vecs.shape[1]

    print("  Number of available GPUs: {}  -  Using: {}".format(faiss.get_num_gpus(), n_gpus))

    print("  Building flat CPU index...")
    if method == 'l2':
        cpu_index = faiss.IndexFlatL2(dim)  # Exact search for L2 distance.
    else:
        cpu_index = faiss.IndexFlatIP(dim)  # Exact search for inner product.

    print("  Converting to flat GPU index...")
    # If using multiple GPUs, enable sharding so that the dataset is divided
    # across the GPUs rather than replicated on each of them.
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co, ngpu=n_gpus)  # Convert CPU index to GPU index.

    print("  Adding vectors to GPU index...")
    gpu_index.add(vecs)

    return gpu_index

In [6]:
# NOTE: `bert_embeddings` and `netbert_embeddings` are produced by the cell in
# section 2.1 ("Load and encode questions"); that cell must be run before this one.

# Each model gets its own index variable so that the BERT-base index is not
# overwritten by the NetBERT one.
print("\nBuilding BERT-base FAISS index...")
bert_index = create_gpu_index(vecs=bert_embeddings,
                              n_gpus=1,
                              method='l2')

print("\nBuilding NetBERT FAISS index...")
netbert_index = create_gpu_index(vecs=netbert_embeddings,
                                 n_gpus=1,
                                 method='l2')
print("\nDONE.")

Building BERT-base FAISS index...
  Number of available GPUs: 8  -  Using: 1
  Building flat CPU index...
  Converting to flat GPU index...
  Adding vectors to GPU index...
Building NetBERT FAISS index...
  Number of available GPUs: 8  -  Using: 1
  Building flat CPU index...
  Converting to flat GPU index...
  Adding vectors to GPU index...
DONE.


## 2. FAISS search

### 2.1. Load and encode questions

In [2]:
def load_questions(filepath):
    """
    Read a semicolon-separated CSV file of questions.

    Args:
        filepath: path of the CSV file; it must contain a `Question` column.

    Returns:
        A tuple `(frame, question_values)` where `frame` is the full DataFrame
        and `question_values` holds the values of its `Question` column.
    """
    frame = pd.read_csv(filepath, sep=";")
    question_values = frame.Question.values
    return frame, question_values


def encode_sentences(model_name_or_path, cache, sentences):
    """
    Given a list of sentences and a model, get the embeddings of these sentences
    as the average of the word embeddings of the last layer.

    Args:
        model_name_or_path: model identifier or local directory passed to
            `BertTokenizer.from_pretrained` / `BertModel.from_pretrained`.
        cache: directory passed as `cache_dir` to both `from_pretrained` calls.
        sentences: non-empty sequence of strings to encode. All sentences are
            encoded in a single forward pass.

    Returns:
        A numpy array with one row per sentence, each row being the mean of the
        last-layer hidden states over the non-padding tokens of that sentence.

    Raises:
        ValueError: if `sentences` is empty.
    """
    # Fail early with a clear message instead of an opaque "max() arg is an
    # empty sequence" after the model has already been loaded.
    if len(sentences) == 0:
        raise ValueError("encode_sentences() requires at least one sentence.")

    max_bert_len = 512  # Longest sequence this function feeds to the model.

    print("   Loading pretrained model/tokenizer...")
    # Use the same cache directory for the tokenizer as for the model.
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path, cache_dir=cache)
    model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True, cache_dir=cache) # Will output all hidden_states.
    model.eval()  # Inference only: make sure dropout is disabled.

    print("   Tokenizing sentences...")
    tokenized = []
    for sent in sentences:
        ids = tokenizer.encode(sent, add_special_tokens=True)
        if len(ids) > max_bert_len:
            # Plain post-truncation would cut off the closing [SEP] token;
            # keep it as the last token of the truncated sequence.
            ids = ids[:max_bert_len - 1] + [tokenizer.sep_token_id]
        tokenized.append(ids)

    max_len = max(len(ids) for ids in tokenized)  # <= max_bert_len after the truncation above.

    print("   Padding/Truncating sentences to {} tokens...".format(max_len))
    padded = pad_sequences(tokenized, maxlen=max_len, dtype="long",
                           value=0, truncating="post", padding="post")

    print("   Creating attention masks...")
    # 1 for real tokens, 0 for padding (assumes token id 0 is only used for padding).
    attention_mask = np.where(padded != 0, 1, 0)

    print("   Converting inputs to torch tensors...")
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    print("   Encoding sentences...")
    with torch.no_grad():
        # output is a 3-tuple where:
        #  - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
        #  - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
        #  - output[2] are all hidden_states, i.e. a 13-tuple of torch tensors of shape (batch_size, sequence_length, hidden_size): 12 encoders-outputs + initial embedding outputs.
        output = model(input_ids, attention_mask=attention_mask)

    # For each sentence, take the last-layer embeddings of its real (non-padding)
    # tokens and represent the sentence by their average. Padding is appended at
    # the end, so the real tokens are the first `n_tokens` positions.
    last_hidden_states = output[0]
    token_counts = attention_mask.sum(dim=1)
    sentence_embeddings = np.array([
        hidden[:int(n_tokens)].mean(dim=0).numpy()
        for hidden, n_tokens in zip(last_hidden_states, token_counts)
    ])

    return sentence_embeddings

In [4]:
# Locations of the question file, the model cache and the NetBERT checkpoint.
QUESTIONS_CSV = '/raid/antoloui/Master-thesis/Data/QA/questions.csv'
CACHE_DIR = '/raid/antoloui/Master-thesis/Code/_cache/'
NETBERT_DIR = '/raid/antoloui/Master-thesis/Code/_models/netbert-830000/'

print("\nLoading questions...")
df, questions = load_questions(QUESTIONS_CSV)

# Encode the same questions with both models so that they can be compared.
print("\nEncoding questions with BERT-base...")
bert_embeddings = encode_sentences(
    model_name_or_path='bert-base-cased',
    cache=CACHE_DIR,
    sentences=questions,
)

print("\nEncoding questions with NetBERT...")
netbert_embeddings = encode_sentences(
    model_name_or_path=NETBERT_DIR,
    cache=CACHE_DIR,
    sentences=questions,
)
print("\nDONE.")

Loading questions...

Encoding questions with BERT-base...
   Loading pretrained model/tokenizer...
   Tokenizing sentences...
   Padding/Truncating sentences to 125 tokens...
   Creating attention masks...
   Converting inputs to torch tensors...
   Encoding sentences...

Encoding questions with NetBERT...
   Loading pretrained model/tokenizer...
   Tokenizing sentences...
   Padding/Truncating sentences to 125 tokens...
   Creating attention masks...
   Converting inputs to torch tensors...
   Encoding sentences...

DONE.


### 2.2. Search with FAISS

In [None]:
# Take the question i.
i = 4
question = questions[i]  # `questions` is the array of question strings returned by load_questions.

# Find the 5 nearest neighbours of that question in the NetBERT index.
# D holds the distances and I the positions of the neighbours in the index.
# NOTE(review): as built in this notebook, the index is populated with the question
# embeddings themselves, so the closest hit is expected to be question i itself;
# index the chunk embeddings instead to retrieve the most similar chunks.
query = netbert_embeddings[i].reshape(1, -1)  # FAISS expects a 2-D (n_queries, dim) matrix.
D, I = netbert_index.search(query, k=5)

