# Cisco Corpus Search
Given a query (question), search for the best-matching chunks in the whole Cisco corpus using the Facebook AI Similarity Search (FAISS) library.

**FAISS**: FAISS has an excellent GPU implementation of "brute-force" kNN (meaning that no approximation technique is used that would compromise the accuracy of the search).

## 1. Initialization

The 'Initialization' cell will load in memory the FAISS index created for the whole Cisco corpus. Hence, that cell needs to be run only once after starting the notebook.

**Instructions**

1. Run the cell.
2. Choose the number of GPUs you want to use for the search (the more the faster).
3. Click on the 'Init' button.

In [1]:
import os
import pickle
import time
import datetime

import faiss
import ipywidgets as widgets


#---------------------------------------------------------------------------
#                                  CODE
#---------------------------------------------------------------------------
def format_time(elapsed):
    """
    Convert a duration given in seconds into a readable 'h:mm:ss' string.

    The duration is rounded to a whole number of seconds first; durations of
    one day or more are rendered by `datetime.timedelta` as 'N day(s), h:mm:ss'.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


def init(index_dir, n_gpu):
    """
    Load the FAISS index and all text chunks stored in `index_dir`.

    Args:
        index_dir: directory expected to contain 'cisco_corpus.index' (FAISS index)
            and 'cisco_chunks.txt' (pickled text chunks).
        n_gpu: number of GPUs to use for the search. 0 keeps the index on CPU; values
            larger than the number of GPUs reported by FAISS are clamped to that number.

    Returns:
        A tuple (index, chunks): the FAISS index (moved to GPU when requested and
        available) and the unpickled chunks object.

    Raises:
        SystemExit: with status 1, after printing an error message, if either file is missing.
    """
    import sys  # Local import: `sys` is not among the imports at the top of the cell.

    index_path = os.path.join(index_dir, "cisco_corpus.index")
    chunks_path = os.path.join(index_dir, "cisco_chunks.txt")

    # Check both files up front so that a missing chunks file is reported
    # before spending time loading the index and moving it to the GPUs.
    if not os.path.exists(index_path):
        print("Error: no index found in {}... Make sure to create the index before searching in corpus. Exiting...".format(index_dir))
        sys.exit(1)
    if not os.path.exists(chunks_path):
        print("Error: no chunks found in {}... Make sure to create the index before searching in corpus. Exiting...".format(index_dir))
        sys.exit(1)

    # Load FAISS index.
    index = faiss.read_index(index_path)
    available_gpus = faiss.get_num_gpus()
    if n_gpu > 0 and available_gpus > 0:
        n_gpu = min(n_gpu, available_gpus)  # Never ask for more GPUs than FAISS can see.
        co = faiss.GpuMultipleClonerOptions()
        co.shard = True  # If using multiple GPUs, shard the dataset across them rather than replicating it.
        index = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=n_gpu)  # Convert CPU index to GPU index.

    # Load text chunks.
    # NOTE: despite its '.txt' extension the file is a pickle. Unpickling can execute
    # arbitrary code, so only load chunk files that you created yourself.
    with open(chunks_path, "rb") as f:
        chunks = pickle.load(f)

    return index, chunks


#---------------------------------------------------------------------------
#                                 WIDGETS
#---------------------------------------------------------------------------
# GPUs dropdown list: one entry per possible GPU count, from 0 (CPU only)
# up to the number of GPUs reported by FAISS.
nb_gpus = [str(n) for n in range(faiss.get_num_gpus() + 1)]
choose_gpus = widgets.Dropdown(
    options=nb_gpus,
    # Default to all available GPUs. A hard-coded '8' is not a valid selection
    # when fewer than 8 GPUs are available, which makes the widget fail to build.
    value=nb_gpus[-1],
    description='GPUs:',
    disabled=False,
)

# Output area and button used to load the FAISS index.
init_out = widgets.Output()
init_btn = widgets.Button(description='Init')


def init_btn_eventhandler(obj):
    """
    Click callback of the 'Init' button.

    Reads the selected number of GPUs, loads the index and chunks with `init`,
    stores them in the globals `index` and `chunks`, and reports the elapsed
    time inside the `init_out` output area.
    """
    global index, chunks
    init_out.clear_output()  # Start from an empty output area.
    n_selected = int(choose_gpus.value)  # Dropdown values are strings.
    with init_out:
        print("\nLoading FAISS index with {} GPUs...".format(n_selected))
        started_at = time.time()
        index, chunks = init(index_dir='/raid/antoloui/Master-thesis/_data/search/cisco/', n_gpu=n_selected)
        took = format_time(time.time() - started_at)
        print("  Done.  -  Took: {}".format(took))


init_btn.on_click(init_btn_eventhandler)

# Lay out the controls on one row, with the output area underneath.
box = widgets.HBox([choose_gpus, init_btn])
widgets.VBox([box, init_out])

Using TensorFlow backend.


VBox(children=(HBox(children=(Dropdown(description='GPUs:', index=8, options=('0', '1', '2', '3', '4', '5', '6…

## 2. Search

Given a query, the 'Search' cell will return the top-k most similar text chunks from the Cisco corpus.

**Instructions**

1. Run the cell.
2. Type a query.
3. Choose the number of results to display.
4. Click on the 'Search' button.

In [7]:
import torch
import numpy as np

from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences


#---------------------------------------------------------------------------
#                                  CODE
#---------------------------------------------------------------------------
# Cache of already loaded (tokenizer, model) pairs, keyed by model path, so that the
# weights are read from disk once per session instead of once per query.
_NETBERT_CACHE = {}


def _load_netbert(model_name_or_path):
    """
    Return the (tokenizer, model) pair for `model_name_or_path`, loading it on first use only.
    """
    if model_name_or_path not in _NETBERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
        model = BertModel.from_pretrained(model_name_or_path, output_hidden_states=True)  # Will output all hidden_states.
        _NETBERT_CACHE[model_name_or_path] = (tokenizer, model)
    return _NETBERT_CACHE[model_name_or_path]


def encode_queries(sentences, model_name_or_path='/raid/antoloui/Master-thesis/_models/netbert-final/'):
    """
    Given a list of sentences, get their embeddings with NetBERT as the average
    of the word embeddings of the last layer (padding tokens excluded).

    Args:
        sentences: list of strings to encode.
        model_name_or_path: location of the pretrained model/tokenizer.

    Returns:
        A numpy array with one embedding row per sentence. An empty input
        yields an array with zero rows.
    """
    # Loading pretrained model/tokenizer (cached after the first call).
    tokenizer, model = _load_netbert(model_name_or_path)

    # Nothing to encode: return an empty batch instead of failing on max([]) below.
    if len(sentences) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Tokenizing sentences.
    tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

    # Longest sentence of the batch, capped at 512 tokens.
    max_len = min(max(len(ids) for ids in tokenized), 512)

    # Padding/Truncating sentences to max_len tokens (padding id is 0, added at the end).
    padded = pad_sequences(tokenized, maxlen=max_len, dtype="long",
                           value=0, truncating="post", padding="post")

    # Creating attention masks: 1 for real tokens, 0 for padding.
    attention_mask = np.where(padded != 0, 1, 0)

    # Converting inputs to torch tensors.
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    # Encoding sentences.
    with torch.no_grad():
        # output is a tuple where:
        #  - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
        #  - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
        #  - output[2] are all hidden_states (returned because output_hidden_states=True): encoders-outputs + initial embedding outputs.
        output = model(input_ids, attention_mask=attention_mask)

    # For each sentence, average the last-layer embeddings of its real tokens.
    last_hidden_states = output[0]
    sentence_embeddings = []
    for embeddings, mask in zip(last_hidden_states, attention_mask):
        # Padding is appended at the end, so the real tokens are the first `n_tokens` positions.
        n_tokens = int(mask.sum())
        sentence_embeddings.append(torch.mean(embeddings[:n_tokens], dim=0).numpy())

    return np.array(sentence_embeddings)


def search_corpus(query, index, chunks, topk):
    """
    Encode `query` with NetBERT, search its `topk` nearest chunks in the FAISS
    index and print them together with their distance.

    Args:
        query: the question/sentence to search for.
        index: FAISS index to search in.
        chunks: collection of text chunks, indexed by the ids returned by the index.
        topk: number of results to retrieve.
    """
    # Encode query with NetBERT.
    query_embedding = encode_queries([query])[0]

    # FAISS expects a contiguous float32 matrix of shape (n_queries, dim). Using -1
    # lets the dimension follow the model instead of hard-coding 768.
    query_matrix = np.ascontiguousarray(query_embedding.reshape(1, -1), dtype=np.float32)

    # Search topk results.
    result_dist, result_idx = index.search(query_matrix, k=topk)

    # Display topk results.
    for rank, (idx, dist) in enumerate(zip(result_idx[0], result_dist[0]), start=1):
        if idx < 0:
            # FAISS fills missing neighbours with -1 when fewer than topk results exist.
            # Without this check chunks[-1] would silently print the last chunk.
            continue
        print("\nTop {} result - (L2: {:.3f})".format(rank, dist))
        print("---------------------------")
        print(chunks[idx])


#---------------------------------------------------------------------------
#                                 WIDGETS
#---------------------------------------------------------------------------
# Free-text input in which the user types the query.
choose_query = widgets.Text(
    description='Query:',
    placeholder='Type your query',
    value='',
    layout=widgets.Layout(width='50%'),
    disabled=False,
)


# Integer slider selecting how many results (top-k) to display: 1 to 50, default 5.
choose_topk = widgets.IntSlider(
    description='Topk:',
    value=5,
    min=1,
    max=50,
    step=1,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False,
)

# Button widget for launching the search.
search_btn = widgets.Button(description='Search')
search_out = widgets.Output()
def search_btn_eventhandler(obj):
    """
    Click callback of the 'Search' button: run the search for the typed query
    and print the results inside the `search_out` output area.
    """
    search_out.clear_output()  # Clear output.
    with search_out:
        # `index` and `chunks` are globals created by the 'Init' button. Report a
        # clear message instead of a NameError when the search is run first.
        if 'index' not in globals() or 'chunks' not in globals():
            print("Error: the FAISS index is not loaded. Run the 'Initialization' cell and click on the 'Init' button first.")
            return
        search_corpus(query=choose_query.value, index=index, chunks=chunks, topk=choose_topk.value)
search_btn.on_click(search_btn_eventhandler)

# Display widgets.
box = widgets.HBox([choose_query, choose_topk, search_btn])
widgets.VBox([box, search_out])

VBox(children=(HBox(children=(Text(value='', description='Query:', layout=Layout(width='50%'), placeholder='Ty…