# RFC Corpus Search
Given a query (question), find the best-matching text chunks among all the RFC standards listed [here](https://tools.ietf.org/rfc/index).

**FAISS**: The search operation is performed by the Facebook AI Similarity Search (FAISS) library, which has an excellent GPU implementation of "brute-force" kNN (meaning that no approximation techniques are used that would compromise the accuracy of the search).

## 1. Initialization

The 'Initialization' cell loads into memory the FAISS index created for the corpus. Hence, that cell needs to be run only once after starting the notebook.

**Instructions**

1. Run the cell.
2. Choose the type of FAISS index.
3. Choose the number of GPUs you want to use for the search (the more the faster).
4. Click on the 'Init' button.

In [1]:
import os
import pickle
import time
import datetime

import faiss
import ipywidgets as widgets


#---------------------------------------------------------------------------
#                                  CODE
#---------------------------------------------------------------------------
def format_time(elapsed):
    """
    Convert a duration expressed in seconds into a printable string.

    The value is rounded to the nearest whole second and rendered the way
    `datetime.timedelta` prints itself (e.g. '1:01:01').
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


def init(dirpath, method, n_gpu):
    """
    Load the FAISS index and all text chunks stored in `dirpath`.

    Args:
        dirpath: directory containing '<name>.index' and 'chunks.txt'.
        method: 'L2' selects 'l2.index', 'Inner-Product' selects 'ip.index',
            any other value selects 'cos.index'.
        n_gpu: number of GPUs to shard the index over (capped at the number
            of GPUs FAISS detects). With 0, the index stays on the CPU.

    Returns:
        A tuple (index, chunks).

    Raises:
        FileNotFoundError: if the index file or the chunks file is missing.
    """
    # Get the method.
    index_name = 'l2' if method == 'L2' else 'ip' if method == 'Inner-Product' else 'cos'
    index_path = os.path.join(dirpath, index_name + ".index")
    chunks_path = os.path.join(dirpath, "chunks.txt")

    # Check both files up front so that a missing chunks file is reported
    # before the (large) index is read. Raise instead of exiting: the module
    # `sys` is not imported in this cell, and an exception carries the message
    # to the notebook output without trying to stop the interpreter.
    if not os.path.exists(index_path):
        raise FileNotFoundError(
            "Error: no index found in {}... Make sure to create the index "
            "before searching in corpus.".format(dirpath))
    if not os.path.exists(chunks_path):
        raise FileNotFoundError(
            "Error: no chunks found in {}... Make sure to create the index "
            "before searching in corpus.".format(dirpath))

    # Load FAISS index.
    index = faiss.read_index(index_path)
    if n_gpu > 0 and faiss.get_num_gpus() > 0:
        n_gpu = min(n_gpu, faiss.get_num_gpus())
        # If using multiple GPUs, enable sharding so that the dataset is
        # divided across the GPUs rather than replicated.
        co = faiss.GpuMultipleClonerOptions()
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=n_gpu)  # Convert CPU index to GPU index.

    # Load text chunks.
    # NOTE: pickle can execute arbitrary code while loading; only point
    # `dirpath` at files you created yourself.
    with open(chunks_path, "rb") as f:
        chunks = pickle.load(f)

    return index, chunks


#---------------------------------------------------------------------------
#                                 WIDGETS
#---------------------------------------------------------------------------
# Index dropdown list.
options = ['L2', 'Inner-Product', 'Cosine']
choose_index = widgets.Dropdown(
    options=options,
    value='L2',
    description='Index:',
    disabled=False,
)

# GPUs dropdown list: one entry per usable GPU count, from '0' up to the
# number of GPUs detected by FAISS.
nb_gpus = [str(n) for n in range(faiss.get_num_gpus() + 1)]
choose_gpus = widgets.Dropdown(
    options=nb_gpus,
    # Default to all detected GPUs. A fixed default such as '8' is rejected
    # by the Dropdown whenever it is not one of the options, i.e. on any
    # machine with fewer GPUs.
    value=nb_gpus[-1],
    description='GPUs:',
    disabled=False,
)

# Button that triggers the loading of the FAISS index, and the output area
# where the loading messages are shown.
init_btn = widgets.Button(description='Init')
init_out = widgets.Output()


def init_btn_eventhandler(obj):
    """
    Click handler of the 'Init' button.

    Reads the selected index type and number of GPUs from the dropdowns,
    loads the index and the text chunks with `init`, and stores them in the
    global variables `index` and `chunks`.
    """
    global index, chunks

    init_out.clear_output()
    n_selected = int(choose_gpus.value)
    with init_out:
        print("\nLoading FAISS index with {} GPUs...".format(n_selected))
        started = time.time()
        index, chunks = init(
            dirpath='/raid/antoloui/Master-thesis/_data/search/rfc/index/',
            method=choose_index.value,
            n_gpu=n_selected,
        )
        took = format_time(time.time() - started)
        print("  Done.  -  Took: {}".format(took))


init_btn.on_click(init_btn_eventhandler)

# Lay out the controls on one row, with the output area underneath.
box = widgets.HBox([choose_index, choose_gpus, init_btn])
widgets.VBox([box, init_out])

VBox(children=(HBox(children=(Dropdown(description='Index:', options=('L2', 'Inner-Product', 'Cosine'), value=…

## 2. Search

Given a query, the 'Search' cell will return the top-k most similar text chunks from the corpus.

**Instructions**

1. Run the cell.
2. Type a query.
3. Choose the number of results to display.
4. Click on the 'Search' button.

In [3]:
import time
import torch
import numpy as np

from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences


#---------------------------------------------------------------------------
#                                  CODE
#---------------------------------------------------------------------------
def encode_queries(model, tokenizer, sentences, max_length=512):
    """
    Given a list of sentences, get their embeddings with NetBERT as the
    average of the word embeddings of the last layer (padding tokens excluded).

    Args:
        model: callable taking `(input_ids, attention_mask=...)` and returning
            a sequence whose first element is the last hidden state, of shape
            (batch_size, sequence_length, hidden_size).
        tokenizer: object whose `encode(sentence, add_special_tokens=True)`
            returns a list of token ids.
        sentences: the sentences to encode.
        max_length: maximum number of tokens kept per sentence; longer
            sentences are truncated at the end.

    Returns:
        A numpy array with one embedding row per sentence.

    Raises:
        ValueError: if `sentences` is empty.
    """
    # Tokenizing sentences.
    tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
    if not tokenized:
        raise ValueError("encode_queries() needs at least one sentence to encode.")

    max_len = min(max(len(ids) for ids in tokenized), max_length)

    # Padding/Truncating sentences to max_len tokens: ids are written at the
    # start of each row and the remainder is left at 0 (post-padding,
    # post-truncating).
    padded = np.zeros((len(tokenized), max_len), dtype=np.int64)
    for row, ids in zip(padded, tokenized):
        kept = ids[:max_len]
        if kept:
            row[:len(kept)] = kept

    # Creating attention masks: 1 where padded != 0, 0 elsewhere.
    # NOTE(review): this assumes that id 0 is only ever the padding token;
    # confirm against the tokenizer vocabulary.
    attention_mask = np.where(padded != 0, 1, 0)

    # Converting inputs to torch tensors.
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    # Encoding sentences.
    with torch.no_grad():
        # output is a tuple where:
        #  - output[0] is the last_hidden_state, i.e a tensor of shape (batch_size, sequence_length, hidden_size).
        #  - output[1] is the pooler_output, i.e. a tensor of shape (batch_size, hidden_size) being the last layer hidden-state of the first token of the sequence (classification token).
        #  - output[2] are all hidden_states, i.e. a 13-tuple of torch tensors of shape (batch_size, sequence_length, hidden_size): 12 encoders-outputs + initial embedding outputs.
        output = model(input_ids, attention_mask=attention_mask)

    # For each sentence, average the last-layer embeddings of its first n
    # positions, where n is the number of non-padding tokens.
    last_hidden_states = output[0]
    n_tokens = attention_mask.sum(dim=1).tolist()
    sentence_embeddings = [
        torch.mean(embeddings[:n], dim=0).numpy()
        for embeddings, n in zip(last_hidden_states, n_tokens)
    ]

    return np.array(sentence_embeddings)


def load_model(model_name_or_path):
    """
    Load the pretrained BERT tokenizer and model found at `model_name_or_path`.

    The model is created with `output_hidden_states=True`.

    Returns:
        A tuple (model, tokenizer).
    """
    source = model_name_or_path
    tokenizer = BertTokenizer.from_pretrained(source)
    model = BertModel.from_pretrained(
        source,
        output_hidden_states=True,  # Will output all hidden_states.
    )
    return (model, tokenizer)


def search_corpus(model, tokenizer, query, index, chunks, topk):
    """
    Encode `query`, search its `topk` nearest neighbours in `index` and print
    the matching text chunks followed by the timings.

    Args:
        model: model passed to `encode_queries`.
        tokenizer: tokenizer passed to `encode_queries`.
        query: the query string.
        index: FAISS index to search.
        chunks: sequence of text chunks, indexed by the ids returned by the index.
        topk: number of results requested from the index.
    """
    start_total = time.time()

    # Encode query with NetBERT.
    start_encode = time.time()
    query_embedding = encode_queries(model, tokenizer, [query])[0]
    end_encode = time.time()

    # Search topk results.
    start_search = time.time()
    # FAISS expects a contiguous float32 matrix with one row per query; -1
    # lets numpy infer the embedding size instead of hard-coding it.
    query_matrix = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
    result_dist, result_idx = index.search(query_matrix, k=topk)
    end_search = time.time()

    end_total = time.time()

    # Display topk results.
    for i, (idx, dist) in enumerate(zip(result_idx[0], result_dist[0])):
        if idx < 0:
            # FAISS fills missing results with the label -1; indexing
            # `chunks` with it would silently print the last chunk.
            continue
        print("\nTop {} result - (Distance: {:.3f})".format(i+1, dist))
        print("---------------------------")
        print(chunks[idx])

    # Display search time.
    print("\n** Search time **")
    print("  Encoding query: {:.3f}".format(end_encode-start_encode))
    print("  Faiss search: {:.3f}".format(end_search-start_search))
    print("  (Total search time: {:.3f})".format(end_total - start_total))
    


#---------------------------------------------------------------------------
#                                 WIDGETS
#---------------------------------------------------------------------------
# Free-text field in which the user types the query.
choose_query = widgets.Text(
    description='Query:',
    placeholder='Type your query',
    value='',
    layout=widgets.Layout(width='50%'),
    disabled=False
)


# Slider selecting how many results (top-k) are displayed, from 1 to 50.
choose_topk = widgets.IntSlider(
    description='Topk:',
    min=1,
    max=50,
    step=1,
    value=5,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False
)

# Button widget for launching the search.
search_btn = widgets.Button(description='Search')
search_out = widgets.Output()
model, tokenizer = load_model(model_name_or_path='/raid/antoloui/Master-thesis/_models/netbert-final/')


def search_btn_eventhandler(obj):
    """
    Click handler of the 'Search' button.

    Runs `search_corpus` with the typed query and the selected top-k value,
    printing the results in `search_out`. The globals `index` and `chunks`
    are created by the 'Init' button, so a message is printed instead of
    searching when they do not exist yet or when the query is empty.
    """
    search_out.clear_output()  # Clear output.
    with search_out:
        # `index` and `chunks` only exist once the 'Init' button has been
        # clicked; without this check the click fails with a NameError.
        if 'index' not in globals() or 'chunks' not in globals():
            print("Error: no index loaded. Run the 'Initialization' cell and click on the 'Init' button first.")
            return
        if not choose_query.value.strip():
            print("Error: please type a query before searching.")
            return
        search_corpus(model=model,
                      tokenizer=tokenizer,
                      query=choose_query.value,
                      index=index,
                      chunks=chunks,
                      topk=choose_topk.value)


search_btn.on_click(search_btn_eventhandler)

# Display widgets.
box = widgets.HBox([choose_query, choose_topk, search_btn])
widgets.VBox([box, search_out])

Using TensorFlow backend.


VBox(children=(HBox(children=(Text(value='', description='Query:', layout=Layout(width='50%'), placeholder='Ty…

In [7]:
import os

directory = '/raid/antoloui/Master-thesis/_data/search/rfc/index/'
# sorted() returns a new sorted list, whereas list.sort() sorts in place and
# returns None (which left `files` set to None).
files = sorted(os.listdir(directory))


# Check size of an index: print the size of each file in GB and accumulate
# the overall size in `total_size`.
total_size = 0
for filename in files:
    file_path = os.path.join(directory, filename)
    size = os.path.getsize(file_path)/(1024*1024*1024)
    total_size += size
    print("Size of '{}': {:.4f} GB".format(filename, size))

Size of 'chunks.txt': 0.4575 GB
Size of 'cos.index': 3.7392 GB
Size of 'ip.index': 3.7392 GB
Size of 'l2.index': 3.7392 GB


In [8]:
# Number of entries in `chunks`, the global filled in by the 'Init' button.
len(chunks)

1306931

In [13]:
import pandas as pd
# CSV file describing the RFCs (name, title, authors, date, status, ...).
filepath = '/raid/antoloui/Master-thesis/_data/search/rfc/info.csv'
# NOTE(review): the displayed frame contains 'Unnamed: 0.1' and 'Unnamed: 0'
# columns, presumably old row indexes that were written to the CSV on
# earlier saves; confirm before relying on them.
df = pd.read_csv(filepath) 
# Last expression of the cell: displays the DataFrame.
df

Unnamed: 0.1,Unnamed: 0,Name,Title,Authors,Date,Formats,Obsolotes,Obsoloted_by,Updates,Updated_by,Also_FYI,Status,DOI
0,0,rfc1,Host Software,S. Crocker,April 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0001
1,1,rfc2,Host software,B. Duvall,April 1969,"TXT, PDF, HTML",,,,,,UNKNOWN,10.17487/RFC0002
2,2,rfc3,Documentation conventions,S.D. Crocker,April 1969,"TXT, HTML",,RFC0010,,,,UNKNOWN,10.17487/RFC0003
3,3,rfc4,Network timetable,E.B. Shapiro,March 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0004
4,4,rfc5,Decode Encode Language (DEL),J. Rulifson,June 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8565,8571,rfc8769,Cryptographic Message Syntax (CMS) Content Ty...,J. Schaad,March 2020,"HTML, TXT, PDF, XML",,,,,,INFORMATIONAL,10.17487/RFC8769
8566,8572,rfc8770,Host Router Support for OSPFv2,"K. Patel, P. Pillay-Esnault, M. Bhardwaj, S. B...",April 2020,"HTML, TXT, PDF, XML",,,RFC6987,,,PROPOSED STANDARD,10.17487/RFC8770
8567,8573,rfc8771,The Internationalized Deliberately Unreadable...,"A. Mayrhofer, J. Hague",1 April 2020,"HTML, TXT, PDF, XML",,,,,,EXPERIMENTAL,10.17487/RFC8771
8568,8574,rfc8773,TLS 1.3 Extension for Certificate-Based Authe...,R. Housley,March 2020,"HTML, TXT, PDF, XML",,,,,,EXPERIMENTAL,10.17487/RFC8773
