# Installing dependencies

In [None]:
%pip install torch==1.12.1+cu116 sentence-transformers faiss-gpu protobuf==3.20.0

Unpack the previously fine-tuned model if it is available. In this case the model is stored compressed in `search_model.zip`.

In [None]:
!unzip search_model.zip

In [None]:
# Print the NVIDIA GPU status report for this runtime.
!nvidia-smi

Main dependencies:

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
import faiss

Helper modules:

In [None]:
import pandas as pd
import time
import os
import gc
import random
from tqdm import tqdm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Data loading and preprocessing

In [None]:
# Load the raw course records from courses_dataset.json into a DataFrame.
data = pd.read_json('courses_dataset.json')
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

Data preprocessing:

In [None]:
# Keep only the course title and long description, drop rows with a missing
# value in either column, then drop rows whose description is a repeat.
# Chaining returns a fresh DataFrame at every step, so nothing is mutated
# in place on a column slice of `data` (which is what triggers pandas'
# SettingWithCopyWarning here and on later column assignments to `df`).
df = (
    data[['CS_NAME', 'CS_DESC_LONG']]
    .dropna()
    .drop_duplicates(subset=['CS_DESC_LONG'])
)

# The raw frame is no longer needed; release its memory.
del data
gc.collect();

Plotting data length distribution:

In [None]:
# Number of whitespace-separated words in each course title.
df['doc_len'] = df['CS_NAME'].apply(lambda title: len(title.split()))
# Reference length drawn on the plot: mean + one standard deviation, rounded.
max_seq_len = np.round(df['doc_len'].mean() + df['doc_len'].std()).astype(int)

# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is the replacement seaborn documents for distplot(hist=True, kde=True).
sns.histplot(df['doc_len'], kde=True, stat='density', kde_kws={'cut': 3}, color='b', label='doc len')
plt.axvline(x=max_seq_len, color='k', linestyle='--', label='max len')
plt.title('plot length'); plt.legend()
plt.show()

# Two implementations of the search engine

First, we define methods used in both implementations:

In [None]:
def fetch_course_info(dataframe_idx):
    """Return the metadata reported for one search hit.

    `dataframe_idx` is used as a positional row number into the module-level
    `df` (via `.iloc`). The result is a dict holding that row's `CS_NAME`.
    """
    row = df.iloc[dataframe_idx]
    return {'CS_NAME': row['CS_NAME']}

def search(query, top_k, index, model):
    """Return course metadata for the entries of `index` closest to `query`.

    Args:
        query: Query text; it is encoded with `model.encode`.
        top_k: Number of nearest neighbours requested from the index.
        index: Object whose `search(vectors, k)` returns a `(scores, ids)`
            pair, with the ids of the first query in `ids[0]`.
        model: Encoder exposing `encode(list_of_str)`.

    Returns:
        A list of dicts produced by `fetch_course_info`, ordered from best to
        worst match, with duplicate ids removed. Also prints the elapsed time
        for encoding plus index lookup.
    """
    started = time.time()
    query_vector = model.encode([query])
    _, hit_ids = index.search(query_vector, top_k)
    print('>>>> Results in Total Time: {}'.format(time.time()-started))

    # Keep the ids in the order returned by the index (best match first).
    # np.unique would sort them ascending and throw the ranking away, so
    # de-duplicate with an insertion-ordered dict instead.
    # Negative ids are skipped: FAISS pads with -1 when it finds fewer than
    # top_k neighbours, and df.iloc[-1] would silently return the last row.
    ranked_ids = dict.fromkeys(int(hit_id) for hit_id in hit_ids[0] if hit_id >= 0)
    return [fetch_course_info(idx) for idx in ranked_ids]

def query_test(query, index, model):
    """Look up the top 5 matches for `query` via `search` and print them.

    A blank separator is printed first, then one tab-prefixed result per line.
    """
    hits = search(query, top_k=5, index=index, model=model)

    print("\n")
    for hit in hits:
        print('\t', hit)

# Long course descriptions as a plain Python list, in the same row order as `df`.
paragraphs = df.CS_DESC_LONG.tolist()


## 1- Search engine with pretrained RoBERTa

In [None]:
!pip install protobuf==3.20

In [None]:
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing on .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# force=True keeps this cell re-runnable: without it, a second call in the
# same kernel raises RuntimeError because the start method is already set.
torch.multiprocessing.set_start_method('spawn', force=True)

In [None]:
# Load the pretrained sentence-embedding model and move it to `device`.
model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
model.to(device)

# Embed every course description; the result is returned as a numpy array.
corpus_embeddings = model.encode(
    paragraphs,
    convert_to_numpy=True,
    show_progress_bar=True,
)

In [None]:
# FAISS works on C-contiguous float32 matrices.
encoded_data = np.ascontiguousarray(corpus_embeddings, dtype='float32')
# Inner-product index wrapped in an ID map. The dimensionality is read from
# the embeddings instead of being hard-coded to 768, so a different encoder
# does not silently break the index.
index = faiss.IndexIDMap(faiss.IndexFlatIP(encoded_data.shape[1]))
# One id per row of `df`, by position. The dtype is fixed to int64 because
# add_with_ids needs 64-bit ids and the default integer type of
# np.array(range(...)) depends on the platform.
ids = np.arange(len(df), dtype='int64')

In [None]:
# Empty the index first so that re-running this cell does not add every
# vector (and id) a second time.
index.reset()
index.add_with_ids(encoded_data, ids)
# Persist the populated index to disk.
faiss.write_index(index, 'course_description.index')

Testing the search function:

In [None]:
# Run two sample queries against the index built with the pretrained model.
for sample_query in ('Python Entwicklung', 'DevOps Azure CI/CD'):
    query_test(sample_query, index, model)

## 2- Search engine with fine-tuned RoBERTa
This time we fine-tune the same model on queries generated synthetically from the original dataset, using a T5 model specialized in generating synthetic queries.

NOTE: If you are loading a previously stored model, skip steps 1, 2, and 3.

### 1- Loading pretrained T5 model for generating synthetic queries

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer and weights both come from the same query-generation checkpoint.
query_gen_checkpoint = 'BeIR/query-gen-msmarco-t5-large-v1'
tokenizer = T5Tokenizer.from_pretrained(query_gen_checkpoint)
# NOTE: this rebinds the global `model`, which until now referred to the
# SentenceTransformer used for the first search engine.
model = T5ForConditionalGeneration.from_pretrained(query_gen_checkpoint)
# Switch to inference mode.
model.eval()

In [None]:
# Select the device: use the GPU when torch can see one, otherwise fall back
# to the CPU so this cell does not fail on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# Settings read by the synthetic-query generation step.

# How many paragraphs are tokenized and passed to generate() together.
batch_size = 2
# How many queries are sampled for each paragraph.
num_queries = 3
# Paragraphs are truncated to this many tokens.
max_length_paragraph = 512
# Upper bound on the length of a generated query.
max_length_query = 48

### 2- Generating synthetic queries

In [None]:
def generate_senthetic_queries():
    """Generate synthetic queries for every paragraph and store them as TSV.

    Reads the module-level `paragraphs`, `tokenizer`, `model`, `device` and
    the generation settings (`batch_size`, `num_queries`,
    `max_length_paragraph`, `max_length_query`). For each paragraph,
    `num_queries` queries are sampled (nucleus sampling, top_p=0.95) and
    written to 'generated_queries_all.tsv', one `query<TAB>paragraph` pair
    per line.
    """
    out_path = 'generated_queries_all.tsv'
    # Write to a temporary file and rename it at the very end, so that an
    # interrupted run does not leave a partial file behind under the final name.
    tmp_path = out_path + '.tmp'
    # Tabs and line breaks inside a field would break the one-pair-per-line
    # TSV layout, so each of them is turned into a single space.
    tsv_unsafe = str.maketrans('\t\r\n', '   ')

    # The encoding is set explicitly so that non-ASCII text is written the
    # same way regardless of the platform's default encoding.
    with open(tmp_path, 'w', encoding='utf-8') as fOut:
        for start_idx in tqdm(range(0, len(paragraphs), batch_size)):
            sub_paragraphs = paragraphs[start_idx:start_idx+batch_size]
            # Calling the tokenizer directly replaces the deprecated
            # prepare_seq2seq_batch; padding=True keeps its pad-to-longest default.
            inputs = tokenizer(
                sub_paragraphs,
                max_length=max_length_paragraph,
                truncation=True,
                padding=True,
                return_tensors='pt').to(device)
            outputs = model.generate(
                **inputs,
                max_length=max_length_query,
                do_sample=True,
                top_p=0.95,
                num_return_sequences=num_queries)

            # Outputs come in runs of `num_queries` per input paragraph, so
            # integer division maps an output position back to its paragraph.
            for idx, out in enumerate(outputs):
                query = tokenizer.decode(out, skip_special_tokens=True)
                para = sub_paragraphs[idx // num_queries]
                fOut.write("{}\t{}\n".format(query.translate(tsv_unsafe).strip(), para.translate(tsv_unsafe).strip()))

    os.replace(tmp_path, out_path)
# Only generate the synthetic queries when generated_queries_all.tsv is not
# already present in the working directory.
if not os.path.exists('generated_queries_all.tsv'):
    generate_senthetic_queries()

### 3- Fine-tuning the RoBERTa model with the generated queries from the T5 model

In [None]:
def fine_tune(queries_path='generated_queries_all.tsv', max_examples=2000,
              batch_size=4, num_epochs=3, output_path='search/search-model',
              seed=None):
    """Fine-tune the sentence encoder on (query, paragraph) pairs and save it.

    Args:
        queries_path: TSV file with one `query<TAB>paragraph` pair per line.
        max_examples: Number of pairs kept after shuffling.
        batch_size: Batch size of the training data loader.
        num_epochs: Number of training epochs.
        output_path: Directory the fine-tuned model is saved to.
        seed: Optional seed for the shuffle; None keeps the shuffle unseeded.

    Raises:
        ValueError: If `queries_path` contains no usable pair.
    """
    train_examples = []
    skipped = 0
    with open(queries_path, encoding='utf-8') as fIn:
        for line in fIn:
            # Lines without a tab separator cannot be split into a pair.
            # They are skipped and counted, rather than swallowed by a bare
            # `except`, which would also hide unrelated errors.
            query, sep, paragraph = line.strip().partition('\t')
            if not sep:
                skipped += 1
                continue
            train_examples.append(InputExample(texts=[query, paragraph]))

    if skipped:
        print('Skipped {} line(s) without a query/paragraph pair in {}'.format(skipped, queries_path))
    if not train_examples:
        raise ValueError('No training pairs found in {}'.format(queries_path))

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(train_examples)

    train_examples = train_examples[:max_examples]

    # For the MultipleNegativesRankingLoss, it is important
    # that the batch does not contain duplicate entries, i.e.
    # no two equal queries and no two equal paragraphs.
    # To ensure this, we use a special data loader
    train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=batch_size)

    # Assemble a SentenceTransformer from the pretrained transformer plus a pooling layer
    word_emb = models.Transformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
    pooling = models.Pooling(word_emb.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_emb, pooling])

    # MultipleNegativesRankingLoss requires input pairs (query, relevant_passage)
    # and trains the model so that it is suitable for semantic search
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Tune the model; the first 5% of all training steps are warm-up steps
    warmup_steps = int(len(train_dataloader) * num_epochs * 0.05)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, use_amp=False, show_progress_bar=True)

    # Make sure the parent directory of the output path exists before saving
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    model.save(output_path)


# Fine-tune only when no saved model is available, i.e. when the directory
# search/search-model is missing or empty. Checking the model directory
# itself (not just `search`) avoids skipping the fine-tuning when `search`
# exists but holds something other than the saved model.
if not os.path.isdir('search/search-model') or not os.listdir('search/search-model'):
    fine_tune()

Zip the new model:

In [None]:
# Recursively pack the `search` folder into search_model.zip.
!zip -r search_model.zip "search"

### 4- Use FAISS to create an index of the encoded data (for fast nearest neighbor search)

In [None]:
# Load the fine-tuned model from the search/search-model directory;
# this rebinds the global `model`.
model = SentenceTransformer('search/search-model')

In [None]:
# NOTE(review): this index is built from the course titles (CS_NAME), whereas
# the index for the pretrained model was built from `paragraphs`
# (CS_DESC_LONG) and the fine-tuning pairs use descriptions - confirm that
# indexing titles is intended here.
encoded_data = model.encode(df.CS_NAME.tolist())
# FAISS works on C-contiguous float32 matrices.
encoded_data = np.ascontiguousarray(encoded_data, dtype='float32')
# The dimensionality is read from the embeddings instead of being hard-coded to 768.
index = faiss.IndexIDMap(faiss.IndexFlatIP(encoded_data.shape[1]))
# Ids are row positions in `df`; the dtype is fixed to int64 because
# add_with_ids needs 64-bit ids on every platform.
index.add_with_ids(encoded_data, np.arange(len(df), dtype='int64'))
# This overwrites the index file written for the pretrained model.
faiss.write_index(index, 'course_description.index')

### 5- Testing the new model

In [None]:
# Run the same two sample queries against the index built with the fine-tuned model.
for sample_query in ('Python Entwicklung', 'DevOps Azure CI/CD'):
    query_test(sample_query, index, model)