In [None]:
!pip install sentence-transformers -q

In [None]:
import os
import time
import ast
import gc
import random
import warnings
import multiprocessing as mp
from tqdm import tqdm
from types import SimpleNamespace

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import Sampler
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import GradScaler, autocast
import torch.nn.functional as F
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)

from sentence_transformers import SentenceTransformer

warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.filterwarnings('ignore')

# Read data

In [None]:
train = pd.read_csv('/content/fc_qa_dataset.csv')

# Baseline

In [None]:
def get_common_questions(text):
    common_questions = []
    for ele in text.split('\n'):
        ele = ele.strip()
        if ele.startswith('-'):
            ele = ele.replace('-', '')
            ele = ele.strip()
            common_questions.append(ele)
    return common_questions

In [None]:
train['common_questions'] = train['common_questions'].apply(get_common_questions)

train['generated_questions'] = train['generated_questions'].apply(lambda x: ast.literal_eval(x))

In [None]:
user_queries = train['common_questions'].sum()
labels = (train['common_questions'].apply(len) * train.index.to_series().apply(lambda x: [x])).sum()

In [45]:
def calc_top_k_accuracy(top_k_preds, true_labels):
    binary_label_masks = []
    for top_k_pred, true_label in zip(top_k_preds, true_labels):
        if true_label in top_k_pred:
            binary_label_masks.append(1)
        else:
            binary_label_masks.append(0)
    accuracy = np.mean(binary_label_masks)
    return accuracy

In [None]:
model_paths = [
    'intfloat/multilingual-e5-small',
    'intfloat/multilingual-e5-base',
    'intfloat/multilingual-e5-large',
    'bkai-foundation-models/vietnamese-bi-encoder',
    'VoVanPhuc/sup-SimCSE-VietNamese-phobert-base'
]

for model_path in model_paths:
    embed_model = SentenceTransformer(model_path)

    desc_embed = embed_model.encode(
        train['api_desc'],
        batch_size=16,
        device='cuda',
        convert_to_tensor=True,
        normalize_embeddings=True,
        # show_progress_bar=True
    )

    top_1_preds = []
    top_3_preds = []
    top_5_preds = []
    for query in user_queries:
        question_embed = embed_model.encode(query, device='cuda', convert_to_tensor=True, normalize_embeddings=True)
        scores = question_embed @ desc_embed.T
        top_1_pred = scores.argmax(dim=-1).cpu().numpy().tolist()
        top_3_pred = scores.topk(3).indices.cpu().tolist()
        top_5_pred = scores.topk(5).indices.cpu().tolist()

        top_1_preds.append(top_1_pred)
        top_3_preds.append(top_3_pred)
        top_5_preds.append(top_5_pred)

    top_1_acc = accuracy_score(top_1_preds, labels)
    top_3_acc = calc_top_k_accuracy(top_3_preds, labels)
    top_5_acc = calc_top_k_accuracy(top_5_preds, labels)

    print(f'### {model_path}')
    print(f'Accuracy@1: {top_1_acc}')
    print(f'Accuracy@3: {top_3_acc}')
    print(f'Accuracy@5: {top_5_acc}\n')

### intfloat/multilingual-e5-small
Accuracy@1: 0.7805907172995781
Accuracy@3: 0.8860759493670886
Accuracy@5: 0.9282700421940928

### intfloat/multilingual-e5-base
Accuracy@1: 0.7383966244725738
Accuracy@3: 0.890295358649789
Accuracy@5: 0.9240506329113924

### intfloat/multilingual-e5-large
Accuracy@1: 0.7215189873417721
Accuracy@3: 0.869198312236287
Accuracy@5: 0.9324894514767933

### bkai-foundation-models/vietnamese-bi-encoder
Accuracy@1: 0.46835443037974683
Accuracy@3: 0.7257383966244726
Accuracy@5: 0.7890295358649789





### VoVanPhuc/sup-SimCSE-VietNamese-phobert-base
Accuracy@1: 0.37130801687763715
Accuracy@3: 0.5189873417721519
Accuracy@5: 0.6455696202531646



# Fine-tuning

In [31]:
cfg = {
    'model_name': 'intfloat/multilingual-e5-small',
    'batch_size': 32,
    'max_length': 512,
    'epochs': 2,
    'learning_rate': 2e-4,
    'warmup_steps': 0,
    'weight_decay': 0.1,
    'intermediate_dropout': 0.,
    'num_workers': mp.cpu_count(),
    'seed': 252,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

cfg = SimpleNamespace(**cfg)

In [32]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

In [33]:
class VTNetDataset(Dataset):
	def __init__(self, encodings_1, encodings_2):
		self.encodings_1 = encodings_1
		self.encodings_2 = encodings_2

	def __getitem__(self, idx):
		item = {f'{key}_1': torch.tensor(val[idx]) for key, val in self.encodings_1.items()}
		item.update(
            {f'{key}_2': torch.tensor(val[idx]) for key, val in self.encodings_2.items()}
        )
		return item

	def __len__(self):
		return self.encodings_1.input_ids.shape[0]

In [34]:
def get_dataloader(tokenizer, questions, descriptions, mode, batch_size, max_length, num_workers):

	encodings_1 = tokenizer(
		questions,
		padding='max_length',
		truncation=True,
		max_length=max_length,
		return_tensors='pt'
	)

	encodings_2 = tokenizer(
		descriptions,
		padding='max_length',
		truncation=True,
		max_length=max_length,
		return_tensors='pt'
	)

	dataset = VTNetDataset(encodings_1, encodings_2)

	if mode == 'train':
		data_loader = DataLoader(
			dataset=dataset,
			batch_size=batch_size,
			drop_last=True,
			shuffle=True,
			num_workers=num_workers
		)

	else:
		data_loader = DataLoader(
			dataset=dataset,
			batch_size=batch_size,
			drop_last=False,
			shuffle=False,
			num_workers=num_workers
		)

	return data_loader

In [35]:
api_questions = train['generated_questions'].sum()
api_descriptions = (train['generated_questions'].apply(len) * train['api_desc'].apply(lambda x: [x])).sum()

train_dataloader = get_dataloader(
    tokenizer=tokenizer,
    questions=api_questions,
    descriptions=api_descriptions,
    mode='train',
    batch_size=cfg.batch_size,
    max_length=cfg.max_length,
    num_workers=cfg.num_workers,
)

In [36]:
def set_seed(seed=318):
	random.seed(seed)
	np.random.seed(seed)
	os.environ['PYTHONHASHSEED'] = str(seed)
	# os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"
	torch.manual_seed(seed)
	torch.cuda.manual_seed(seed)
	torch.backends.cudnn.deterministic = True
	torch.backends.cudnn.benchmark = False
	# torch.use_deterministic_algorithms(True)

In [37]:
class MultiNegativesRankingLoss(nn.Module):
    """
    Ref: https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/losses/MultipleNegativesRankingLoss.py
    """
    def __init__(self, scale=50):
        super().__init__()
        self.cross_entropy = nn.CrossEntropyLoss(reduction='mean')
        self.scale = scale

    def forward(self, embed_1, embed_2, labels=None):
        cosine_scores = (
            F.normalize(embed_1) @ F.normalize(embed_2).T
        ) * self.scale

        labels = torch.tensor(
            range(len(cosine_scores)),
            dtype=torch.long,
            device=cosine_scores.device
        )

        loss = self.cross_entropy(cosine_scores, labels)
        return loss


loss_fn = MultiNegativesRankingLoss()

In [38]:
class TextMeanPooling(nn.Module):
    def __init__(self, eps=1e-06):
        super(TextMeanPooling, self).__init__()
        self.eps = eps

    def forward(self, token_embeddings, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        mean_embeds = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=self.eps)
        return mean_embeds

In [39]:
class VTNetEmbedModel(nn.Module):
    def __init__(self, cfg):
        super(VTNetEmbedModel, self).__init__()

        config = AutoConfig.from_pretrained(cfg.model_name)
        config.attention_probs_dropout_prob = cfg.intermediate_dropout
        config.hidden_dropout_prob = cfg.intermediate_dropout

        self.backbone = AutoModel.from_pretrained(cfg.model_name, config=config)
        self.backbone.gradient_checkpointing_enable()

        self.pooler = TextMeanPooling()
        self.loss_fn = MultiNegativesRankingLoss()


    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2):
        embed_1 = self.backbone(input_ids_1, attention_mask_1).last_hidden_state
        embed_2 = self.backbone(input_ids_2, attention_mask_2).last_hidden_state

        x_1 = self.pooler(embed_1, attention_mask_1)
        x_2 = self.pooler(embed_2, attention_mask_2)

        loss = self.loss_fn(x_1, x_2)

        return loss

In [40]:
set_seed(cfg.seed)
train_dataloader = get_dataloader(
    tokenizer=tokenizer,
    questions=api_questions,
    descriptions=api_descriptions,
    mode='train',
    batch_size=cfg.batch_size,
    max_length=cfg.max_length,
    num_workers=cfg.num_workers,
)

start_time = time.time()
scaler = GradScaler()

for epoch in range(cfg.epochs):

    model = VTNetEmbedModel(cfg)
    model.to(cfg.device)

    model.train()

    optimizer = optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=cfg.warmup_steps,
        num_training_steps=len(train_dataloader)*cfg.epochs
    )

    for batch_idx, batch in enumerate(train_dataloader):
        input_ids_1 = batch['input_ids_1'].to(cfg.device)
        attention_mask_1 = batch['attention_mask_1'].to(cfg.device)
        input_ids_2 = batch['input_ids_2'].to(cfg.device)
        attention_mask_2 = batch['attention_mask_2'].to(cfg.device)

        with autocast():
            loss = model(
                input_ids_1=input_ids_1,
                attention_mask_1=attention_mask_1,
                input_ids_2=input_ids_2,
                attention_mask_2=attention_mask_2
            )

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

        if not batch_idx % 10:
            print(
                f'Epoch: {epoch + 1}/{cfg.epochs}'
                f' | Batch: {batch_idx}/{len(train_dataloader)}'
                f' | Loss: {loss.detach().cpu().item():.4f}')



Epoch: 1/2 | Batch: 0/16 | Loss: 1.8027
Epoch: 1/2 | Batch: 10/16 | Loss: 0.5589
Epoch: 2/2 | Batch: 0/16 | Loss: 1.7778
Epoch: 2/2 | Batch: 10/16 | Loss: 0.4032


In [41]:
class ValDataset(Dataset):
	def __init__(self, encodings):
		self.encodings = encodings

	def __getitem__(self, idx):
		item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
		return item

	def __len__(self):
		return self.encodings['input_ids'].shape[0]


encodings_1 = tokenizer(
    user_queries,
    padding='max_length',
    truncation=True,
    max_length=cfg.max_length,
    return_tensors='pt'
)


val_dataset_1 = ValDataset(encodings_1)
val_dataloader_1 = DataLoader(
    dataset=val_dataset_1,
    batch_size=cfg.batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=cfg.num_workers
)


encodings_2 = tokenizer(
    train['api_desc'].values.tolist(),
    padding='max_length',
    truncation=True,
    max_length=cfg.max_length,
    return_tensors='pt'
)


val_dataset_2 = ValDataset(encodings_2)
val_dataloader_2 = DataLoader(
    dataset=val_dataset_2,
    batch_size=cfg.batch_size,
    drop_last=False,
    shuffle=False,
    num_workers=cfg.num_workers
)

In [42]:
pooler = TextMeanPooling()
desc_embed = torch.tensor([], device=cfg.device)
with torch.no_grad():
    model.eval()
    for batch in val_dataloader_2:
        input_ids = batch['input_ids'].to(cfg.device)
        attention_mask = batch['attention_mask'].to(cfg.device)
        embed = model.backbone(input_ids, attention_mask).last_hidden_state
        mean_embed = pooler(embed, attention_mask)
        mean_embed = F.normalize(mean_embed, dim=-1)
        desc_embed = torch.cat((desc_embed, mean_embed), dim=0)

In [43]:
with torch.no_grad():
    model.eval()
    top_1_preds = []
    top_3_preds = []
    top_5_preds = []
    for batch in val_dataloader_1:
        input_ids = batch['input_ids'].to(cfg.device)
        attention_mask = batch['attention_mask'].to(cfg.device)
        question_embed = model.backbone(input_ids, attention_mask).last_hidden_state
        question_embed = pooler(question_embed, attention_mask)
        question_embed = F.normalize(question_embed, dim=-1)
        scores = question_embed @ desc_embed.T

        top_1_pred = scores.argmax(dim=-1).cpu().numpy().tolist()
        top_3_pred = scores.topk(3).indices.cpu().tolist()
        top_5_pred = scores.topk(5).indices.cpu().tolist()

        top_1_preds.extend(top_1_pred)
        top_3_preds.extend(top_3_pred)
        top_5_preds.extend(top_5_pred)

top_1_acc = accuracy_score(top_1_preds, labels)
top_3_acc = calc_top_k_accuracy(top_3_preds, labels)
top_5_acc = calc_top_k_accuracy(top_5_preds, labels)

print(f'Accuracy@1: {top_1_acc}')
print(f'Accuracy@3: {top_3_acc}')
print(f'Accuracy@5: {top_5_acc}')

Accuracy@1: 0.8734177215189873
Accuracy@3: 0.9662447257383966
Accuracy@5: 0.9873417721518988
