In [1]:
!pip install -U transformers
!pip install faiss-cpu
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from transformers import BertTokenizer
from transformers import BertPreTrainedModel, BertModel
from transformers import FNetTokenizer, FNetModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

In [3]:
df = pd.read_csv('/content/drive/MyDrive/df_with_label.csv').sample(3000)

In [4]:
df.head()

Unnamed: 0,sentence_a,sentence_b,label
54680,In order to initiate development of a structur...,Some Acacia species are shade intolerant resul...,0
38812,We compared the set of genes correctly detecte...,PromoterInspector results were mapped to pseud...,1
10345,We have previously shown that two co-stimulato...,Transduced cells were stained for membrane exp...,1
88475,"Let us briefly review the Holm [13] method, wh...",This finding is inconsistent with the recent s...,0
5609,"We searched these genomes for a Pfam motif, PF...",An example being the enzyme glyceraldehyde-3-p...,1


In [5]:
train, val = train_test_split(df, test_size=0.25)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
fnet_tokenizer = FNetTokenizer.from_pretrained('google/fnet-base')

In [6]:
class NegativeSamplingDataset(Dataset):
    """
    ToxicCommentsDataset is created to create a custom dataset.
    later we wrap a lightning data module around it.
    """

    def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_len: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        sent1 = data_row['sentence_a']
        sent2 = data_row['sentence_b']
        label = data_row['label'].flatten()

        encoding1 = self.tokenizer.encode_plus(

            sent1,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',

        )
        encoding2 = self.tokenizer.encode_plus(

            sent2,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',

        )

        return dict(
            input_ids1=encoding1["input_ids"].flatten(),
            attention_mask1=encoding1["attention_mask"].flatten(),
            input_ids2=encoding2["input_ids"].flatten(),
            attention_mask2=encoding2["attention_mask"].flatten(),
            labels=torch.tensor(label, dtype=torch.long))

In [7]:
train_dataset = NegativeSamplingDataset(train, bert_tokenizer, 64)
val_dataset = NegativeSamplingDataset(val, bert_tokenizer, 64)

bert_train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                              num_workers=1)

bert_eval_dataloader = DataLoader(val_dataset, batch_size=16, num_workers=1)

In [8]:
train_dataset = NegativeSamplingDataset(train, fnet_tokenizer, 64)
val_dataset = NegativeSamplingDataset(val, fnet_tokenizer, 64)

fnet_train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True,
                              num_workers=1)

fnet_eval_dataloader = DataLoader(val_dataset, batch_size=16, num_workers=1)

In [9]:
class SBertModule(BertPreTrainedModel):
    def __init__(self, config):
        super(SBertModule, self).__init__(config)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, segment_ids=None, input_mask=None):
        outputs = self.bert(input_ids, segment_ids, input_mask)
        sequence_output, pooled_output = outputs[:2]
        
        input_mask_expanded = input_mask.unsqueeze(-1).expand(sequence_output.size()).float()
        sum_embeddings = torch.sum(sequence_output * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, 1e-9)
        mean_pooling_out = sum_embeddings / sum_mask  # [batch_size, hidden_size]
        return mean_pooling_out
            

class SBert(nn.Module):
    def __init__(self, model_path=None, config=None):
        super(SBert, self).__init__()
        self.num_labels = 2
        self.bert_module = SBertModule.from_pretrained(model_path)
        self.linear1 = nn.Linear(768 * 3, 768)
        self.linear2 = nn.Linear(768, self.num_labels)

    def forward(self, x_input_ids=None, x_segment_ids=None, x_input_mask=None,
                      y_input_ids=None, y_segment_ids=None, y_input_mask=None, labels=None, train_sbert=True):
      if train_sbert:
        u = self.bert_module(x_input_ids, x_segment_ids, x_input_mask)
        v = self.bert_module(y_input_ids, y_segment_ids, y_input_mask)
        uv = torch.sub(u, v)
        uv_abs = torch.abs(uv)
        output = torch.cat([u, v, uv_abs], dim=-1)
        
        output = F.relu(self.linear1(output))
        logits = self.linear2(output)
        return logits

      else:
        return self.bert_module(x_input_ids, x_segment_ids, x_input_mask)

In [10]:
class FnetModule(BertPreTrainedModel):
    def __init__(self, config):
        super(FnetModule, self).__init__(config)
        self.bert = FNetModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, segment_ids=None, input_mask=None):
        outputs = self.bert(input_ids, segment_ids, input_mask)
        sequence_output, pooled_output = outputs[:2]
        
        input_mask_expanded = input_mask.unsqueeze(-1).expand(sequence_output.size()).float()
        sum_embeddings = torch.sum(sequence_output * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, 1e-9)
        mean_pooling_out = sum_embeddings / sum_mask  # [batch_size, hidden_size]
        return mean_pooling_out
            

class SFnet(nn.Module):
    def __init__(self, model_path=None, config=None):
        super(SFnet, self).__init__()
        self.num_labels = 2
        self.fnet_module = FnetModule.from_pretrained(model_path)
        self.linear1 = nn.Linear(768 * 3, 768)
        self.linear2 = nn.Linear(768, self.num_labels)

    def forward(self, x_input_ids=None, x_segment_ids=None, x_input_mask=None,
                      y_input_ids=None, y_segment_ids=None, y_input_mask=None, labels=None, train_sbert=True):
      if train_sbert:
        u = self.fnet_module(x_input_ids, x_segment_ids, x_input_mask)
        v = self.fnet_module(y_input_ids, y_segment_ids, y_input_mask)
        uv = torch.sub(u, v)
        uv_abs = torch.abs(uv)
        output = torch.cat([u, v, uv_abs], dim=-1)
        
        output = F.relu(self.linear1(output))
        logits = self.linear2(output)
        return logits

      else:
        return self.fnet_module(x_input_ids, x_segment_ids, x_input_mask)

In [11]:
bert_model = SBert(model_path='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing SBertModule: ['cls.predictions.bias', 'cls.predictions.transform.

In [12]:
fnet_model = SFnet(model_path='google/fnet-base')

You are using a model of type fnet to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
You are using a model of type bert to instantiate a model of type fnet. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing FNetModel: ['bert.encoder.layer.3.output.dense.bias', 'bert.encoder.layer.3.attention.self.key.weight', 'bert.encoder.layer.10.output.dense.bias', 'bert.encoder.layer.11.intermediate.dense.bias', 'bert.encoder.layer.5.output.LayerNorm.bias', 'bert.encoder.layer.5.attention.self.query.bias', 'bert.encoder.layer.4.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.value.bias', 'bert.encoder.layer.4.output.LayerNorm.weight', 'bert.encoder.layer.5.output.dense.weight', 'bert.encoder.layer.1.attention.self.value.bias', 'bert.encode

In [13]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
learning_rate = 1e-05
bert_optimizer = torch.optim.Adam(params=bert_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
bert_scheduler = torch.optim.lr_scheduler.StepLR(bert_optimizer, step_size=2, gamma=0.1)
n_epochs=1

In [14]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
learning_rate = 1e-05
fnet_optimizer = torch.optim.Adam(params=fnet_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
fnet_scheduler = torch.optim.lr_scheduler.StepLR(fnet_optimizer, step_size=2, gamma=0.1)
n_epochs=1

In [15]:
def train_one_epoch(model, train_dataloader, criterion, optimizer, device="cuda:0"):
    model.to(device).train()
    with tqdm(total=len(train_dataloader)) as pbar:
        for batch in train_dataloader:
            # добавляем батч для вычисления на GPU
            # Распаковываем данные из dataloader
            input_ids1, attention_mask1, input_ids2, attention_mask2, labels = batch
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            # labels = torch.tensor(batch['labels'], dtype=torch.long).flatten().to(device)
            labels = batch['labels'].view(-1).to(device)

            optimizer.zero_grad()
            output = model.forward(x_input_ids=input_ids1, x_segment_ids=None, x_input_mask=attention_mask1,
                                   y_input_ids=input_ids2, y_segment_ids=None, y_input_mask=attention_mask2, labels=labels)
            
            _, predicted = torch.max(output, 1)
            
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(output.detach(), 1)
            accuracy = accuracy_score(predicted.cpu().numpy(), labels.cpu().numpy())
            pbar.set_description('Loss: {:.4f}; Accuracy: {:.4f}'.format(loss.item(), accuracy))    
            pbar.update(1)

def predict(model, val_dataloader, criterion, device="cuda:0"):
    model.to(device).eval()
    losses = []
    predicted_classes = []
    true_classes = []
    with tqdm(total=len(val_dataloader)) as pbar:
        with torch.no_grad():
            for batch in val_dataloader:
                
                input_ids1, attention_mask1, input_ids2, attention_mask2, labels = batch
                input_ids1 = batch['input_ids1'].to(device)
                attention_mask1 = batch['attention_mask1'].to(device)
                input_ids2 = batch['input_ids2'].to(device)
                attention_mask2 = batch['attention_mask2'].to(device)
                labels = batch['labels'].view(-1).to(device)
                
                
                output = model.forward(x_input_ids=input_ids1, x_segment_ids=None, x_input_mask=attention_mask1,
                                   y_input_ids=input_ids2, y_segment_ids=None, y_input_mask=attention_mask2, labels=labels)
                _, predicted = torch.max(output, 1)
            
                loss = criterion(output, labels)
                losses.append(loss.item())
                _, predicted = torch.max(output.detach(), 1)
                predicted_classes.append(predicted)
                true_classes.append(labels)
                
                
                accuracy_mae = accuracy_score(predicted.cpu().numpy(), labels.cpu().numpy())
                pbar.set_description('Loss: {:.4f}; Accuracy_MAE: {:.4f}'.format(loss.item(), accuracy_mae))    
                pbar.update(1)
                
    predicted_classes = torch.cat(predicted_classes).detach().to('cpu').numpy()
    true_classes = torch.cat(true_classes).detach().to('cpu').numpy()
    return losses, predicted_classes, true_classes

def train(model, train_dataloader, val_dataloader, criterion, optimizer, device="cuda:0", n_epochs=10, scheduler=None):
    model.to(device)
    # lrs = []
    for epoch in range(n_epochs):
        print('Learning rate: ', optimizer.param_groups[0]['lr'])
        print('Epoc:', epoch)
        train_one_epoch(model, train_dataloader, criterion, optimizer)
        print('Validation')
        losses, predicted_classes, true_classes = predict(model, val_dataloader, criterion)
        print('Accuracy_MAE: ', accuracy_score(true_classes, predicted_classes))
        # lrs.append(optimizer.param_groups[0]['lr'])
        scheduler.step()

In [16]:
train(fnet_model, fnet_train_dataloader, fnet_eval_dataloader, criterion, fnet_optimizer, device, n_epochs, fnet_scheduler)

Learning rate:  1e-05
Epoc: 0


  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
Loss: 0.2900; Accuracy: 1.0000: 100%|██████████| 141/141 [00:39<00:00,  3.57it/s]


Validation


Loss: 0.1976; Accuracy_MAE: 0.9286: 100%|██████████| 47/47 [00:04<00:00,  9.97it/s]

Accuracy_MAE:  0.9346666666666666





In [17]:
train(bert_model, bert_train_dataloader, bert_eval_dataloader, criterion, bert_optimizer, device, n_epochs, bert_scheduler)

Learning rate:  1e-05
Epoc: 0


Loss: 0.0152; Accuracy: 1.0000: 100%|██████████| 141/141 [00:56<00:00,  2.49it/s]


Validation


Loss: 0.0085; Accuracy_MAE: 1.0000: 100%|██████████| 47/47 [00:06<00:00,  7.08it/s]

Accuracy_MAE:  0.988





In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import time
import numpy as np
import faiss

In [19]:
# !pip install transformers
# !pip install -U sentence-transformers
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
big_sentence = pd.read_csv('/content/drive/MyDrive/big_sentence.csv')
sentences = big_sentence['sentence'].sample(1000).unique()

In [21]:
def get_tokens(tokenizer, sentences, max_length=64):
  input_ids = []
  attention_mask = []
  token_type_ids = []

  for sent in sentences:
      encoded_dict = tokenizer.encode_plus(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = max_length,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )
      
      input_ids.append(encoded_dict['input_ids'])
      attention_mask.append(encoded_dict['attention_mask'])
      token_type_ids.append(encoded_dict['token_type_ids'])

  input_ids = torch.cat(input_ids, dim=0)
  attention_mask = torch.cat(attention_mask, dim=0)
  token_type_ids = torch.cat(token_type_ids, dim=0)
  return input_ids, attention_mask, token_type_ids


def search_bert(index, model, tokenizer, query, max_length=128, sentences=None):
   t=time.time()
   encoded_dict = tokenizer.encode_plus(
                        query,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_length,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
   model.eval()
   with torch.no_grad():
     output = model(x_input_ids=encoded_dict['input_ids'], x_input_mask=encoded_dict['attention_mask'], 
                    x_segment_ids=encoded_dict['token_type_ids'], train_sbert=False)


   query_vector = output.cpu().numpy()
   k = 5
   top_k = index.search(query_vector, k)
   print('totaltime: {}'.format(time.time()-t))
   return [sentences[_id] for _id in top_k[1].tolist()[0]], query_vector

def final_result(index=None, model=None, tokenizer=None, max_length=128, sentences=None):
  query = str(input())
  find_sentence, query_vector = search_bert(index=index, model=model, tokenizer=tokenizer, query=query, max_length=max_length, sentences=sentences)
  input_ids, attention_mask, token_type_ids = get_tokens(tokenizer=tokenizer, sentences=find_sentence, max_length=64)
  model.eval()
  with torch.no_grad():
    sent_b = model(x_input_ids=input_ids, x_input_mask=attention_mask, x_segment_ids=token_type_ids, train_sbert=False)
  cos_simil = cosine_similarity(query_vector, sent_b)[0]
  print('results :')
  for i in range(len(cos_simil)):
    print('\t','Cosine Similarity: ' + str(cos_simil[i]) + '  ' +str(find_sentence[i]))

In [22]:
input_ids, attention_mask, token_type_ids = get_tokens(tokenizer=bert_tokenizer, sentences=sentences, max_length=64)
input_ids, attention_mask, token_type_ids = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)
bert_model.eval()
with torch.no_grad():
  output = bert_model(x_input_ids=input_ids, x_input_mask=attention_mask, x_segment_ids=token_type_ids, train_sbert=False)


encoded_data = output.cpu().numpy()
bert_index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
bert_index.add_with_ids(encoded_data, np.array(range(0, len(sentences))))
faiss.write_index(bert_index, 'search')


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
final_result(index=bert_index, model=bert_model.cpu(), tokenizer=bert_tokenizer, max_length=128, sentences=sentences)

cancer




totaltime: 0.43050575256347656
results :
	 Cosine Similarity: 0.7884204  Recent clinical experience has provided evidence that conservative management and early prophylactic antibiotic administration in sterile necrotising pancreatitis is the treatment of choice [17,21,25,31].
	 Cosine Similarity: 0.775641  Three studies examined the abilities of both devices to record objective data such as burn variables[35], daily food intake [40]and intravenous infusions of hemophilic clotting factor concentrates[32].
	 Cosine Similarity: 0.78065777  Correction for gas compressibility as well as resistive and accelerative losses in the flexiVent, connecting tubing and the tracheal cannula were performed as described previously [38]using dynamic calibration data obtained by applying volume perturbations through the tubing and tracheal cannula first when it was completely closed and then when it was open to the atmosphere.
	 Cosine Similarity: 0.77311754  Very recently, the application of gamma detec

In [24]:
input_ids, attention_mask, token_type_ids = get_tokens(tokenizer=fnet_tokenizer, sentences=sentences, max_length=64)
input_ids, attention_mask, token_type_ids = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)
fnet_model.eval()
with torch.no_grad():
  output = fnet_model(x_input_ids=input_ids, x_input_mask=attention_mask, x_segment_ids=token_type_ids, train_sbert=False)


encoded_data = output.cpu().numpy()
fnet_index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
fnet_index.add_with_ids(encoded_data, np.array(range(0, len(sentences))))
faiss.write_index(fnet_index, 'search')


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [25]:
final_result(index=fnet_index, model=fnet_model.cpu(), tokenizer=fnet_tokenizer, max_length=128, sentences=sentences)

cancer




totaltime: 0.3511521816253662
results :
	 Cosine Similarity: 0.6412494  The scientific literature has also investigated secondary prevention as a strategy to reduce disability from LBP because effective primary prevention strategies are currently lacking [11].
	 Cosine Similarity: 0.63241494  The smallest number of participants to detect this difference between two proportions estimated from independent samples is 65 participants per group, ie, 130 participants in total [23].
	 Cosine Similarity: 0.6219418  One of its particular mandated projects is a coordinated marketing and advertising strategy to attract medical graduates, including overseas-trained doctors, for employment in Queensland [16].
	 Cosine Similarity: 0.63591504  In order to obtain functional profiles of the differentially regulated genes, the three gene clusters identified above were subjected to Gene Ontology analysis using the GOAT programme [33].
	 Cosine Similarity: 0.63904876  Each volunteer was allowed to visit t