In [1]:
!pip install -q sentence-transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import datasets

# Load the SNLI and MNLI training splits; MNLI's extra 'idx' column is removed
# before the two are concatenated.
snli = datasets.load_dataset('snli', split='train')
mnli = datasets.load_dataset('glue', 'mnli', split='train').remove_columns(['idx'])

# Keep only rows whose label is 0.
# NOTE(review): presumably label 0 is "entailment" -- confirm against the dataset cards.
dataset = datasets.concatenate_datasets([snli, mnli]).filter(lambda x: x['label'] == 0)

del snli, mnli
dataset

In [3]:
from transformers import AutoTokenizer

bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Every premise is padded/truncated to exactly 128 tokens.
premise_tok_kwargs = dict(padding='max_length', max_length=128, truncation=True)
dataset = dataset.map(
    lambda x: bert_tokenizer(x['premise'], **premise_tok_kwargs),
    batched=True,
)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/314315 [00:00<?, ? examples/s]

In [4]:
# Rename the tokenizer output columns so they are identified as the "anchor" side.
dataset = dataset.rename_columns({
    'attention_mask': 'anchor_mask',
    'input_ids': 'anchor_ids',
})

In [5]:
# Tokenize the hypotheses with the same fixed length of 128 tokens, then rename
# the resulting columns so they are identified as the "positive" side.
hypothesis_tok_kwargs = dict(padding='max_length', max_length=128, truncation=True)
dataset = dataset.map(
    lambda x: bert_tokenizer(x['hypothesis'], **hypothesis_tok_kwargs),
    batched=True,
)
dataset = dataset.rename_columns({
    'attention_mask': 'positive_mask',
    'input_ids': 'positive_ids',
})

Map:   0%|          | 0/314315 [00:00<?, ? examples/s]

In [6]:
# Drop the raw text, the label and the token type ids; only the renamed
# id/mask columns remain afterwards.
unused_columns = ['premise', 'hypothesis', 'label', 'token_type_ids']
dataset = dataset.remove_columns(unused_columns)
dataset

Dataset({
    features: ['anchor_ids', 'anchor_mask', 'positive_ids', 'positive_mask'],
    num_rows: 314315
})

In [7]:
# Switch the dataset's output format to 'torch', keeping all columns in the output.
dataset.set_format(
    type='torch',
    output_all_columns=True,
)

In [8]:
import torch
from torch.utils.data import DataLoader

# Shuffled mini-batches over the tokenized pairs; the last expression shows
# how many batches one pass over the data yields.
batch_size = 2
loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)
len(loader)

157158

In [9]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f"device: {device}")

device: cpu


In [10]:
from transformers import AutoModel

# Load the pretrained 'bert-base-uncased' weights, then move the module to `device`.
model = AutoModel.from_pretrained('bert-base-uncased')
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# Objects for the contrastive objective: a constant score multiplier, a
# cross-entropy loss, and a cosine-similarity module with default arguments.
scale = 20.0
loss_func = torch.nn.CrossEntropyLoss().to(device)
cos_sim = torch.nn.CosineSimilarity().to(device)

In [12]:
def mean_pooling(input_ids, attention_mask):
  """Average `input_ids` over dimension 1, counting only positions where the mask is set.

  Despite its name, `input_ids` is the tensor that gets averaged; the training
  loop in this notebook passes the first element of the model output here.

  Args:
    input_ids: tensor to pool; dimension 1 is summed away.
    attention_mask: tensor with one dimension fewer than `input_ids`; it is
      given a trailing axis, expanded to the shape of `input_ids` and cast to
      float before being used as weights.

  Returns:
    The weighted sum over dimension 1 divided by the sum of the weights. The
    divisor is clamped to at least 1e-9 so an all-zero mask does not divide by zero.
  """
  weights = attention_mask.unsqueeze(-1).expand_as(input_ids).float()
  weighted_total = torch.sum(input_ids * weights, dim=1)
  weight_total = torch.clamp(torch.sum(weights, dim=1), min=1e-9)
  return weighted_total / weight_total

In [13]:
from transformers import get_linear_schedule_with_warmup

# One optimizer step is taken per batch, so the run length is batches * epochs.
epochs = 1
total_steps = len(loader) * epochs
# The first 10% of the steps are used for learning-rate warmup.
warmup_steps = int(total_steps * 0.1)

opt = torch.optim.Adam(model.parameters(), lr=2e-5)
# `num_training_steps` is the TOTAL number of scheduler steps, warmup included.
# Passing `total_steps - warmup_steps` made the learning rate decay to zero
# before the end of training, leaving the final steps with lr == 0.
sched = get_linear_schedule_with_warmup(
    opt,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [14]:
from tqdm import tqdm

# Contrastive training with in-batch negatives: each anchor is scored against
# every positive in the batch and cross-entropy pushes the matching (diagonal)
# pair to have the highest score.
for epoch in range(epochs):
  model.train()
  loop = tqdm(loader)
  # The description only depends on the epoch, so set it once per epoch.
  loop.set_description(f"epoch:{epoch}")
  for xb in loop:
    opt.zero_grad()
    anchor_ids = xb['anchor_ids'].to(device)
    anchor_mask = xb['anchor_mask'].to(device)
    pos_ids = xb['positive_ids'].to(device)
    pos_mask = xb['positive_mask'].to(device)

    # First element of the model output, pooled over dimension 1 with the mask.
    # NOTE(review): assumes this is (batch, seq, hidden) token embeddings -- confirm.
    a = mean_pooling(model(anchor_ids, attention_mask=anchor_mask)[0], anchor_mask)
    p = mean_pooling(model(pos_ids, attention_mask=pos_mask)[0], pos_mask)

    # Pairwise cosine similarity in one broadcast call: scores[i, j] compares
    # anchor i with positive j (same values as stacking one row per anchor).
    scores = torch.nn.functional.cosine_similarity(a.unsqueeze(1), p.unsqueeze(0), dim=-1)
    # Row i's correct "class" is column i.
    labels = torch.arange(scores.size(0), dtype=torch.long, device=scores.device)

    loss = loss_func(scores*scale, labels)
    loss.backward()
    opt.step()
    sched.step()

    loop.set_postfix(loss=f"{loss.item():.4f}")
    # NOTE(review): this `break` stops after a single batch, so only one
    # optimizer step is ever taken -- presumably a smoke test; remove it to
    # train on the whole loader.
    break


epoch:0:   0%|          | 0/157158 [00:16<?, ?it/s, loss=0.3684]


In [16]:
import os

# Directory that receives the fine-tuned weights and config.
model_path = './sbert_test_a'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no gap between
# the check and the creation, and missing parent directories are created too.
os.makedirs(model_path, exist_ok=True)

model.save_pretrained(model_path)

In [18]:
import datasets

# Rebuild the raw text pairs: SNLI + MNLI training splits (MNLI without its
# 'idx' column), restricted to rows whose label equals 0.
snli = datasets.load_dataset('snli', split='train')
mnli = datasets.load_dataset('glue', 'mnli', split='train')
mnli = mnli.remove_columns(['idx'])

merged = datasets.concatenate_datasets([snli, mnli])
dataset = merged.filter(lambda x: x['label'] == 0)

del snli, mnli
dataset

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 314315
})

In [20]:
from sentence_transformers import InputExample
from tqdm import tqdm

# One InputExample per row, holding the (premise, hypothesis) text pair.
train_samples = [
  InputExample(texts=[row['premise'], row['hypothesis']])
  for row in tqdm(dataset)
]
len(train_samples)

100%|██████████| 314315/314315 [01:11<00:00, 4368.63it/s]


314315

In [21]:
# Import the loader class directly. `from sentence_transformers import datasets`
# would rebind the name `datasets`, which other cells in this notebook use for
# the Hugging Face `datasets` library (e.g. `datasets.load_dataset`), so
# re-running those cells afterwards would fail.
from sentence_transformers.datasets import NoDuplicatesDataLoader

batch_size = 2
loader = NoDuplicatesDataLoader(train_samples, batch_size=batch_size)
len(loader)

157157

In [22]:
from sentence_transformers import models, SentenceTransformer

# Two-module pipeline: a 'bert-base-uncased' transformer followed by a pooling
# layer configured for mean-token pooling.
bert_model = models.Transformer('bert-base-uncased')
embedding_dim = bert_model.get_word_embedding_dimension()
pool_layer = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

sbert_model = SentenceTransformer(modules=[bert_model, pool_layer])
sbert_model



SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [24]:
from sentence_transformers import losses

# Library-provided multiple-negatives ranking loss, bound to `sbert_model`.
# NOTE: this rebinds `loss_func`, which an earlier cell set to a CrossEntropyLoss.
loss_func = losses.MultipleNegativesRankingLoss(
    sbert_model,
)

In [None]:
# Train for one epoch, with 10% of the total batches used as warmup steps.
epochs = 1
warmup_steps = int(len(loader)*epochs*0.1)

fit_options = dict(
    train_objectives=[(loader, loss_func)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_test_b',
    show_progress_bar=True,
)
sbert_model.fit(**fit_options)

In [28]:
# Write the SentenceTransformer pipeline to './sbert_test_b' (the same path
# that was passed to fit() as output_path).
sbert_model.save('./sbert_test_b')

In [61]:
from sentence_transformers import models, SentenceTransformer

# Reload the pipeline that was saved to disk by the sentence-transformers run.
sbert_dir = './sbert_test_b'
model_2 = SentenceTransformer(sbert_dir)
model_2

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [None]:
from transformers import AutoModel, AutoTokenizer

# Reload the weights that the manual training loop saved with save_pretrained().
bert_dir = './sbert_test_a'
model_1 = AutoModel.from_pretrained(bert_dir)
model_1

In [52]:
import datasets


def _rescale_label(example):
  """Return the example's label divided by 5.0."""
  return {'label': example['label'] / 5.0}


# STS-B validation split with every label divided by 5.0.
sts = datasets.load_dataset('glue', 'stsb', split='validation')
sts = sts.map(_rescale_label)
sts

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [54]:
# Sanity check: display the largest label after the division by 5.0.
max(sts['label'])

1.0

In [55]:
from sentence_transformers import InputExample

# One InputExample per STS-B row: the sentence pair plus its rescaled label.
val_samples = [
  InputExample(texts=[row['sentence1'], row['sentence2']], label=row['label'])
  for row in sts
]
len(val_samples)

1500

In [57]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Build a similarity evaluator from the labelled validation pairs.
# NOTE(review): the name `eval` shadows Python's builtin eval(); it is kept
# because a later cell calls `eval(model_2)`.
eval = EmbeddingSimilarityEvaluator.from_input_examples(
    val_samples,
    write_csv=True,
)

In [None]:
# Run the evaluator on model_2. Here `eval` is the EmbeddingSimilarityEvaluator
# instance assigned in the previous cell, not Python's builtin eval().
eval(model_2)