In [1]:
# !pip install -q sentence-transformers datasets

In [2]:
import sentence_transformers

In [None]:
import datasets
oscar = datasets.load_dataset('oscar', 'unshuffled_deduplicated_en', streaming=True, split='train')
oscar

In [4]:
for row in oscar:
  break
row

{'id': 0,
 'text': 'Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi. Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help.\nEstablished in honor of John & Lindy’s son, Christopher Blanchard, this particular program is very dear to the Blanchard family. Dana Blanchard, or Mama Dana as she is more commonly referred to at Mtendere, lived on site during the initial development, and she returns each summer to spend the season with her Malawian family. The heart of the program is to be His hands and feet by caring for the children at Mtendere, and meeting their spiritual, physical, academic, and emotional needs.\nMtendere Village is home to 134 children, living in 16 homes with a housemother and several brothers and sisters. This family environment is one that many of the children have never pr

In [5]:
import re
spliter = re.compile(r'\.\s?\n?')

In [6]:
spliter.split(row['text'])

['Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi',
 'Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help',
 'Established in honor of John & Lindy’s son, Christopher Blanchard, this particular program is very dear to the Blanchard family',
 'Dana Blanchard, or Mama Dana as she is more commonly referred to at Mtendere, lived on site during the initial development, and she returns each summer to spend the season with her Malawian family',
 'The heart of the program is to be His hands and feet by caring for the children at Mtendere, and meeting their spiritual, physical, academic, and emotional needs',
 'Mtendere Village is home to 134 children, living in 16 homes with a housemother and several brothers and sisters',
 'This family environment is one that many of the children have never prev

In [7]:
sentences = []
for row in oscar:
  s = spliter.split(row['text'])
  s = [ele for ele in s if ele and len(ele) > 10]
  sentences.extend(s)
  if len(sentences) > 10_000:
    break
len(sentences)

10037

In [8]:
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from torch.utils.data import DataLoader

train_data = DenoisingAutoEncoderDataset(sentences)
loader = DataLoader(train_data, batch_size=2, shuffle=True, drop_last=True)
len(loader)

5018

In [9]:
from sentence_transformers import models, SentenceTransformer
bert = models.Transformer('bert-base-uncased')
pool = models.Pooling(bert.get_word_embedding_dimension(), 'cls')
model = SentenceTransformer(modules=[bert, pool])
model



SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [10]:
from sentence_transformers.losses import DenoisingAutoEncoderLoss
loss_func = DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
model.fit(
    train_objectives=[(loader, loss_func)],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True
)

In [14]:
model.save('output/tsdae-bert-base-uncased')

In [15]:
stsb = datasets.load_dataset('glue', 'stsb', split='validation')
stsb = stsb.map(lambda x: {'label': x['label']/5.0})
stsb

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/502k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [16]:
from sentence_transformers import InputExample
val_samples = []
for row in stsb:
  val_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=row['label']))
len(val_samples)

1500

In [17]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
eval = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

In [18]:
model = SentenceTransformer('output/tsdae-bert-base-uncased')

In [19]:
%%time
eval(model)

CPU times: user 5 s, sys: 72.8 ms, total: 5.07 s
Wall time: 5.25 s


0.45509176068289825

In [20]:
from sentence_transformers import models, SentenceTransformer
bert = models.Transformer('bert-base-uncased')
pool = models.Pooling(bert.get_word_embedding_dimension(), 'cls')
bert_model = SentenceTransformer(modules=[bert, pool])
bert_model



SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [21]:
%%time
eval(bert_model)

CPU times: user 4.62 s, sys: 51.3 ms, total: 4.67 s
Wall time: 4.53 s


0.3173606997231506

In [None]:
bert_nli_model = SentenceTransformer('bert-base-nli-mean-tokens')
bert_nli_model

In [23]:
%%time
eval(bert_nli_model)

CPU times: user 4.6 s, sys: 64.7 ms, total: 4.67 s
Wall time: 4.65 s


0.8078718290448562

In [None]:
mpnet_model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')
mpnet_model

In [25]:
%%time
eval(mpnet_model)

CPU times: user 4.24 s, sys: 78 ms, total: 4.31 s
Wall time: 5.89 s


0.8883451646579028