In [1]:
import datasets

oscar = datasets.load_dataset('oscar', 'unshuffled_deduplicated_en', split='train', streaming=True)

ImportError: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

In [2]:
for row in oscar:
    print(row)
    break

{'id': 0, 'text': 'Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi. Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help.\nEstablished in honor of John & Lindy’s son, Christopher Blanchard, this particular program is very dear to the Blanchard family. Dana Blanchard, or Mama Dana as she is more commonly referred to at Mtendere, lived on site during the initial development, and she returns each summer to spend the season with her Malawian family. The heart of the program is to be His hands and feet by caring for the children at Mtendere, and meeting their spiritual, physical, academic, and emotional needs.\nMtendere Village is home to 134 children, living in 16 homes with a housemother and several brothers and sisters. This family environment is one that many of the children have never pre

In [3]:
import re

splitter = re.compile(r'\.\s?\n?')

In [4]:
splitter.split(row['text'])

['Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi',
 'Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help',
 'Established in honor of John & Lindy’s son, Christopher Blanchard, this particular program is very dear to the Blanchard family',
 'Dana Blanchard, or Mama Dana as she is more commonly referred to at Mtendere, lived on site during the initial development, and she returns each summer to spend the season with her Malawian family',
 'The heart of the program is to be His hands and feet by caring for the children at Mtendere, and meeting their spiritual, physical, academic, and emotional needs',
 'Mtendere Village is home to 134 children, living in 16 homes with a housemother and several brothers and sisters',
 'This family environment is one that many of the children have never prev

In [5]:
sentences = []
for row in oscar:
    new_sentences = splitter.split(row['text'])
    new_sentences = [ele for ele in new_sentences if len(ele) > 10]
    sentences.extend(new_sentences)
    if len(sentences) > 100_000:
        break
print(len(sentences))    

100041


In [6]:
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from torch.utils.data import DataLoader

dataset = DenoisingAutoEncoderDataset(sentences)
loader = DataLoader(dataset, batch_size=8, shuffle=True, drop_last=True)
len(loader)

12505

In [7]:
from sentence_transformers import SentenceTransformer, models

bert = models.Transformer('bert-base-uncased')
pooling = models.Pooling(bert.get_word_embedding_dimension(), 'cls')

model = SentenceTransformer(modules=[bert, pooling])
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [8]:
from sentence_transformers.losses import DenoisingAutoEncoderLoss

loss = DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [11]:
model.fit(
    train_objectives=[(loader, loss)],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True
)

model.save('output/tsdae-bert-base-uncased')

In [38]:
model = SentenceTransformer('output/tsdae-bert-base-uncased')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [39]:
sts = datasets.load_dataset('glue', 'stsb', split='validation')
sts

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [40]:
sts = sts.map(lambda x: {'label':x['label']/5.0})

In [41]:
from sentence_transformers import InputExample

samples = []
for row in sts:
    samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=row['label']))
len(samples)

1500

In [42]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

eval = EmbeddingSimilarityEvaluator.from_input_examples(samples, write_csv=False) 

In [43]:
eval(model)

0.7355641517487856

In [44]:
bert = models.Transformer('bert-base-uncased')
pooling = models.Pooling(bert.get_word_embedding_dimension(), 'cls')

model = SentenceTransformer(modules=[bert, pooling])
eval(model)

0.3173615247822984

In [47]:
model = SentenceTransformer('bert-base-nli-mean-tokens')
eval(model)

0.807870792395701

In [49]:
model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')
eval(model)

0.8883451646579028