In [1]:
# !pip uninstall -y transformers accelerate sentence-transformers
# !pip uninstall -y peft
# !pip uninstall -y bitsandbytes trl

# !pip install --no-cache-dir --force-reinstall \
#   transformers==4.41.2 \
#   accelerate==0.29.3 \
#   sentence-transformers==3.1.0 \
#   datasets==2.20.0 \
#   mteb==1.1.2

In [2]:
import accelerate

print("accelerate:", accelerate.__version__)

import transformers

print("transformers:", transformers.__version__)

import sentence_transformers

print("sentence-transformers:", sentence_transformers.__version__)

accelerate: 0.29.3
transformers: 4.41.2
sentence-transformers: 3.1.0


## Unsupervised learning


### Transformer based denoising autoencoder (TSDAE)


In [3]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from datasets import Dataset, load_dataset

# Create a list of flat list of sentences
mnli = load_dataset('glue', 'mnli', split='train').select(range(25_000))
flat_sentences = list(mnli['premise']) + list(mnli['hypothesis'])

# Add noise to our input data
damaged_data = DenoisingAutoEncoderDataset(list(set(flat_sentences)))

# Create dataset
train_dataset = {
    'damaged_sentence': [],
    'original_sentence': []
}

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [5]:
from tqdm import tqdm

for data in tqdm(damaged_data):
    train_dataset['damaged_sentence'].append(data.texts[0])
    train_dataset['original_sentence'].append(data.texts[1])
    
train_dataset = Dataset.from_dict(train_dataset)

100%|██████████| 48353/48353 [00:09<00:00, 5228.68it/s]


In [6]:
train_dataset[0]

{'damaged_sentence': 'know when i and the thirties',
 'original_sentence': "eighties and you know when i left here and we're down to the thirties and twenties and"}

In [7]:
# Choose a different deletion ratio

# flat_sentences = list(set(flat_sentences))
# damaged_data = DenoisingAutoEncoderDataset(
#     flat_sentences, 
#     noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s, del_ratio=0.6)
# )

In [8]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset('glue', 'stsb', split='validation')

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2= val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']]
)

Downloading data:   0%|          | 0.00/502k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [9]:
from sentence_transformers import models
from sentence_transformers import SentenceTransformer

word_embedding_model = models.Transformer('bert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
from sentence_transformers import losses

train_loss = losses.DenoisingAutoEncoderLoss(
    embedding_model, tie_encoder_decoder=True
)

train_loss.decoder = train_loss.decoder.to('cuda')

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [11]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir='tsdae_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

In [12]:
from sentence_transformers.trainer import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33makshaysoam8[0m ([33makshaysoam8-dan-com-a-godaddy-brand[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
100,6.782
200,4.8804
300,4.6767
400,4.5833
500,4.4609
600,4.4357
700,4.3673
800,4.3825
900,4.3102
1000,4.2263


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=3023, training_loss=4.231323595517051, metrics={'train_runtime': 956.5112, 'train_samples_per_second': 50.551, 'train_steps_per_second': 3.16, 'total_flos': 0.0, 'train_loss': 4.231323595517051, 'epoch': 1.0})

In [13]:
evaluator(embedding_model)

{'pearson_cosine': 0.7026667084662546,
 'spearman_cosine': 0.7177364188051075,
 'pearson_manhattan': 0.7095405154277624,
 'spearman_manhattan': 0.7131577074602364,
 'pearson_euclidean': 0.7089718663135827,
 'spearman_euclidean': 0.7126395273873325,
 'pearson_dot': 0.5810849457875915,
 'spearman_dot': 0.578102203738832,
 'pearson_max': 0.7095405154277624,
 'spearman_max': 0.7177364188051075}

In [14]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()