In [81]:
import os
import sys
os.environ["MODEL_DIR"] = '../model'
from pathlib import Path
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm.notebook import tqdm
import nlpaug.augmenter.word as naw
import numpy as np
from simcse import SimCSE
import torch

In [82]:
!ls ../data/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
csv		    wiki1m_for_simcse.txt  wiki_delete_one_word.csv
download_nli.sh     wiki_cropping_0.1.csv  wiki_synonym_replacement.csv
download_wiki.sh    wiki_cropping_0.1.txt  wiki_word_deletion_0.1.csv
nli_for_simcse.csv  wiki_cropping_0.2.csv  wiki_word_deletion_0.2.csv
text		    wiki_cropping_0.3.csv  wiki_word_deletion_0.3.csv


In [112]:
data_folder = Path('../data/').resolve()
data_files = {
    "train": str(data_folder / "wiki1m_for_simcse.txt")
}
extension = "text"

wiki_dataset = load_dataset(extension, data_files=data_files, cache_dir="../data/")



  0%|          | 0/1 [00:00<?, ?it/s]

In [113]:
aug = naw.SynonymAug(aug_src='wordnet')
rnd = np.random.default_rng(42)


def crop_sentence(sentence, fraction=0.1):
    return sentence[:int((1-fraction)*len(sentence))]

def delete_words(sentence, fraction=0.1):
    words = sentence.split(" ")
    n_words = len(words)
    n_words_to_keep = int((1 - fraction)*len(words))
    ids = sorted(rnd.choice(range(n_words), n_words_to_keep, replace=False))
    return " ".join([words[i] for i in ids])

def delete_one_word(sentence):
    words = sentence.split(" ")
    idx_delete = rnd.integers(len(words))
    del words[idx_delete]
    return ' '.join(words)

def replace_a_word_with_synonym(sentence):
    return aug.augment(sentence)
    
for augmentation, kwargs in [
    ("cropping", {"fraction": 0.1}),
    ("cropping", {"fraction": 0.2}),
    ("cropping", {"fraction": 0.3}),
#     ("word_deletion", {"fraction": 0.1}),
#     ("word_deletion", {"fraction": 0.2}),
#     ("word_deletion", {"fraction": 0.3}),
#     ("delete_one_word", {}),
#     ("synonym_replacement", {})
]:
    sent1 = []
    sent2 = []
    for idx, sentence in enumerate(tqdm(wiki_dataset['train'], desc=f'{augmentation}')):
        if augmentation == "cropping":
            new_sentence = crop_sentence(sentence['text'], **kwargs)
        elif augmentation == "word_deletion":
            new_sentence = delete_words(sentence['text'], **kwargs)
        elif augmentation == "delete_one_word":
            new_sentence = delete_one_word(sentence['text'], **kwargs)
        elif augmentation == "synonym_replacement":
            new_sentence = replace_a_word_with_synonym(sentence['text'], **kwargs)[0]
        else:
            raise NotImplementedError(f"Unknown augmentation: {augmentation}")
        sent1.append(sentence['text'])
        sent2.append(new_sentence)
    new_dataset = Dataset.from_dict({'sent1': sent1, 'sent2': sent2})
    suffix = f'_{kwargs["fraction"]}' if "fraction" in kwargs else ""
    new_dataset.to_csv(data_folder / f'wiki_{augmentation}{suffix}.csv', index=False)

cropping:   0%|          | 0/1000000 [00:00<?, ?it/s]

Creating CSV from Arrow format:   0%|          | 0/1000 [00:00<?, ?ba/s]

cropping:   0%|          | 0/1000000 [00:00<?, ?it/s]

Creating CSV from Arrow format:   0%|          | 0/1000 [00:00<?, ?ba/s]

cropping:   0%|          | 0/1000000 [00:00<?, ?it/s]

Creating CSV from Arrow format:   0%|          | 0/1000 [00:00<?, ?ba/s]

In [121]:
data_folder = Path('../data/').resolve()
data_files = {
    "train": str(data_folder / "wiki_cropping_0.1.csv")
}
extension = "csv"

wiki_dataset = load_dataset(extension, data_files=data_files, cache_dir="../data/")



  0%|          | 0/1 [00:00<?, ?it/s]

## Alignment and uniformity

In [15]:
from senteval.sts import STSBenchmarkEval

In [68]:
sts_b_path = Path("../SentEval/data/downstream/STS/STSBenchmark/")
sts_b_dataset = STSBenchmarkEval(sts_b_path)
sufficiently_close_sent1 = []
sufficiently_close_sent2 = []
all_sentences_1 = []
all_sentences_2 = []
gs_scores = []
for dataset in ['train', 'dev', 'test']:
    for sent1, sent2, score in zip(*sts_b_dataset.data[dataset]):
        if score > 4:
            sufficiently_close_sent1.append(" ".join(sent1))
            sufficiently_close_sent2.append(" ".join(sent2))
        all_sentences_1.append(" ".join(sent1))
        all_sentences_2.append(" ".join(sent2))
        gs_scores.append(score)

In [69]:
# def aligment(embeddings_1, embeddings_2): 
#     return torch.mean(torch.sum(torch.square(embeddings_1 - embeddings_2), dim=-1)).item()

# def uniformity(embeddings_1, embeddings_2):
#     return torch.log(torch.mean(torch.exp(-2*torch.sum(torch.square(embeddings_1 - embeddings_2), dim=-1)))).item()

def _norm(x, eps=1e-8): 
    xnorm = torch.linalg.norm(x, dim=-1)
    xnorm = torch.max(xnorm, torch.ones_like(xnorm) * eps)
    return x / xnorm.unsqueeze(dim=-1)

# from Wang and Isola (with a bit of modification)
# only consider pairs with gs > 4 (from footnote 3)
def _lalign(x, y, ok, alpha=2):
    return ((_norm(x) - _norm(y)).norm(dim=1).pow(alpha) * ok).sum() / ok.sum()

def _lunif(x, t=2):
    sq_pdist = torch.pdist(_norm(x), p=2).pow(2)
    return sq_pdist.mul(-t).exp().mean().log()

In [60]:
!ls ../result/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
delete_one_word			   wiki_cropping_0.1  wiki_word_deletion_0.1
my-unsup-simcse-bert-base-uncased  wiki_cropping_0.2  wiki_word_deletion_0.2
synonym_replacement		   wiki_cropping_0.3  wiki_word_deletion_0.3


In [110]:
results = {}
for model_name in [
    "unsup_simcse",
    "delete_one_word",
    "synonym_replacement",
    "wiki_cropping_0.1",
    "wiki_cropping_0.2",
    "wiki_cropping_0.3",
    "wiki_word_deletion_0.1",
    "wiki_word_deletion_0.2",
    "wiki_word_deletion_0.3"
]:
    model = SimCSE(str(Path("../result") / model_name))
    all_embeddings_1 = model.encode(all_sentences_1)
    all_embeddings_2 = model.encode(all_sentences_2)
    
    ok = (torch.Tensor(gs_scores) > 4).int()
    align = _lalign(
        all_embeddings_1, 
        all_embeddings_2, 
        ok).item()

    # consider all sentences (from footnote 3)
    unif = _lunif(torch.vstack([all_embeddings_1, all_embeddings_2])).item()
    results[model_name] = (align, unif)
    print(f'align {align}\t\t uniform {unif}')

Some weights of BertModel were not initialized from the model checkpoint at ../result/unsup_simcse and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
03/03/2023 17:35:01 - INFO - simcse.tool -   Use `cls_before_pooler` for unsupervised models. If you want to use other pooling policy, specify `pooler` argument.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.82it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 18.19it/s]


align 0.19683097302913666		 uniform -2.444230318069458


Some weights of BertModel were not initialized from the model checkpoint at ../result/delete_one_word and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.71it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 18.12it/s]


align 0.21235987544059753		 uniform -2.378030300140381


Some weights of BertModel were not initialized from the model checkpoint at ../result/synonym_replacement and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.66it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 18.03it/s]


align 0.18112090229988098		 uniform -2.0485191345214844


Some weights of BertModel were not initialized from the model checkpoint at ../result/wiki_cropping_0.1 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.57it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.90it/s]


align 0.2761279344558716		 uniform -2.503908634185791


Some weights of BertModel were not initialized from the model checkpoint at ../result/wiki_cropping_0.2 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.51it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.88it/s]


align 0.2948911190032959		 uniform -2.515211582183838


Some weights of BertModel were not initialized from the model checkpoint at ../result/wiki_cropping_0.3 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.50it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.78it/s]


align 0.31880515813827515		 uniform -2.5550127029418945


Some weights of BertModel were not initialized from the model checkpoint at ../result/wiki_word_deletion_0.1 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.35it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.61it/s]


align 0.20204579830169678		 uniform -2.447098731994629


Some weights of BertModel were not initialized from the model checkpoint at ../result/wiki_word_deletion_0.2 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.21it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.06it/s]


align 0.18055179715156555		 uniform -2.2859609127044678


Some weights of BertModel were not initialized from the model checkpoint at ../result/wiki_word_deletion_0.3 and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.06it/s]
100%|███████████████████████████████████████████████████████████████████████| 135/135 [00:07<00:00, 17.32it/s]


align 0.17033952474594116		 uniform -2.0445778369903564


In [50]:
uniformity(all_embeddings_1, all_embeddings_2)

-0.7785198092460632