<a href="https://colab.research.google.com/github/alanmatys/pytorch_examples/blob/dev/TP_NLP_Alan_Matys_STS_SemanticSim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets spacy fasttext transformers seqeval ipdb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [None]:
# Cargamos datos de STSb
from datasets import load_dataset

dataset = load_dataset("stsb_multi_mt","es")

dataset

Downloading builder script:   0%|          | 0.00/7.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.98k [00:00<?, ?B/s]

Downloading and preparing dataset stsb_multi_mt/es to /root/.cache/huggingface/datasets/stsb_multi_mt/es/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/257k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/57.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Dataset stsb_multi_mt downloaded and prepared to /root/.cache/huggingface/datasets/stsb_multi_mt/es/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})

In [None]:
from transformers import AutoTokenizer

model_name = 'alanmatys/beto-nliudesa'

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(example):
    tokenization1 = tokenizer(example["sentence1"], truncation=True, padding=False)
    tokenization2 = tokenizer(example['sentence2'], truncation=True, padding=False)

    tokenization1 = {f"sent1_{k}":v for k,v in tokenization1.items()}
    tokenization2 = {f"sent2_{k}":v for k, v in tokenization2.items()}

    tokenization = tokenization1.copy()
    tokenization.update(tokenization2)
    return tokenization

tokenized_dataset = dataset.map(tokenize, batched=True)



In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score', 'sent1_input_ids', 'sent1_token_type_ids', 'sent1_attention_mask', 'sent2_input_ids', 'sent2_token_type_ids', 'sent2_attention_mask'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score', 'sent1_input_ids', 'sent1_token_type_ids', 'sent1_attention_mask', 'sent2_input_ids', 'sent2_token_type_ids', 'sent2_attention_mask'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score', 'sent1_input_ids', 'sent1_token_type_ids', 'sent1_attention_mask', 'sent2_input_ids', 'sent2_token_type_ids', 'sent2_attention_mask'],
        num_rows: 1500
    })
})

In [None]:
from torch.utils.data import DataLoader
import torch

def collate_batch(batch):
    new_batch = {}
    for example in batch:
        for k, v in example.items():
            if k not in new_batch:
                new_batch[k] = []
            new_batch[k].append(v)
    batch = new_batch

    labels = batch.pop("similarity_score")

    premise_inputs = tokenizer.pad(
        {
            "input_ids": batch['sent1_input_ids'],
            "token_type_ids": batch['sent1_token_type_ids'],
            "attention_mask": batch['sent1_attention_mask'],
        },
        return_tensors="pt"
    )

    hyp_inputs = tokenizer.pad(
        {
            "input_ids": batch['sent2_input_ids'],
            "token_type_ids": batch['sent2_token_type_ids'],
            "attention_mask": batch['sent2_attention_mask'],
        },
        return_tensors="pt"
    )

    #premise_output = {f"premise_{k}":v for k,v in premise_inputs.items()}
    #hyp_output = {f"hyp_{k}":v for k, v in hyp_inputs.items()}

    labels = torch.LongTensor(labels)

    #inputs = premise_output.copy()
    #inputs.update(hyp_output)

    return premise_inputs, hyp_inputs, labels

train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=32, collate_fn=collate_batch,
                              pin_memory=True, num_workers=4)
test_dataloader = DataLoader(tokenized_dataset["test"], batch_size=16, collate_fn=collate_batch)
dev_dataloader = DataLoader(tokenized_dataset["dev"], batch_size=16, collate_fn=collate_batch)

In [None]:
import torch.nn as nn
import torch
class SBETOSTS(nn.Module):
    def __init__(self, base_model):
      super().__init__()
      self.bert = base_model
      self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    def forward(self, sent1_inputs, sent2_inputs):
      sent1 = self.bert(**sent1_inputs)
      sent1_pool = sent1.pooler_output

      sent2 = self.bert(**sent2_inputs)
      sent2_pool = sent2.pooler_output

      output = self.cos(sent1_pool,sent2_pool)

      return output

## Modelo FT

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz && gunzip cc.es.300.bin.gz

--2023-06-16 01:01:34--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.8.29, 13.35.8.19, 13.35.8.35, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.8.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4500107671 (4.2G) [application/octet-stream]
Saving to: ‘cc.es.300.bin.gz’


2023-06-16 01:02:02 (151 MB/s) - ‘cc.es.300.bin.gz’ saved [4500107671/4500107671]



In [None]:
import fasttext
# Load the pre-trained FastText model
ft_model = fasttext.load_model("cc.es.300.bin")



In [None]:
from tqdm.auto import tqdm

def ft_collate_batch(batch):
    new_batch = {}
    for example in batch:
        for k, v in example.items():
            if k not in new_batch:
                new_batch[k] = []
            new_batch[k].append(v)
    batch = new_batch

    labels = batch.pop("similarity_score")


    labels = torch.LongTensor(labels)


    return batch, labels

ft_dataloader = DataLoader(tokenized_dataset["train"], batch_size=32,collate_fn=ft_collate_batch,
                              pin_memory=True, num_workers=4)




In [None]:
from scipy.stats import spearmanr
def ft_spearman(model,ft_dataloader):
  """

  """
  similarity = nn.CosineSimilarity(dim=1, eps=1e-3)

  list_similarities = []
  list_labels = []


  for batch, labels in tqdm(ft_dataloader):
    sentence1 = batch["sentence1"]
    sentence2 = batch["sentence2"]

    sentence1_embedding = [model.get_sentence_vector(x) for x in sentence1]
    sentence2_embedding = [model.get_sentence_vector(x) for x in sentence2]


    sim = similarity(torch.tensor(sentence1_embedding), torch.tensor(sentence2_embedding))

    list_similarities.append(sim)
    list_labels.append(labels)


  preds = torch.cat(list_similarities).cpu().numpy()
  labels = torch.cat(list_labels).cpu().numpy()

  return spearmanr(preds,labels)

ft_spearman(ft_model, ft_dataloader)



  0%|          | 0/180 [00:01<?, ?it/s]

SignificanceResult(statistic=0.5082101461231864, pvalue=0.0)

In [None]:
del ft_model, ft_dataloader

## Corrida con BETO

In [None]:
from transformers import AutoModel
from tqdm.auto import tqdm
from scipy.stats import spearmanr

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = AutoModel.from_pretrained(model_name)
model = SBETOSTS(base_model)
model = model.to(device)

def beto_spearman(model, dataloader):
  """

  """
  all_labels = []
  all_outputs = []
  for batch in tqdm(dataloader):
        prem_inputs, hyp_inputs, label = batch
        if device:
            prem_inputs = {k:v.to(device) for k, v in prem_inputs.items()}
            hyp_inputs = {k:v.to(device) for k, v in hyp_inputs.items()}
            label = label.to(device)

        outs = model.forward(prem_inputs, hyp_inputs)
        all_labels.append(label.detach().cpu())
        all_outputs.append(outs.detach().cpu())

  preds = torch.cat(all_outputs).cpu().numpy()
  gold = torch.cat(all_labels).cpu().numpy()
  return spearmanr(gold, preds)

beto_spearman(model, train_dataloader)

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/180 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


SignificanceResult(statistic=0.60389520227956, pvalue=0.0)

## BETO comun

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model2 = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
model2 = SBETOSTS(base_model2)
model2 = model2.to(device)

beto_spearman(model2, train_dataloader)

Downloading (…)lve/main/config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.we

  0%|          | 0/180 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `_

SignificanceResult(statistic=0.3187432685110718, pvalue=6.564351111760902e-136)