<a href="https://colab.research.google.com/github/cateto/python4NLP/blob/main/sbert/sentence_vector_%EB%AA%A8%EB%8D%B8%EC%B8%A1%EC%A0%95_after_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets sentence_transformers

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 5.2 MB/s 
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 6.9 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 31.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 38.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.6 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 49.8 MB/s 
Collecting transformers<

In [3]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

# Configure the root logger: INFO level, timestamped messages, with records
# routed through the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Hugging Face checkpoint used as the backbone of the sentence encoder.
model_name = "klue/roberta-base"

# Number of sentence pairs per training batch.
train_batch_size = 32

# Number of passes over the training data.
num_epochs = 4

# Seed torch's RNG before any model or DataLoader is created so that
# torch-driven randomness (e.g. the shuffled training DataLoader) is
# repeatable across "Restart & Run All".
seed = 42
torch.manual_seed(seed)

# Timestamped output directory, so repeated runs never overwrite each other.
# "/" in the checkpoint name is replaced to keep it a single path component.
model_save_path = (
    "output/training_klue_sts_"
    + model_name.replace("/", "-")
    + "-"
    + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

# Load the "sts" configuration of the "klue" dataset; its "train" and
# "validation" splits are converted into training / dev examples further down.
datasets = load_dataset("klue", "sts")

# Spot-check one training record.
# NOTE(review): this is not the last statement of the cell, so presumably
# nothing is rendered for it - confirm whether a display() was intended.
datasets["train"][0]

# Load the "sts" configuration of the "kor_nlu" dataset; only its "test"
# split is read later, as held-out evaluation data.
testsets = load_dataset("kor_nlu", "sts")

# Spot-check one test record (same display caveat as above).
testsets["test"][0]

# Both datasets record similarity on a 0-5 scale, so every label is
# rescaled to the 0.0-1.0 range.
train_samples = []
dev_samples = []
test_samples = []

# Convert the KLUE STS train / validation splits into InputExample objects,
# sending each split to its own list.
for phase, bucket in (("train", train_samples), ("validation", dev_samples)):
    for example in datasets[phase]:
        # Rescale the gold score to 0.0-1.0.
        score = float(example["labels"]["label"] / 5.0)

        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=score,
        )
        bucket.append(inp_example)
# Convert the KorSTS test split into InputExample objects.
for example in testsets["test"]:
    sentence1, sentence2 = example["sentence1"], example["sentence2"]

    # Skip rows where either sentence is missing or empty. The append must
    # only happen for valid rows: appending unconditionally would re-add the
    # previously built example (a stale duplicate) for every skipped row.
    if not (sentence1 and sentence2):
        continue

    # Gold scores are on a 0-5 scale; rescale to 0.0-1.0.
    score = float(example["score"]) / 5.0
    test_samples.append(
        InputExample(texts=[sentence1, sentence2], label=score)
    )
                
            
# Spot-check the first converted example of each split (sentence pair, label).
# NOTE(review): these bare expressions are not the last statement of the
# cell, so presumably nothing is rendered for them - confirm intent.
train_samples[0].texts, train_samples[0].label
test_samples[0].texts, test_samples[0].label
dev_samples[0].texts, dev_samples[0].label
# Set up the DataLoader and the loss.

# Training batches, drawn in shuffled order.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Transformer backbone that produces per-token embeddings.
embedding_model = models.Transformer(model_name)

# Collapse the token embeddings into one sentence vector by mean pooling
# (CLS-token and max pooling are switched off).
pooler = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Sentence encoder = backbone followed by the pooling layer.
model = SentenceTransformer(modules=[embedding_model, pooler])

# Training objective built on the cosine similarity of each sentence pair.
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator over the KLUE validation examples, reported as "sts-dev".
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name="sts-dev",
)
# Warm up over the first 10% of all training steps
# (batches per epoch x number of epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info(f"Warmup-steps:{warmup_steps}")

# Fine-tune the model, evaluating on the dev evaluator and writing the model
# to model_save_path.
# NOTE(review): the recorded run shows 365 iterations per epoch, so
# evaluation_steps=1000 presumably never triggers mid-epoch and evaluation
# only happens at the end of each epoch - confirm this is intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
    output_path=model_save_path,
)


Downloading:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

Downloading and preparing dataset klue/sts (download: 1.29 MiB, generated: 2.82 MiB, post-processed: Unknown size, total: 4.11 MiB) to /root/.cache/huggingface/datasets/klue/sts/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90...


Downloading:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset klue downloaded and prepared to /root/.cache/huggingface/datasets/klue/sts/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

Downloading and preparing dataset kor_nlu/sts (download: 1.53 MiB, generated: 1.54 MiB, post-processed: Unknown size, total: 3.07 MiB) to /root/.cache/huggingface/datasets/kor_nlu/sts/1.0.0/4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8...


Downloading:   0%|          | 0.00/282k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/89.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/66.1k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset kor_nlu downloaded and prepared to /root/.cache/huggingface/datasets/kor_nlu/sts/1.0.0/4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

2021-11-11 04:07:16 - Use pytorch device: cuda
2021-11-11 04:07:16 - Warmup-steps:146


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/365 [00:00<?, ?it/s]

2021-11-11 04:13:42 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:
2021-11-11 04:13:47 - Cosine-Similarity :	Pearson: 0.8733	Spearman: 0.8705
2021-11-11 04:13:47 - Manhattan-Distance:	Pearson: 0.8753	Spearman: 0.8694
2021-11-11 04:13:47 - Euclidean-Distance:	Pearson: 0.8760	Spearman: 0.8700
2021-11-11 04:13:47 - Dot-Product-Similarity:	Pearson: 0.8627	Spearman: 0.8574
2021-11-11 04:13:47 - Save model to output/training_klue_sts_klue-roberta-base-2021-11-11_04-06-45


Iteration:   0%|          | 0/365 [00:00<?, ?it/s]

2021-11-11 04:19:52 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 1:
2021-11-11 04:19:56 - Cosine-Similarity :	Pearson: 0.8842	Spearman: 0.8808
2021-11-11 04:19:56 - Manhattan-Distance:	Pearson: 0.8850	Spearman: 0.8781
2021-11-11 04:19:56 - Euclidean-Distance:	Pearson: 0.8855	Spearman: 0.8787
2021-11-11 04:19:56 - Dot-Product-Similarity:	Pearson: 0.8733	Spearman: 0.8662
2021-11-11 04:19:56 - Save model to output/training_klue_sts_klue-roberta-base-2021-11-11_04-06-45


Iteration:   0%|          | 0/365 [00:00<?, ?it/s]

2021-11-11 04:26:02 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 2:
2021-11-11 04:26:07 - Cosine-Similarity :	Pearson: 0.8869	Spearman: 0.8875
2021-11-11 04:26:07 - Manhattan-Distance:	Pearson: 0.8878	Spearman: 0.8834
2021-11-11 04:26:07 - Euclidean-Distance:	Pearson: 0.8881	Spearman: 0.8842
2021-11-11 04:26:07 - Dot-Product-Similarity:	Pearson: 0.8759	Spearman: 0.8723
2021-11-11 04:26:07 - Save model to output/training_klue_sts_klue-roberta-base-2021-11-11_04-06-45


Iteration:   0%|          | 0/365 [00:00<?, ?it/s]

2021-11-11 04:32:13 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 3:
2021-11-11 04:32:17 - Cosine-Similarity :	Pearson: 0.8902	Spearman: 0.8910
2021-11-11 04:32:17 - Manhattan-Distance:	Pearson: 0.8911	Spearman: 0.8868
2021-11-11 04:32:17 - Euclidean-Distance:	Pearson: 0.8916	Spearman: 0.8873
2021-11-11 04:32:17 - Dot-Product-Similarity:	Pearson: 0.8793	Spearman: 0.8756
2021-11-11 04:32:17 - Save model to output/training_klue_sts_klue-roberta-base-2021-11-11_04-06-45


In [4]:
from sklearn.metrics import mean_squared_error

# Toy example: three sentence pairs with hand-assigned target scores.
sentences1 = [
    'This list contains the first column',
    'With your sentences',
    'You want your model to evaluate on',
]
sentences2 = [
    'Sentences contains the other column',
    'The evaluator matches sentences1[i] with sentences2[i]',
    'Compute the cosine similarity and compares it to scores[i]',
]
scores = [0.3, 0.6, 0.2]

cos_scores = []
# Score each pair with the model and print it next to its sentences.
for first, second in zip(sentences1, sentences2):
    # Embed both sides of the pair.
    embeddings1 = model.encode(first, convert_to_tensor=True)
    embeddings2 = model.encode(second, convert_to_tensor=True)

    # Cosine similarity between the two sentence embeddings.
    cosine_score = util.pytorch_cos_sim(embeddings1, embeddings2).item()
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, cosine_score))
    cos_scores.append(cosine_score)

# Mean squared error between the target scores and the model's scores.
mean_squared_error(scores, cos_scores)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

This list contains the first column 		 Sentences contains the other column 		 Score: 0.4030


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

With your sentences 		 The evaluator matches sentences1[i] with sentences2[i] 		 Score: 0.4684


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

You want your model to evaluate on 		 Compute the cosine similarity and compares it to scores[i] 		 Score: 0.2885


0.01191797069541362

In [19]:
# Number of examples in the KLUE STS training split.
datasets["train"].num_rows

11668

In [5]:
# Evaluator over the KorSTS test examples, reported as "sts-test".
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name="sts-test",
)

In [6]:
# Evaluate the fine-tuned model on the test evaluator; the returned score is
# shown as the cell output, with model_save_path passed as the output location.
test_evaluator(
    model,
    output_path=model_save_path,
)

2021-11-11 04:34:06 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2021-11-11 04:34:16 - Cosine-Similarity :	Pearson: 0.7609	Spearman: 0.7559
2021-11-11 04:34:16 - Manhattan-Distance:	Pearson: 0.7650	Spearman: 0.7625
2021-11-11 04:34:16 - Euclidean-Distance:	Pearson: 0.7637	Spearman: 0.7618
2021-11-11 04:34:16 - Dot-Product-Similarity:	Pearson: 0.7240	Spearman: 0.7163


0.7624975592092137

In [12]:
# Gold label of the first test example (already rescaled to 0.0-1.0).
test_samples[0].label

0.5

In [None]:
# Score every test pair with the fine-tuned model and compare the model's
# cosine similarities against the gold labels.

# How many scored pairs to print; printing every test pair floods the output.
num_preview_pairs = 5

first_sentences = [sample.texts[0] for sample in test_samples]
second_sentences = [sample.texts[1] for sample in test_samples]
golden_scores = [sample.label for sample in test_samples]  # original label values

# Encode each side in one batched call instead of one call per sentence.
embeddings1 = model.encode(first_sentences, convert_to_tensor=True)
embeddings2 = model.encode(second_sentences, convert_to_tensor=True)

# Row-wise cosine similarity: entry i compares first_sentences[i] with
# second_sentences[i]. These are the model's predicted similarities.
cos_scores = torch.nn.functional.cosine_similarity(
    embeddings1, embeddings2, dim=1
).tolist()

# Preview the first few pairs with their scores.
for first, second, cosine_score in zip(
    first_sentences[:num_preview_pairs],
    second_sentences[:num_preview_pairs],
    cos_scores[:num_preview_pairs],
):
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, cosine_score))

print('MSE : ')
mean_squared_error(golden_scores, cos_scores)

In [15]:
# Report the mean squared error between gold labels and model scores.
# NOTE(review): relies on golden_scores / cos_scores filled in by the scoring
# cell above, and repeats that cell's final two statements - this cell fails
# on a fresh kernel unless the scoring cell has been run first.
print('MSE : ')
mean_squared_error(golden_scores, cos_scores)

MSE : 


0.046159493475050835