In [1]:
import random
import argparse
from utils.utils import load_config
import torch
from tqdm.auto import tqdm
from utils.utils import ckpt_save
import pandas as pd
import transformers
import torchmetrics
import pytorch_lightning as pl


from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, losses, SentenceTransformerTrainer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from scipy.stats import pearsonr
import numpy as np

# seed 고정
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def compute_metrics(eval_pred): 
    logits, labels = eval_pred
    
    # logits을 제대로 차원 축소하는지 확인
    predictions = logits.squeeze()

    # labels가 텐서로 들어올 경우 numpy로 변환
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    # Pearson 상관계수 계산
    pearson_corr, _ = pearsonr(predictions, labels)

    return {"pearson_corr": pearson_corr}

In [10]:
# 1. 학습 데이터 준비
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

train_path = "../../data/train.csv"
train_dataset = pd.read_csv(train_path)
train_dataset = train_dataset[["sentence_1","sentence_2","label"]]
train_dataset["label"] = scaler.fit_transform(np.array(train_dataset["label"]).reshape(-1,1))
train_dataset = Dataset.from_pandas(train_dataset)

val_path = "../../data/dev.csv"
val_dataset = pd.read_csv(val_path)
val_dataset = val_dataset[["sentence_1","sentence_2","label"]]
val_dataset["label"] = scaler.transform(np.array(val_dataset["label"]).reshape(-1,1))
val_dataset = Dataset.from_pandas(val_dataset)

In [3]:
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
train_loss = losses.MSELoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir="./saved_model/KR_SBERT",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=1e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    lr_scheduler_type="linear"
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=train_loss,
    compute_metrics=compute_metrics,
)

trainer.train()
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_dataset["sentence_1"],
    sentences2=val_dataset["sentence_2"],
    scores=val_dataset["label"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)
dev_evaluator(model)

model.save_pretrained("./saved_model/KR_SBERT_TEST/final")

  return self.fget.__get__(instance, owner)()
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
500,0.0185,0.027847


                                                                     

In [5]:

# model load and test

test_path = "../../data/test.csv"
test_dataset = pd.read_csv(test_path)
test_dataset = test_dataset[["sentence_1","sentence_2"]]


test_model = SentenceTransformer("./saved_model/KR_SBERT_TEST/final")

predictions = []
for i in range(len(test_dataset)):
    sentence_1 = test_dataset.iloc[i]["sentence_1"]
    sentence_2 = test_dataset.iloc[i]["sentence_2"]
    
    embeddings = test_model.encode([sentence_1, sentence_2], convert_to_tensor=True)
    
    data['similarity'] = [util.cos_sim(sent1, sent2).squeeze() for sent1, sent2 in tqdm(zip(vec1, vec2), total=len(data))]
    # score = 2.5 * (similarities.item() + 1)
    # print(f"Similarities = {similarities.item()}")
    if similarities < 0 :
        print(similarities)
    # print(f"Score = {score}")
    predictions.append(similarities)

predictions = list(round(float(i), 1) for i in predictions)
output = pd.read_csv("../../data/sample_submission.csv")
output["target"] = predictions
output.to_csv('./output/output_SBERT.csv', index=False)
print("Complete Extract ouptut.csv")


tensor(-0.0300, device='cuda:0')
tensor(-0.0651, device='cuda:0')
tensor(-0.0302, device='cuda:0')
tensor(-0.0216, device='cuda:0')
tensor(-0.0462, device='cuda:0')
tensor(-0.0530, device='cuda:0')
tensor(-0.0036, device='cuda:0')
tensor(-0.0465, device='cuda:0')
tensor(-0.1353, device='cuda:0')
tensor(-0.0226, device='cuda:0')
Complete Extract ouptut.csv
