<a href="https://colab.research.google.com/github/arooncookiedodo/HuggingFaceStudy/blob/main/LLMembed_SmolLM2_360M_stackOverFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers tqdm datasets

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from datasets import load_dataset

# 멀티 레이블 데이터셋 로드
sof_dataset_train = load_dataset(
    'csv',
    data_files='/content/drive/MyDrive/AiExpertCource/project/dataset/rev_tag_training_samples.csv',
    split='train'
)

# Title과 Body를 결합하여 Body 업데이트
def update_body(example):
    updated_body = example['Title'] + ' ' + example['Body']
    example['Body'] = updated_body
    return example

# map 함수를 사용하여 모든 데이터에 update_body 함수 적용
sof_dataset_train = sof_dataset_train.map(update_body)

# # 변경된 데이터셋 출력
# print(sof_dataset_train['Title'][0])
# print(sof_dataset_train['Body'][0])
print(sof_dataset_train)

# 레이블 이름으로 구성된 리스트
labels = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

# 레이블 이름을 키로 하고 카운터를 값으로 하는 딕셔너리 생성
label_counter = {label: 0 for label in labels}

for example in sof_dataset_train:
    # 각 샘플에서 0과 1로 표현된 레이블 상태를 이용해 카운트를 업데이트합니다.
    for label in labels:
        if example[label] == 1:
            label_counter[label] += 1

print(label_counter)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
    num_rows: 20000
})
{'Algorithms': 4225, 'Backend': 3723, 'Data Science': 3487, 'Databases': 3072, 'Dev Tools': 3655, 'Frontend': 3966, 'Mobile': 2683, 'Systems': 3996, 'iOS/macOS': 2587}


In [4]:
from datasets import load_dataset

# 멀티 레이블 데이터셋 로드
sof_dataset_val = load_dataset(
    'csv',
    data_files='/content/drive/MyDrive/AiExpertCource/project/dataset/rev_tag_validation_samples.csv',
    split='train'
)

# Title과 Body를 결합하여 Body 업데이트
def update_body(example):
    updated_body = example['Title'] + ' ' + example['Body']
    example['Body'] = updated_body
    return example

# map 함수를 사용하여 모든 데이터에 update_body 함수 적용
sof_dataset_val = sof_dataset_val.map(update_body)

# print(sof_dataset_val['Title'][0])
# print(sof_dataset_val['Body'][0])
print(sof_dataset_val)

# 레이블 이름으로 구성된 리스트
labels = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

# 레이블 이름을 키로 하고 카운터를 값으로 하는 딕셔너리 생성
label_counter = {label: 0 for label in labels}

for example in sof_dataset_val:
    # 각 샘플에서 0과 1로 표현된 레이블 상태를 이용해 카운트를 업데이트합니다.
    for label in labels:
        if example[label] == 1:
            label_counter[label] += 1

print(label_counter)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6500 [00:00<?, ? examples/s]

Dataset({
    features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
    num_rows: 6500
})
{'Algorithms': 1573, 'Backend': 722, 'Data Science': 1519, 'Databases': 230, 'Dev Tools': 656, 'Frontend': 1821, 'Mobile': 167, 'Systems': 1178, 'iOS/macOS': 28}


In [5]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

sof_dataset= DatasetDict({
    "train": sof_dataset_train,
    "test": sof_dataset_val
})

# DatasetDict 출력
print(sof_dataset)


DatasetDict({
    train: Dataset({
        features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['Title', 'Body', 'Tags_filtered', 'Tags_list', 'Tags_new', 'Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS'],
        num_rows: 6500
    })
})


# **1. SmolLM Embedding**

In [9]:
# -*- coding: utf-8 -*-
import os
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from tqdm import trange
from datasets import load_dataset
import argparse

def rep_extract(task, mode, device, sents, labels, max_len, step):

    # 모델 변경
    model_id = "HuggingFaceTB/SmolLM2-360M"

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token="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")
    tokenizer.pad_token = tokenizer.eos_token

    config_kwargs = {
        "trust_remote_code": True,
        "cache_dir": None,
        "revision": 'main',
        "use_auth_token": "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",
        "output_hidden_states": True
    }
    model_config = AutoConfig.from_pretrained(model_id, **config_kwargs)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_auth_token="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",
        trust_remote_code=True,
        config=model_config,
        device_map=device,
        torch_dtype=torch.float16)
    model.eval()

    sents_reps = []
    for idx in trange(0, len(sents), step):
        idx_end = idx + step
        if idx_end > len(sents):
            idx_end = len(sents)
        sents_batch = sents[idx: idx_end]

        sents_batch_encoding = tokenizer(sents_batch, return_tensors='pt', max_length=max_len, padding="max_length", truncation=True)
        sents_batch_encoding = sents_batch_encoding.to(device)

        with torch.no_grad():
            batch_outputs = model(**sents_batch_encoding)
            reps_batch_5L = []

            # 마지막 5개 레이어의 평균을 추출
            for layer in range(-1, -6, -1):
                reps_batch_5L.append(torch.mean(batch_outputs.hidden_states[layer], axis=1))
            reps_batch_5L = torch.stack(reps_batch_5L, axis=1)

        sents_reps.append(reps_batch_5L.cpu())
    sents_reps = torch.cat(sents_reps)

    # 멀티 레이블 데이터를 처리하기 위한 레이블 로직 수정
    labels = torch.tensor(labels, dtype=torch.float32)

    print(sents_reps.shape)
    print(labels.shape)
    path = f'/content/drive/MyDrive/AiExpertCource/project/dataset/smolLM2/{task}/dataset_tensor/'
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(sents_reps.to('cpu'), path + f'{mode}_sents.pt')
    torch.save(labels, path + f'{mode}_labels.pt')

if __name__ == '__main__':
    cuda_no = 0  # GPU 번호
    task = 'stackoverflow'  # Task 이름
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 레이블에 해당하는 컬럼 이름 목록
    label_cols = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

    # 학습 데이터셋에서 텍스트 추출
    sents = sof_dataset['train']['Body']

    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # 각 행에 대한 루프
    for i in range(len(sof_dataset['train'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        for category in label_cols:
            labels_temp.append(sof_dataset['train'][category][i])  # 현재 행의 해당 카테고리 레이블 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가

    rep_extract(task, 'train', device, sents, labels, 2048, 5)

    # 테스트 데이터셋에서 텍스트 추출
    sents = sof_dataset['test']['Body']

    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # 각 행에 대한 루프
    for i in range(len(sof_dataset['test'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        for category in label_cols:
            labels_temp.append(sof_dataset['test'][category][i])  # 현재 행의 해당 카테고리 레이블 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가

    rep_extract(task, 'test', device, sents, labels, 2048, 5)




tokenizer_config.json:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

  0%|          | 0/4000 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 4000/4000 [09:41<00:00,  6.88it/s]


torch.Size([20000, 5, 960])
torch.Size([20000, 9])


100%|██████████| 1300/1300 [03:08<00:00,  6.90it/s]


torch.Size([6500, 5, 960])
torch.Size([6500, 9])


# **2. BERT Embedding**

In [7]:
# -*- coding: utf-8 -*-
import os
import torch
from transformers import BertTokenizer, BertModel
from tqdm import trange
from datasets import load_dataset
import argparse

def rep_extract(task, mode, device, sents, labels):
    model_path = 'google-bert/bert-large-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path).to(device)
    model.eval()

    max_len = 512
    sents_reps = []
    step = 10
    for idx in trange(0, len(sents), step):
        idx_end = idx + step
        if idx_end > len(sents):
            idx_end = len(sents)
        sents_batch = sents[idx: idx_end]

        sents_batch_encoding = tokenizer(sents_batch, return_tensors='pt', max_length=max_len, padding="max_length", truncation=True)
        sents_batch_encoding = sents_batch_encoding.to(device)

        with torch.no_grad():
            batch_outputs = model(**sents_batch_encoding)
            reps_batch = batch_outputs.pooler_output
        sents_reps.append(reps_batch.cpu())
    sents_reps = torch.cat(sents_reps)

    # 멀티 레이블 데이터를 처리하기 위한 레이블 로직 수정
    labels = torch.tensor(labels, dtype=torch.float32)

    print(sents_reps.shape)
    print(labels.shape)
    path = f'/content/drive/MyDrive/AiExpertCource/project/dataset/bert/{task}/dataset_tensor/'
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(sents_reps.to('cpu'), path + f'{mode}_sents.pt')
    torch.save(labels, path + f'{mode}_labels.pt')

if __name__ == '__main__':

    cuda_no = 0 # GPU 번호
    task = 'stackoverflow' # Task 이름
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 레이블에 해당하는 컬럼 이름 목록
    label_cols = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

    # 데이터셋 로딩
    # sof_dataset = load_dataset("path_to_your_dataset")  # 실제 데이터셋 경로로 변경 필요

    sents = sof_dataset['train']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    # print(sents)
    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['train'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:

            labels_temp.append(sof_dataset['train'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가

    rep_extract(task, 'train', device, sents, labels)

    sents = sof_dataset['test']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['test'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:
            labels_temp.append(sof_dataset['test'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가


    rep_extract(task, 'test', device, sents, labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

100%|██████████| 2000/2000 [08:08<00:00,  4.09it/s]


torch.Size([20000, 1024])
torch.Size([20000, 9])


100%|██████████| 650/650 [02:36<00:00,  4.15it/s]

torch.Size([6500, 1024])
torch.Size([6500, 9])





# **3. Roberta Embedding**

In [8]:
# -*- coding: utf-8 -*-
import os
import torch
from transformers import RobertaTokenizer, RobertaModel
from tqdm import trange
from datasets import load_dataset

def rep_extract(task, mode, device, sents, labels):
    model_path = 'roberta-large'
    tokenizer = RobertaTokenizer.from_pretrained(model_path)
    model = RobertaModel.from_pretrained(model_path).to(device)
    model.eval()

    max_len = 512
    sents_reps = []
    step = 10  # 작은 배치 크기로 설정하는 것이 좋습니다, 메모리 오류를 방지하기 위해
    for idx in trange(0, len(sents), step):
        idx_end = idx + step
        if idx_end > len(sents):
            idx_end = len(sents)
        sents_batch = sents[idx: idx_end]

        sents_batch_encoding = tokenizer(sents_batch, return_tensors='pt', max_length=max_len, padding="max_length", truncation=True)
        sents_batch_encoding = sents_batch_encoding.to(device)

        with torch.no_grad():
            batch_outputs = model(**sents_batch_encoding)
            # 첫 번째 토큰([CLS] 토큰에 해당)의 표현을 추출
            reps_batch = batch_outputs.last_hidden_state[:, 0, :]
        sents_reps.append(reps_batch.cpu())
    sents_reps = torch.cat(sents_reps)

    # 멀티 레이블 데이터를 처리하기 위한 레이블 로직 수정
    labels = torch.tensor(labels, dtype=torch.float32)

    print(sents_reps.shape)
    print(labels.shape)
    path = f'/content/drive/MyDrive/AiExpertCource/project/dataset/roberta/{task}/dataset_tensor/'
    if not os.path.exists(path):
        os.makedirs(path)
    torch.save(sents_reps.to('cpu'), path + f'{mode}_sents.pt')
    torch.save(labels, path + f'{mode}_labels.pt')

if __name__ == '__main__':
    cuda_no = 0 # GPU 번호
    task = 'stackoverflow' # Task 이름

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 레이블에 해당하는 컬럼 이름 목록
    label_cols = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

    # 데이터셋 로딩
    # sof_dataset = load_dataset("path_to_your_dataset")  # 실제 데이터셋 경로로 변경 필요

    sents = sof_dataset['train']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    # print(sents)
    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['train'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:

            labels_temp.append(sof_dataset['train'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가

    rep_extract(task, 'train', device, sents, labels)

    sents = sof_dataset['test']['Body']  # 데이터 프레임에서 텍스트만을 추출하는 부분 수정 필요

    labels = []  # 최종 레이블을 저장할 2차원 리스트 초기화

    # sof_dataset['train']의 각 행에 대한 루프
    for i in range(len(sof_dataset['test'])):
        labels_temp = []  # 현재 행의 레이블을 저장할 임시 리스트
        # 각 카테고리에 대해 루프를 돌면서 labels_temp 리스트에 레이블을 추가
        for category in label_cols:
            labels_temp.append(sof_dataset['test'][category][i])  # 현재 행의 해당 카테고리 레이블을 추가
        labels.append(labels_temp)  # 완성된 레이블 리스트를 labels 리스트에 추가


    rep_extract(task, 'test', device, sents, labels)



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2000/2000 [08:17<00:00,  4.02it/s]


torch.Size([20000, 1024])
torch.Size([20000, 9])


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 650/650 [02:41<00:00,  4.02it/s]

torch.Size([6500, 1024])
torch.Size([6500, 9])





# **4. Dataset 정의**

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):

    # 4개의 텐서 파일을 가져옴
    def __init__(self, mode, l_path, b_path, r_path):
        self.l_sents_reps = torch.load(l_path + f'{mode}_sents.pt')
        self.b_sents_reps = torch.load(b_path + f'{mode}_sents.pt')
        self.r_sents_reps = torch.load(r_path + f'{mode}_sents.pt')

        self.labels = torch.load(l_path + f'{mode}_labels.pt')

        self.sample_num = self.labels.shape[0]

    # 인덱스를 받아 해당 샘플의 데이터를 반환하는 메소드
    def __getitem__(self, index):
        return self.l_sents_reps[index], self.b_sents_reps[index], self.r_sents_reps[index], self.labels[index]

    # 데이터셋의 총 샘플 수를 반환하는 메소드
    def __len__(self):
        return self.sample_num

# * Model Operation - 검증셋 테스트

In [12]:
import locale

# Override the default locale encoding to UTF-8
locale.getpreferredencoding = lambda: "UTF-8"

!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
import numpy as np
import wandb
import evaluate
from tqdm import tqdm
import torch

# wandb.init(project="huggingface") # Uncomment to use wandb

# Load metrics for multi-label classification
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")



# Function to compute metrics for multi-label classification
def compute_metrics(predictions, references):
    # For multi-label classification, we need to threshold our predictions
    predictions = (torch.sigmoid(predictions) > 0.5).int()
    references = references.int()


    # Convert torch tensors to numpy arrays
    predictions = predictions.cpu().numpy()
    references = references.cpu().numpy()

    # Flatten arrays to compute global metrics, not per class
    flat_predictions = predictions.flatten()
    flat_references = references.flatten()

    # Calculate sample-based accuracy
    sample_accuracy = (predictions == references).all(axis=1).mean()

    metrics = {
        "sample_accuracy": sample_accuracy,
        "flat_accuracy": accuracy_metric.compute(predictions=flat_predictions, references=flat_references)["accuracy"],
        "precision": precision_metric.compute(predictions=flat_predictions, references=flat_references, average="weighted")["precision"],
        "recall": recall_metric.compute(predictions=flat_predictions, references=flat_references, average="weighted")["recall"],
        "f1": f1_metric.compute(predictions=flat_predictions, references=flat_references, average="weighted")["f1"],
    }
    return metrics

# Training and validation function for multi-label classification
def Train_and_Evaluate(dataloader_train, dataloader_val, device, model, loss_fn, optimizer):
    # Training
    model.train()
    total_train_loss = 0
    for batch in tqdm(dataloader_train):
        batch_l, batch_b, batch_r, batch_y = [item.to(device) for item in batch]
        pred = model(batch_l.float(), batch_b.float(), batch_r.float())
        loss = loss_fn(pred, batch_y.float())  # Make sure batch_y is a float tensor
        total_train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(dataloader_train)

    # Validation
    model.eval()
    total_val_loss = 0
    all_predictions = []
    all_references = []
    with torch.no_grad():
        for batch in tqdm(dataloader_val):
            batch_l, batch_b, batch_r, batch_y = [item.to(device) for item in batch]
            pred = model(batch_l.float(), batch_b.float(), batch_r.float())
            loss = loss_fn(pred, batch_y.float())  # Make sure batch_y is a float tensor
            total_val_loss += loss.item()

            # Threshold predictions for multi-label classification
            pred_y = (torch.sigmoid(pred) > 0.5).int()
            all_predictions.append(pred_y)
            all_references.append(batch_y)

    # Concatenate all predictions and references
    all_predictions = torch.cat(all_predictions, dim=0)
    all_references = torch.cat(all_references, dim=0)

    avg_val_loss = total_val_loss / len(dataloader_val)

    # Compute metrics
    metrics = compute_metrics(all_predictions, all_references)
    metrics['train_loss'] = avg_train_loss
    metrics['val_loss'] = avg_val_loss

    # Uncomment the following line to log metrics to wandb
    # wandb.log(metrics)

    print({metric: f"{value:.4f}" for metric, value in metrics.items()})

# Example usage
# Train_and_Evaluate(dataloader_train, dataloader_val, device, model, loss_fn, optimizer)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

# **6. Downstream Model Class**

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DownstreamModel(nn.Module):
    def __init__(self, class_num, SIGMA):
        super(DownstreamModel, self).__init__()
        self.SIGMA = SIGMA
        self.compress_layers = nn.ModuleList()
        for _ in range(5):
            layers = []
            # Change the input size of the linear layer to 960 to match the input_l shape
            layers.append(nn.Linear(960, 1024))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.5))
            self.compress_layers.append(nn.Sequential(*layers))

        # Change the input size of fc1 to match the actual input size (1009)
        self.fc1 = nn.Linear(1009, 1024)

        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(1024, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, class_num)
        # 변경된 부분: softmax 대신 sigmoid 활성화 함수 사용
        # self.softmax = nn.Softmax(dim=1)

    def forward(self, input_l, input_b, input_r):
        batch_size = input_l.shape[0]

        # input_l 텐서를 첫 번째 차원을 기준으로 1 크기의 텐서로 분할
        split_tensors = torch.split(input_l, 1, dim=1)
        input = []

        # 분할된 텐서들을 순회
        for i, split_tensor in enumerate(split_tensors):
            # 각 split_tensor를 배치 크기에 맞게 2차원으로 재구성
            split_tensor = split_tensor.reshape(batch_size,-1)
            # 재구성된 텐서를 압축(compress) layer를 거쳐 변환
            input.append(self.compress_layers[i](split_tensor))

        # input_b(bert 임베딩)와 input_r(Roberta 임베딩)을 input에 추가
        input.append(input_b)
        input.append(input_r)
        input = torch.stack(input, dim=1)
        # X * X^T
        input_T = input.transpose(1, 2)
        input_P = torch.matmul(input, input_T)
        input_P = input_P.reshape(batch_size, -1)
        # PN func
        input_P = 2*F.sigmoid(self.SIGMA * input_P) - 1

        a = torch.mean(input_l, dim=1)
        input = torch.cat([input_P, a], dim=1)

        output = self.fc1(input) # input is (batch_size, 1009)
        output = self.relu1(output)
        output = self.dropout1(output)
        output = self.fc2(output)
        output = self.relu2(output)
        output = self.dropout2(output)
        output = self.fc3(output)  # 마지막 레이어로 클래스 수에 맞게 출력

        return output  # output은 (batch_size, class_num) 형태여야 함


# **7. Fine-tuning**

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader



if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    task = 'stackoverflow'  # 사용 가능한 옵션: 'sst2', 'mr', 'agnews', 'r8', 'r52', 'stackoverflow' 중 하나
    epochs = 10  # 원하는 에폭 수
    SIGMA = 0.1  # SIGMA 값을 설정
    batch_size = 1024  # 배치 크기 설정
    lr = 1e-4  # 학습률 설정

    class_num = {'sst2': 2, 'mr': 2, 'agnews': 5, 'r8': 8, 'r52': 52, 'stackoverflow': 9}
    class_num = class_num[task]

    l_dataset_path = f'/content/drive/MyDrive/AiExpertCource/project/dataset/smolLM2/{task}/dataset_tensor/'
    b_dataset_path = f'/content/drive/MyDrive/AiExpertCource/project/dataset/bert/{task}/dataset_tensor/'
    r_dataset_path = f'/content/drive/MyDrive/AiExpertCource/project/dataset/roberta/{task}/dataset_tensor/'

    # Train dataset
    train_data = MyDataset('train', l_dataset_path, b_dataset_path, r_dataset_path)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

    # Test dataset
    test_data = MyDataset('test', l_dataset_path, b_dataset_path, r_dataset_path)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    model = DownstreamModel(class_num, SIGMA).to(device)

    loss_fn = nn.BCEWithLogitsLoss().to(device)  # 멀티 레이블 손실 함수
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print('training ...')
    for epoch in range(epochs):
        model = model.to(device)
        print(f'--------------------------- epoch {epoch} ---------------------------')
        Train_and_Evaluate(train_loader, test_loader, device, model, loss_fn, optimizer)

    # 모델 학습 후, 모델의 가중치 저장하기
    model_save_path = f"/content/drive/MyDrive/AiExpertCource/project/dataset/{task}_model_weights.pth"
    torch.save(model.state_dict(), model_save_path)


  self.l_sents_reps = torch.load(l_path + f'{mode}_sents.pt')
  self.b_sents_reps = torch.load(b_path + f'{mode}_sents.pt')
  self.r_sents_reps = torch.load(r_path + f'{mode}_sents.pt')
  self.labels = torch.load(l_path + f'{mode}_labels.pt')


training ...
--------------------------- epoch 0 ---------------------------


  0%|          | 0/20 [00:00<?, ?it/s]Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff6233e7be0><function _MultiProcessingDataLoaderIter.__del__ at 0x7ff6233e7be0>Exception ignored in: 

Exception ignored in: Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7ff6233e7be0>Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7ff6233e7be0>  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__

  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__

Traceback (most recent call last):
Traceback (most recent call last):
      File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
      File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
self._shutdown_workers()self._shutdown_wo

{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 1 ---------------------------


  0%|          | 0/20 [00:00<?, ?it/s]Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff6233e7be0><function _MultiProcessingDataLoaderIter.__del__ at 0x7ff6233e7be0>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
        self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1587, in _shutdown_workers
        if w.is_alive():if w.is_alive():

  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
        assert self._

{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 2 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 39.42it/s]
100%|██████████| 7/7 [00:00<00:00, 21.14it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 3 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 39.08it/s]
100%|██████████| 7/7 [00:00<00:00, 21.37it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 4 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 39.01it/s]
100%|██████████| 7/7 [00:00<00:00, 21.05it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 5 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 37.05it/s]
100%|██████████| 7/7 [00:00<00:00, 21.01it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 6 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 36.95it/s]
100%|██████████| 7/7 [00:00<00:00, 20.85it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 7 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 33.70it/s]
100%|██████████| 7/7 [00:00<00:00, 19.70it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 8 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 39.46it/s]
100%|██████████| 7/7 [00:00<00:00, 21.78it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}
--------------------------- epoch 9 ---------------------------


100%|██████████| 20/20 [00:00<00:00, 38.96it/s]
100%|██████████| 7/7 [00:00<00:00, 21.00it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'sample_accuracy': '0.0000', 'flat_accuracy': '0.8651', 'precision': '0.7483', 'recall': '0.8651', 'f1': '0.8025', 'train_loss': 'nan', 'val_loss': 'nan'}


# **8. 추론 메서드 정의**

In [30]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from transformers import AutoConfig

# Load the tokenizers and models for SmolLM, BERT, and Roberta
smollm_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M", use_auth_token="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", trust_remote_code=True)
smollm_tokenizer.pad_token = smollm_tokenizer.eos_token  # 패딩 토큰 설정
smollm_config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM2-360M", use_auth_token="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", output_hidden_states=True)
smollm_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-360M", use_auth_token="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", config=smollm_config)

bert_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-large-uncased')
bert_model = BertModel.from_pretrained('google-bert/bert-large-uncased')

roberta_tokenizer = RobertaTokenizer.from_pretrained('FacebookAI/roberta-large')
roberta_model = RobertaModel.from_pretrained('FacebookAI/roberta-large')

# Make sure all models are in evaluation mode and moved to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
smollm_model.eval().to(device)
bert_model.eval().to(device)
roberta_model.eval().to(device)

# Initialize the downstream model
class_num = 9  # For example, if you have 8 classes
SIGMA = 0.1  # SIGMA value for your downstream model
downstream_model = DownstreamModel(class_num, SIGMA).to(device)

model_load_path = '/content/drive/MyDrive/AiExpertCource/project/dataset/stackoverflow_model_weights.pth'

# 가중치 로드
downstream_model.load_state_dict(torch.load(model_load_path, map_location=device))
downstream_model.eval()

def get_smollm_embedding(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Average the last 5 layers
        embedding = torch.stack([torch.mean(outputs.hidden_states[i], dim=1) for i in range(-1, -6, -1)], dim=1)
    return embedding

def get_bert_embedding(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Use pooler_output for BERT embeddings
        embedding = outputs.pooler_output
    return embedding

def get_roberta_embedding(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Use the first token ([CLS] token) representation
        embedding = outputs.last_hidden_state[:, 0, :]
    return embedding

def infer(text, downstream_model, device):
    # 각 모델로부터 임베딩을 추출
    smollm_emb = get_smollm_embedding(text, smollm_tokenizer, smollm_model, device)
    bert_emb = get_bert_embedding(text, bert_tokenizer, bert_model, device)
    roberta_emb = get_roberta_embedding(text, roberta_tokenizer, roberta_model, device)

    # 모든 임베딩을 float 타입으로 변환
    smollm_emb = smollm_emb.float()
    bert_emb = bert_emb.float()
    roberta_emb = roberta_emb.float()

    # Forward pass through the downstream model
    with torch.no_grad():
        prediction = downstream_model(smollm_emb, bert_emb, roberta_emb)
        # Apply sigmoid to obtain probabilities
        prediction = torch.sigmoid(prediction)

    return prediction


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  downstream_model.load_state_dict(torch.load(model_load_path, map_location=device))


In [32]:

# 예측된 클래스를 레이블로 맵핑하여 출력하는 함수
def print_predicted_labels(predicted_classes, labels):
    # 예측된 클래스 중 1인 레이블만 선택
    predicted_labels = [labels[i] for i, pred in enumerate(predicted_classes) if pred == 1]

    # 선택된 레이블 출력
    print("Predicted labels:", predicted_labels)


# 레이블 목록
labels = ['Algorithms', 'Backend', 'Data Science', 'Databases', 'Dev Tools', 'Frontend', 'Mobile', 'Systems', 'iOS/macOS']

# 주어진 텍스트를 토대로 예측을 5번 실행
texts = [
    "working of compareTo() method of Comparable interface            I have one Employee class and the requirement is to sort the objects using comparable interface. The output with this code is :The difference of this id and other id is..** 6  other id**1The difference of this id and other id is..** 3  other id**6The difference of this id and other id is..** 3  other id**6The difference of this id and other id is..** 3  other id**1The difference of this id and other id is..** 11  other id**3The difference of this id and other id is..** 11  other id**6",
]

# 각 텍스트에 대해 예측 실행 및 레이블 출력
for i, text in enumerate(texts):
    prediction = infer(text, downstream_model, device)

    # 예측 결과를 CPU로 이동
    prediction = prediction.cpu()
    predicted_classes = (prediction > 0.5).int().numpy()

    # 예측 결과 출력
    print("Predicted probabilities:", prediction.numpy())
    print("Predicted classes:", predicted_classes)


    print_predicted_labels(predicted_classes[0], labels)

Predicted probabilities: [[nan nan nan nan nan nan nan nan nan]]
Predicted classes: [[0 0 0 0 0 0 0 0 0]]
Predicted labels: []
