<a href="https://colab.research.google.com/github/arkwith7/RAG_LLM/blob/main/E5_finetune_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. 필요한 라이브러리 설치
!pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [9]:
# 2. Hugging Face와 Google Colab GPU 환경 설정
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch

In [10]:
# Google Colab에서 GPU 사용 설정 확인
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [11]:
# 3. 모델과 토크나이저 로드 (Multilingual E5 모델 불러오기)
model_name = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [12]:
# 4. 데이터셋 불러오기 및 전처리
# ko-triplet-v1.0 데이터셋 로드
dataset = load_dataset("nlpai-lab/ko-triplet-v1.0")

# 데이터셋에서 첫 100개 샘플만 선택
small_dataset = dataset['train'].select(range(100))

# Positive와 Negative 샘플을 각각 분리하여 텐서 길이 문제를 해결
def preprocess_function(examples):
    # Anchor-Positive 쌍의 입력 구성
    positive_inputs = tokenizer(examples['query'], examples['document'], truncation=True, padding='max_length', max_length=128)
    positive_inputs['labels'] = [1] * len(examples['query'])  # Positive 예시의 레이블 1

    # Anchor-Negative 쌍의 입력 구성
    negative_inputs = tokenizer(examples['query'], examples['hard_negative'], truncation=True, padding='max_length', max_length=128)
    negative_inputs['labels'] = [0] * len(examples['query'])  # Negative 예시의 레이블 0

    # Positive와 Negative 쌍을 분리하여 데이터셋을 구성
    combined_inputs = {
        key: positive_inputs[key] + negative_inputs[key] for key in positive_inputs.keys()
    }
    return combined_inputs

# 데이터셋에 전처리 적용
processed_dataset = small_dataset.map(preprocess_function, batched=True, remove_columns=small_dataset.column_names)

# 학습 및 검증 세트로 나누기 (80:20 비율)
train_size = int(0.8 * len(processed_dataset))
train_dataset = Dataset.from_dict(processed_dataset[:train_size])
valid_dataset = Dataset.from_dict(processed_dataset[train_size:])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
# 5. 파인튜닝을 위한 하이퍼파라미터 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    fp16=True,  # Mixed precision training
)



In [14]:
# 6. Trainer를 사용하여 파인튜닝 실행
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:
# 7. 파인튜닝 시작
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.6932,1.200281
2,0.6225,1.062329
3,0.6461,1.323364


TrainOutput(global_step=60, training_loss=0.6485682169596354, metrics={'train_runtime': 297.7406, 'train_samples_per_second': 1.612, 'train_steps_per_second': 0.202, 'total_flos': 111831763599360.0, 'train_loss': 0.6485682169596354, 'epoch': 3.0})

In [16]:
# 8. 모델 평가 (Validation 세트에서 평가)
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

Evaluation results: {'eval_loss': 1.3233642578125, 'eval_runtime': 0.3828, 'eval_samples_per_second': 104.487, 'eval_steps_per_second': 13.061, 'epoch': 3.0}


In [17]:
# 9. 모델 저장
trainer.save_model("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

('./fine-tuned-model/tokenizer_config.json',
 './fine-tuned-model/special_tokens_map.json',
 './fine-tuned-model/sentencepiece.bpe.model',
 './fine-tuned-model/added_tokens.json',
 './fine-tuned-model/tokenizer.json')

In [18]:
# 학습 결과와 평가 메트릭을 출력
print("Training completed and model saved.")

Training completed and model saved.
