In [None]:
# Mount Google Drive inside the Colab VM so that later cells can read files
# under /content/drive (the training CSV is loaded from there).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
! pip install bitsandbytes wandb peft accelerate

In [None]:
# Interactive Hugging Face login widget. The Trainer further down is configured
# with push_to_hub=True, so presumably the token entered here needs write
# access -- TODO confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
import wandb

from transformers import AutoTokenizer, AdamW, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm


In [None]:
# Load the raw training table from the mounted Drive. The 'Question' and
# 'Answer' columns of this frame are combined into a single text column below.
file_path = '/content/drive/MyDrive/open/train.csv'
train = pd.read_csv(file_path)

In [None]:
# Build one training string per row: "질문: <Question>\n답변:<Answer>".
# The combined text is stored on `train` as the '내용' column, and `data` is a
# single-column DataFrame holding just that text.
question_part = '질문: ' + train['Question']
answer_part = '답변:' + train['Answer']
train['내용'] = question_part + '\n' + answer_part
data = train['내용'].to_frame(name='내용')

In [None]:
# Preview the first rows of the combined question/answer text.
data.head()

Unnamed: 0,내용
0,질문: 2024년 중앙정부 재정체계는 어떻게 구성되어 있나요?\n답변:2024년 중...
1,질문: 2024년 중앙정부의 예산 지출은 어떻게 구성되어 있나요?\n답변:2024년...
2,질문: 기금이 예산과 다른 점은?\n답변:기금은 예산과 구분되는 재정수단으로서 재정...
3,"질문: 일반회계, 특별회계, 기금 간의 차이점은 무엇인가요?\n답변:일반회계는 특정..."
4,"질문: 2024년 총수입은 얼마이며, 예산수입과 기금수입은 각각 몇 조원인가요?\n..."


In [None]:
# Show the row count, non-null count and dtype of the text column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   내용      496 non-null    object
dtypes: object(1)
memory usage: 4.0+ KB


In [None]:
model_id = "yanolja/EEVE-Korean-Instruct-10.8B-v1.0"

# Load the base model's tokenizer, forcing '</s>' as the EOS token, and reuse
# EOS as the padding token so that padding='max_length' below has a pad id.
tokenizer = AutoTokenizer.from_pretrained(model_id, eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token

# Every example is padded or truncated to exactly this many tokens.
max_length = 512

# NOTE(review): no EOS token is appended to the text and pad == EOS, so the
# model may never receive an explicit end-of-answer target -- confirm how the
# data collator treats pad tokens when it builds the labels.
formatted_data = []

# Iterate the text column directly instead of data.iterrows(): it avoids
# building a Series per row, and passing total= lets tqdm render a real
# progress bar rather than a bare iteration counter.
for input_text in tqdm(data['내용'], total=len(data)):
  # return_tensors='pt' yields a 2-D tensor with a leading batch dimension of
  # 1; the per-example tensors are concatenated along dim 0 in the next cell.
  input_ids = tokenizer.encode(
      input_text,
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=max_length,
  )
  formatted_data.append(input_ids)
print('Done')

496it [00:00, 1704.20it/s]

Done





In [None]:
# Concatenate the per-example id tensors along dim 0 into one tensor.
# The name `formatted_data` is rebound from a list to a tensor here, so the
# guard keeps the cell idempotent: re-running it after the concatenation has
# already happened leaves the tensor untouched instead of calling torch.cat on
# something that is no longer a list of tensors.
if not isinstance(formatted_data, torch.Tensor):
    formatted_data = torch.cat(formatted_data, dim=0)

In [None]:
# Quantization settings: 4-bit NF4 weights with double quantization and a
# float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base causal LM with the quantization config above.
# device_map={"": 0} assigns the root module (i.e. the whole model) to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

# Enable gradient checkpointing, then let PEFT prepare the quantized model
# for training before the LoRA adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over all parameters
    and over those with ``requires_grad`` set, and prints both totals together
    with the trainable share as a percentage.
    """
    # (element count, is trainable) for every parameter, gathered in one pass.
    sizes = [(p.numel(), p.requires_grad) for _name, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    summary = (
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    print(summary)


# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias training.
# The original list named "up_proj" twice and never named "v_proj", so the
# value projection received no adapter; the duplicate is replaced with
# "v_proj" so all four attention projections and all three MLP projections
# are adapted. This raises the trainable-parameter count printed below.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared base model with the LoRA adapters and report how many
# parameters will actually be trained.
model = get_peft_model(model, config)
print_trainable_parameters(model)

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 58982400 || all params: 5629415424 || trainable%: 1.0477535509022686


In [None]:
import os

# Export TORCH_USE_CUDA_DSA=1 into this process's environment.
# NOTE(review): presumably meant to turn on CUDA device-side assertions for
# debugging; PyTorch's error text describes this as a compile-time option, so
# confirm that setting it as an environment variable here has any effect.
os.environ.update({'TORCH_USE_CUDA_DSA': '1'})

In [None]:
# Fine-tune the LoRA-wrapped model on the tokenized question/answer tensor.
trainer = transformers.Trainer(
    model=model,
    train_dataset=formatted_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=10,
        gradient_accumulation_steps=1,
        learning_rate=1e-4,
        fp16=True,
        # Was 500, which is more optimizer steps than the whole run performs,
        # so the training-loss table stayed empty. Log every 10 steps instead.
        logging_steps=10,
        # model_id contains a '/', so this path is a nested directory
        # ("yanolja/<name>_capstone_design").
        output_dir=f"{model_id}_capstone_design",
        optim="paged_adamw_8bit",
        push_to_hub=True,
        num_train_epochs=3,
    ),
    # mlm=False selects the causal-LM collator mode (no masked-LM masking).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# Turn off the generation KV cache on the model config before training.
model.config.use_cache = False
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss


TrainOutput(global_step=120, training_loss=1.0766277313232422, metrics={'train_runtime': 465.165, 'train_samples_per_second': 2.554, 'train_steps_per_second': 0.258, 'total_flos': 3.903592848791962e+16, 'train_loss': 1.0766277313232422, 'epoch': 3.0})