In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024
dtype = torch.float16
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Saxo/Linkbricks-Horizon-AI-Korean-llama-3.1-sft-dpo-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    device_map = "auto",
    # trust_remote_code=True
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.6: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Device supports bfloat16 but you selected float16. Will change to bfloat16.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Saxo/Linkbricks-Horizon-AI-Korean-llama-3.1-sft-dpo-8B does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

Unsloth 2025.1.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
from datasets import Dataset
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')
dataset = Dataset.from_pandas(train)
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [5]:
def create_prompt(input, output):
    prompt = (
        "You are a helpful assistant specializing in restoring obfuscated Korean reviews. "
        "Your task is to transform the given obfuscated Korean review into a clear, correct, "
        "and natural-sounding Korean review that reflects its original meaning. "
        "Below is an example of an obfuscated Korean review and its restored form:\n\n"
        f"Input: {input}\n"
        f"Output: {output}\n"
    )
    return prompt

def format_chat_template(row):
    prompt = create_prompt(row["input"], row["output"])
    tokens = tokenizer.encode(prompt, truncation=True, max_length=512)
    row["input_ids"] = tokens
    return row


In [6]:
train_dataset = train_dataset.map(format_chat_template, batched=False, num_proc=4)
test_dataset = test_dataset.map(format_chat_template, batched=False, num_proc=4)

Map (num_proc=4):   0%|          | 0/10136 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1127 [00:00<?, ? examples/s]

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset= test_dataset,
    formatting_func = lambda x: x["input_ids"],
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 4,
        gradient_accumulation_steps = 8,
        eval_strategy="steps",
        eval_steps=50,
        warmup_steps = 10,
        num_train_epochs = 2, # Set this for 1 full training run.
        # max_steps = 200,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 50,
        optim = "paged_adamw_32bit",
        # weight_decay = 0.01,
        # lr_scheduler_type = "linear",
        seed = 3407,
        group_by_length=True,
        output_dir = "outputs",
        report_to = "none",
    ),
)

In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,136 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 632
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,2.28,2.103369
100,1.8111,1.896138
150,1.6455,1.778842
200,1.5611,1.725237
250,1.5501,1.603247
300,1.5038,1.553254
350,1.6052,1.455233
400,1.385,1.440464
450,1.3745,1.420086
500,1.3569,1.407524


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead

In [9]:
ADAPTER_MODEL = "/content/drive/MyDrive/llama-3.1-8b-dacon"

trainer.model.save_pretrained(ADAPTER_MODEL)

# 모델 병합 및 허깅페이스 모델 업로드

In [10]:
model.save_pretrained_merged("model1", tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged("llama-3.1-8b-dacon", tokenizer, save_method = "merged_16bit", token = "hf_")

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 16.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 31.33 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 81%|████████▏ | 26/32 [00:01<00:00, 18.52it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:07<00:00,  4.44it/s]


Unsloth: Saving tokenizer... Done.
Done.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 31.33 out of 52.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:06<00:00,  4.62it/s]


Unsloth: Saving to organization with address bigdefence/llama-3.1-8b-dacon
Unsloth: Saving tokenizer... Done.
Unsloth: Saving to organization with address bigdefence/llama-3.1-8b-dacon
Unsloth: Uploading all files... Please wait...


  0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/llama-3.1-8b-dacon


# 추론

In [35]:
import pandas as pd

samples = []
train = pd.read_csv('/content/drive/MyDrive/train.csv')
test = pd.read_csv('/content/drive/MyDrive/test.csv')

for i in range(10):
    sample = f"input : {train['input'][i]} \n output : {train['output'][i]}"
    samples.append(sample)

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 모델과 토크나이저 불러오기
model_id = "bigdefence/llama-3.1-8b-dacon"  # 모델 ID를 여기에 입력
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token  # pad_token 설정
tokenizer.padding_side = 'right'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [5]:
def remove_repeated_phrases(text):
    phrases = text.split(" ")
    seen = set()
    result = []
    for phrase in phrases:
        if phrase not in seen:
            result.append(phrase)
            seen.add(phrase)
    result[0] = result[0].replace("model\n", "")
    return " ".join(result)

In [41]:
restored_reviews = []

for index, row in test.iterrows():
    query = row['input']
    system_prompt = f"You are a helpful assistant specializing in restoring obfuscated Korean reviews. \
Your task is to transform the given obfuscated Korean review into a clear, correct,\
and natural-sounding Korean review that reflects its original meaning.\
Below are examples of obfuscated Korean reviews and their restored forms:\n\n \
Example, {samples[0]} \n {samples[1]} \n {samples[2]} \n {samples[3]} \n {samples[4]} \
Spacing and word length in the output must be restored to the same as in the input.\
Do not provide any description. Print only in Korean."

    messages = [
        {"role": "user", "content": '{}\nInput:{}\nOutput:'.format(system_prompt, query)}
    ]

    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to("cuda")

    outputs = model.generate(
        **input_ids,
        max_new_tokens=len(query),
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleanup_keywords = ["Input:", "input:", "Output:", "output:","{",'Example','example']
    result = generated_text.split("assistant")[-1].strip()

# Remove any unwanted keywords from the result
    for keyword in cleanup_keywords:
        if keyword in result:
            result = result.split(keyword)[0].strip()
        result = remove_repeated_phrases(result)

    restored_reviews.append(result)
    print(index,result)



0 너무너무 만족스러운 호텔이에요. 부산에 오면 꼭 추천하고 싶은 곳이에요. 최고입니다! ㅎㅎ 다음에 또 올 것 같아요.
1 프론트가 없고, 조식도 없으며, 일반 입주민들이 사이트에 있어 호텔점처럼 관리가 잘 안 되는 느낌이네요. 가성비가 떨어져요.
2 진차 불친절해요. 살면서 머물렀던 호텔 중에 최악이었습니다. 직원이진 사장인지 체크인할 때부터 친절함 1도 없었구요. 3인 예약하고 추가 금액까지 결제해하는데 수건 추가요청드리니 큰정상 4개밖에 제공이 안 되다네요. 2시간 정도 웨스트하고 다시 들어갈 때 몇 호신나, 2분 예약하지 않았나 물어보셨어요(체크인할 때와 같은 분). 추가금 결제했고 갔을 거제해 넷에 문장도 보여드렸는데도 갔다버흠 번인지 걸쳐한 시간, 분, 초까지 물어보시더라구요. 개처한 엉수중 직원 짱짱신 다음, 호수 잠옷 줘겨다 한마디 하셨어요. 싸가지 한 마디 없으셨구요. 옆이까 없고 상당히 불쾌했습니다. 그리고 냉장고에서 생선 비린내 나요. 물에서도 비린맛 나요…
3 뷰 맛집~~ 그런데 방음이 미흡하네요. 층간 소음과 파고니가 잇중장이 아니라서 밤에 파도 소리, 아침에 깜맘기인지 계속 울어서 잠을 못 잤어요ㅠ 뷰는 너무 좋아요~~~
4 방 상태는 진짜 폐휴 지점인데 전망은 좋아요. 보일러가 아주 찬자하게 돌아서 추웠어요. 화장실에 비닐만 있어서 샴푸 같은 거 다 가지고 왔어 해요.
5 광안대교 뷰 하나만 보고 갔는데, 객실도 넓고 나름 깨끗했습니다! 체크인할 때 일부 해운대해변이 있었는데, 직원분이 친절하게 설명해주셨습니다! 주차를 할 수 있다는 점은 좋았지만, 주차 관리인이 불친절해서 아쉽습니다.
6 광안대교가 잘 보여 숙소에서 편안하게 뷰를 감상할 수 있어서 좋았습니다. 객실이 전체적으로 화이트 톤이라서 사진을 찍으면 더 화사하게 나오고, 입구에서부터 신발을 빼고 들어가서 더욱더 청결한 느낌을 받았습니다. 화장실, 샤워실, 세면대가 각각 분리되어 있는 것도 인상 깊었습니다. 직원분들도 친절하시고, 웰컴 키트, 조식 꿀밥이 기대도 감사합니다!

KeyboardInterrupt: 

In [None]:
# 결과 저장
submission = pd.DataFrame({
    "input": test_data["input"],
    "output": restored_reviews
})
submission.to_csv('./submission.csv', index=False, encoding='utf-8-sig')

print("추론이 완료되었습니다. 결과가 submission.csv에 저장되었습니다.")

In [None]:
submission = pd.read_csv('./sample_submission.csv', encoding = 'utf-8-sig')

In [None]:
submission['output'] = restored_reviews

In [None]:
submission.to_csv('./submission.csv', index = False, encoding = 'utf-8-sig')