## finetuning
#### huggingface上にあるLLMをinstallしてきてfinetuning

In [None]:
## データセットの準備
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from accelerate import Accelerator
from tqdm import tqdm


# GPUの確認
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# モデルとトークナイザーのロード
model_name = "pfnet/Llama3-Preferred-MedSwallow-70B"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def prepare_dataset(jsonl_file_path):
    data = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            entry = json.loads(line.strip())
            data.append({
                "prompt": entry["prompt"],
                "completion": entry["completion"]
            })
    return Dataset.from_list(data)

# データセットのロード
jsonl_file_path_train = "../finetune_jsons/finetune_dataset_train.jsonl"
dataset_train = prepare_dataset(jsonl_file_path_train)
jsonl_file_path_val = "../finetune_jsons/finetune_dataset_val.jsonl"
dataset_val = prepare_dataset(jsonl_file_path_val)

with open('../tnm_prompt.txt', 'r', encoding='utf-8') as file:
    tnm_prompt_text = file.read()

tnm_prompt_text_base = (
    "あなたは優秀な医師です。以下の文章に基づき肺癌に関して常に正しい判断ができます。"
    "進行度分類は以下のTNM第８版に準拠しています。何も言わずに以下を覚え、与えられた文章からよく考えてTNM分類を選んでください。\n\n"
)
tnm_prompt = (
    f"{tnm_prompt_text_base}\n\n"
    f"{tnm_prompt_text}\n\n"
    "以下の文章を読んで、TNM分類を正確に選択し、必ず以下の形式で出力してください：\n"
    "T<number>[optional_letter] N<number>[optional_letter] M<number>[optional_letter]\n\n"
)
tokenizer.pad_token = tokenizer.eos_token

    

def preprocess_function(examples):
    inputs = [f"{tnm_prompt}{prompt} 出力：" for prompt in examples['prompt']]
    targets = [completion for completion in examples['completion']]
    
    # トークナイズ (入力)
    model_inputs = tokenizer(inputs, truncation=True, padding=False)
    
    # トークナイズ (ラベル)
    labels = tokenizer(targets, truncation=True, padding=False)
    
    # -100 をラベルのパディングに設定
    labels["input_ids"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    
    # 元のテキストも保持
    model_inputs["original_inputs"] = inputs
    model_inputs["original_targets"] = targets
    model_inputs["text"] = inputs
    
    return model_inputs

# 元のテキストを保持しつつデータセットを生成
train_dataset = dataset_train.map(preprocess_function, batched=True)
eval_dataset = dataset_val.map(preprocess_function, batched=True)



##sample output
print(train_dataset[0])




Using device: cuda


Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/54 [00:00<?, ? examples/s]

{'prompt': '左肺上葉に長径 12cm の腫瘤を認めます。弓部大動脈や肺動脈浸潤が疑われます。\nT4 と考えます。\n左肺門や縦隔リンパ節#6 の腫大が見られます。転移が疑われます。N2 と考えます。\n胸水は認めません。\n左第 3 肋骨は腫瘤による骨破壊が見られます。他撮像範囲の骨に遠隔転移を疑う所見は指摘できません。\n撮像範囲の腹部に有意所見は指摘できません。', 'completion': 'T4 N2 M0', 'input_ids': [128000, 30591, 112568, 15682, 104622, 106241, 26854, 103242, 104599, 38641, 1811, 88852, 16144, 83125, 20230, 75146, 109669, 50834, 57942, 118, 23706, 234, 114556, 39926, 40053, 20230, 37656, 102800, 122225, 29295, 125545, 1811, 116748, 27479, 17620, 104770, 15682, 88852, 16144, 30271, 44, 30537, 66115, 41401, 20230, 107253, 26955, 254, 127145, 1811, 99849, 32977, 117952, 101860, 20230, 88852, 30512, 112854, 58942, 5486, 58318, 58942, 109115, 83125, 55031, 114652, 78698, 107290, 30271, 44, 17620, 104770, 30512, 102647, 104127, 72315, 19066, 51, 15, 197, 53229, 102404, 101921, 104, 114431, 235, 30512, 104736, 62004, 100604, 198, 51, 285, 197, 17905, 105871, 32943, 100204, 107106, 104328, 100604, 29295, 25827, 10110, 7063, 18595, 7942, 304, 10109, 123407, 57

In [None]:
## inferenceのコード

# 推論関数の定義
def inference(dataset, model, tokenizer, max_length=2048):
    results = []
    for data in tqdm(dataset):
        # 入力テキストを取得
        input_text = data["original_inputs"]
        
        # トークナイズ (モデルが理解できる形式に変換)
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(model.device)
        
        # モデルの推論
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                max_length=max_length,  # 最大出力長
                early_stopping=True,  # 早期終了を有効化
                eos_token_id=tokenizer.eos_token_id,  # EOSトークンを設定
                do_sample=True,  # サンプリングを有効化
                top_k=50,  # 上位kトークンから選択
                top_p=0.95  # トークンの確率質量
            )

        
        # デコード (トークン列をテキストに変換)
        output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # 結果を保存
        results.append({
            "input": input_text,
            "generated_output": output_text,
            "expected_output": data["original_targets"]
        })
    return results

In [2]:
from unsloth import FastLanguageModel 
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any!
# Get LAION dataset
# url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
# dataset = load_dataset("json", data_files = {"train" : url}, split = "train")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.425 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards:   0%|          | 0/30 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

pfnet/Llama3-Preferred-MedSwallow-70B does not have a padding token! Will use pad_token = <|reserved_special_token_250|>.


In [None]:
## finetune前の確認

# 推論実行
inference_results = inference(eval_dataset[:3], model, tokenizer)

# 結果を表示
for i, result in enumerate(inference_results):
    print(f"Sample {i+1}:")
    print(f"Input: {result['input']}")
    print(f"Generated Output: {result['generated_output']}")
    print(f"Expected Output: {result['expected_output']}")
    print("----")

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,
    padding=True,  # 動的パディングを有効にする
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = data_collator,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

trainer = train_on_responses_only(
    trainer,
    instruction_part="あなたは優秀な医師です。以下の文章に基づき肺癌に関して常に正しい判断ができます。",  # インストラクション部分を指定
    response_part="出力："  # レスポンス部分を指定
)


Unsloth 2024.11.10 patched 80 layers with 80 QKV layers, 80 O layers and 80 MLP layers.


In [5]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 108 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 207,093,760


  0%|          | 0/60 [00:00<?, ?it/s]

{'loss': 1.8381, 'grad_norm': 0.4072749614715576, 'learning_rate': 4e-05, 'epoch': 0.07}
{'loss': 1.7384, 'grad_norm': 0.36444002389907837, 'learning_rate': 8e-05, 'epoch': 0.15}
{'loss': 1.7724, 'grad_norm': 0.3491908013820648, 'learning_rate': 0.00012, 'epoch': 0.22}
{'loss': 1.7671, 'grad_norm': 0.5786504745483398, 'learning_rate': 0.00016, 'epoch': 0.3}
{'loss': 1.6427, 'grad_norm': 0.4702806770801544, 'learning_rate': 0.0002, 'epoch': 0.37}
{'loss': 1.5077, 'grad_norm': 0.48127204179763794, 'learning_rate': 0.00019636363636363636, 'epoch': 0.44}
{'loss': 1.3497, 'grad_norm': 0.6133865714073181, 'learning_rate': 0.00019272727272727274, 'epoch': 0.52}
{'loss': 1.349, 'grad_norm': 3.6691014766693115, 'learning_rate': 0.0001890909090909091, 'epoch': 0.59}
{'loss': 1.1671, 'grad_norm': 0.5333468914031982, 'learning_rate': 0.00018545454545454545, 'epoch': 0.67}
{'loss': 1.1274, 'grad_norm': 0.4787834882736206, 'learning_rate': 0.00018181818181818183, 'epoch': 0.74}
{'loss': 1.1379, 'gra

In [None]:
from transformers import AutoTokenizer
import torch
# モデルとトークナイザーのロード
tokenizer = AutoTokenizer.from_pretrained(model_name)
# モデルの推論モード設定
model = FastLanguageModel.for_inference(model)

In [None]:


# 推論実行
inference_results = inference(eval_dataset, model, tokenizer)

# 結果を表示
for i, result in enumerate(inference_results):
    print(f"Sample {i+1}:")
    print(f"Input: {result['input']}")
    print(f"Generated Output: {result['generated_output']}")
    print(f"Expected Output: {result['expected_output']}")
    print("----")
    
