## finetuning
#### Hugging Face 上にある LLM をダウンロードして fine-tuning する

In [1]:
## データセットの準備
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from accelerate import Accelerator
from tqdm import tqdm
from utils import validate_and_correct_tnm_output
import pandas as pd
import bitsandbytes as bnb
from unsloth import FastLanguageModel 
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
# Maximum sequence length handed to the model loader and the SFT trainer below.
max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
# Get LAION dataset
# url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
# dataset = load_dataset("json", data_files = {"train" : url}, split = "train")


# Check whether a GPU is available and report the device that was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Hugging Face model id used throughout the notebook (the model and tokenizer
# themselves are loaded in a later cell).
model_name ="google/gemma-2-2b-jpn-it"  # alternative: "tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1"


def prepare_dataset(jsonl_file_path):
    """Load a JSONL file into a ``datasets.Dataset``.

    Each non-blank line must be a JSON object containing the keys
    ``"prompt"``, ``"completion"`` and ``"id"``; only those three keys are
    kept.

    Args:
        jsonl_file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A ``Dataset`` with the columns ``prompt``, ``completion`` and ``id``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    data = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line at end of file) are
                # not valid JSON and would make json.loads raise; skip them.
                continue
            entry = json.loads(line)
            data.append({
                "prompt": entry["prompt"],
                "completion": entry["completion"],
                "id": entry["id"],
            })
    return Dataset.from_list(data)

# Load the training and validation splits from their JSONL files.
jsonl_file_path_train = "../finetune_jsons/finetune_dataset_train.jsonl"
dataset_train = prepare_dataset(jsonl_file_path_train)
jsonl_file_path_val = "../finetune_jsons/finetune_dataset_val.jsonl"
dataset_val = prepare_dataset(jsonl_file_path_val)

# Read the reference text that is embedded verbatim into the prompt below.
with open('../tnm_prompt.txt', 'r', encoding='utf-8') as file:
    tnm_prompt_text = file.read()

# Instruction prefix (Japanese) prepended to every example: it sets the
# physician role, embeds the reference text read above, and asks for the
# answer in the exact form "T.. N.. M.." separated by single half-width
# spaces with no other output.
tnm_prompt = (
    "あなたは優秀な医師です。以下の文章に基づき肺癌に関して常に正しい判断ができます。"
    "以下の進行度分類に基づき、与えられた文章からTNM分類を選んでください。\n\n"
    f"{tnm_prompt_text}\n\n"
    "以下の文章を読んで、TNM分類を正確に選択し、必ず以下の形式で出力してください。TとN,NとMの間には半角スペースを挿入しそれ以外は何も出力しないでください。\n"
    "T<number>[optional_letter] N<number>[optional_letter] M<number>[optional_letter]\n\n"
)


def preprocess_function(examples):
    """Format a batch of examples as Gemma-style chat text.

    Args:
        examples: Batch mapping with the list-valued keys ``'prompt'``,
            ``'completion'`` and ``'id'``.

    Returns:
        A dict with three list-valued keys:
        ``'text'`` (instruction + prompt + completion, used for training),
        ``'id'`` (the ids, copied through) and
        ``'question'`` (instruction + prompt only, ending at the start of the
        model turn, used as the generation prefix at inference time).
    """
    prompts = examples['prompt']
    completions = examples['completion']

    # Prompt-only prefix: stops right where the model is expected to answer.
    question_texts = [
        f"<start_of_turn>user{tnm_prompt} {prompt} <end_of_turn>\n<start_of_turn>model "
        for prompt in prompts
    ]

    # Full conversation including the reference answer.
    full_texts = [
        f"<start_of_turn>user{tnm_prompt} {prompt} <end_of_turn>\n<start_of_turn>model {completion} <end_of_turn>"
        for prompt, completion in zip(prompts, completions)
    ]

    return {
        'text': full_texts,
        "id": list(examples['id']),
        "question": question_texts,
    }

# Training split: add the formatted columns, then drop every column except
# "text", which is the field the trainer is configured to read.
train_dataset = dataset_train.map(preprocess_function, batched=True)
train_dataset = train_dataset.remove_columns(
    ["prompt", "completion", "id", "question"]
)

# Validation split: keep all columns, because inference() reads
# "question", "id" and "completion" from each example.
eval_dataset = dataset_val.map(preprocess_function, batched=True)

# Show one processed training example as a sanity check.
print(train_dataset[0])


This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Using device: cuda


Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

{'text': '<start_of_turn>userあなたは優秀な医師です。以下の文章に基づき肺癌に関して常に正しい判断ができます。以下の進行度分類に基づき、与えられた文章からTNM分類を選んでください。\n\nT0\t原発腫瘍を認めない\nTis\t上皮内癌（carcinoma in situ）：肺野型の場合は、充実成分径0cmかつ病変全体径≦3cm\nT1\t腫瘍の充実成分径≦3cm、肺または臓側胸膜に覆われている、葉気管支より中枢への浸潤が気管支鏡上認められない（すなわち主気管支に及んでいない）\nT1mi\u3000微少浸潤性腺癌：部分充実型を示し、充実成分径≦0.5cmかつ病変全体径≦3cm \nT1a\t充実成分径≦1cmでかつTis・T1miには相当しない\nT1b\t充実成分径>1cmでかつ≦2cm\nT1c\t充実成分径>2cmでかつ≦3cm\nT2\t充実成分径>3cmでかつ≦5cm、または充実成分径≦3cmでも以下のいずれかであるもの\n\u3000\u3000主気管支に及ぶが気管分岐部には及ばない\n\u3000\u3000臓側胸膜に浸潤\n\u3000\u3000肺門まで連続する部分的または一側全体の無気肺か閉塞性肺炎がある\nT2a\t充実成分径>3cmでかつ≦4cm\nT2b\t充実成分径>4cmでかつ≦5cm\nT3\t充実成分径>5cmでかつ≦7cm、または充実成分径≦5cmでも以下のいずれかであるもの\n\u3000\u3000壁側胸膜、胸壁（superior sulcus tumorを含む）、横隔神経、心膜のいずれかに直接浸潤\n\u3000\u3000同一葉内の不連続な副腫瘍結節\nT4\t充実成分径＞7cm、または大きさを問わず横隔膜、縦隔、心臓、大血管、気管、反回神経、食道、椎体、気管分岐部への浸潤、あるいは同側の異なった肺葉内の副腫瘍結節\n\n\nN0\t所属リンパ節転移なし\nN1\tがんのある肺と同じ側の気管支周囲かつ/または同じ側の肺門、肺内のリンパ節への転移がある\nN2\tがんのある肺と同じ側の縦隔かつ/または気管分岐部より下のリンパ節への転移がある\nN3\tがんのある肺と反対側の縦隔、肺門、同じ側あるいは反対側の前斜角筋（首の筋肉）、鎖骨上窩（鎖骨の上のくぼみ）のリンパ節への転移がある\n\n\nM

In [2]:
## inferenceのコード

# 推論関数の定義
def inference(dataset, model, tokenizer, max_length=2048):
    """Greedily generate a short completion for every example in ``dataset``.

    Args:
        dataset: Iterable of examples; each example is read with the keys
            ``"question"`` (prompt text), ``"id"`` and ``"completion"``.
        model: Model object providing ``generate`` and ``device``.
        tokenizer: Tokenizer that is callable on a string and provides
            ``decode`` and ``eos_token_id``.
        max_length: Maximum number of prompt tokens; longer prompts are
            truncated by the tokenizer.

    Returns:
        A list with one dict per example, holding the keys ``"id"``,
        ``"input"``, ``"generated_output"`` and ``"expected_output"``.
    """
    results = []
    # Iterate over the dataset that was passed in. The previous version looped
    # over the global `eval_dataset`, silently ignoring this argument.
    for data in tqdm(dataset):
        # Prompt prefix that ends at the start of the model turn.
        input_text = data["question"]

        # Tokenize into tensors on the model's device, honouring `max_length`
        # (previously hard-coded to 2048 regardless of the argument).
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        ).to(model.device)

        # Run generation without tracking gradients.
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                # Pass the mask explicitly: the pad token may equal the EOS
                # token here, in which case generate() cannot infer the mask
                # and emits a warning.
                attention_mask=inputs.get("attention_mask"),
                max_new_tokens=10,
                # NOTE(review): early_stopping is documented as a beam-search
                # option; likely a no-op with greedy decoding - confirm.
                early_stopping=True,
                eos_token_id=tokenizer.eos_token_id,  # stop at the EOS token
                do_sample=False,  # greedy decoding, no sampling
            )
        input_length = len(inputs["input_ids"][0])

        # Decode only the newly generated tokens (everything after the prompt).
        output_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

        results.append({
            "id": data["id"],
            "input": input_text,
            "generated_output": output_text,
            "expected_output": data["completion"],
        })
    return results

In [None]:
# Load the base model in 4-bit through Unsloth. The tokenizer returned here
# is replaced by the AutoTokenizer created on the next statement.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)
# NOTE(review): `attn_implementation` looks like a model-loading option rather
# than a tokenizer option - confirm it has any effect here.
# NOTE(review): `add_eos_token=True` presumably appends EOS to every encoded
# text, including the prompts tokenized in inference() - verify this is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name,attn_implementation="eager", add_eos_token=True,)
# If no padding token is configured, fall back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
## Baseline check before fine-tuning
# Switch the model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.11.10: Fast Gemma2 patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.425 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear4bit(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear4bit(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layern

In [5]:

# Run inference with the model as loaded (before fine-tuning).
inference_results = inference(eval_dataset, model, tokenizer)

output_csv = f'../model_outputs/submission_{os.path.basename(model_name)}_pretrained.csv'
results = []

# Turn each generation into one (id, t, n, m) row; malformed outputs are
# reported and skipped.
for result in inference_results:
    try:
        # Validate / correct the raw generation with the helper from utils,
        # then split it on whitespace into its components.
        tnm_stage = validate_and_correct_tnm_output(result['generated_output'])
        tnm_parts = tnm_stage.split()
    except Exception as e:
        print(f"エラーが発生しました: {result['id']} - {e}")
        continue

    # Fewer than three components means the T/N/M triple is incomplete.
    if len(tnm_parts) < 3:
        print(f"ファイル {result['id']} のTNM分類の出力形式が正しくありません: {tnm_stage}")
        continue

    t_part, n_part, m_part = tnm_parts[:3]
    results.append({
        "id": result['id'],
        "t": t_part,
        "n": n_part,
        "m": m_part,
    })

# Convert the rows to a DataFrame and write them out as CSV.
results_df = pd.DataFrame(results)
results_df.to_csv(output_csv, index=False)

print(f"結果が{output_csv}に保存されました！")


  0%|          | 0/54 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
AUTOTUNE bmm(8x932x256, 8x256x932)
  bmm 0.0729 ms 100.0% 
  triton_bmm_9 0.0860 ms 84.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_10 0.0860 ms 84.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_13 0.0870 ms 83.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_14 0.0870 ms 83.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B

警告: 出力形式が不正です。修正します: T4 N2 M2
結果が../model_outputs/submission_gemma-2-2b-jpn-it_pretrained.csvに保存されました！





In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# Switch the model back to training mode and attach LoRA adapters to the
# attention and MLP projection layers listed in `target_modules`.
FastLanguageModel.for_training(model)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Collator that pads each batch; label padding uses -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,
    padding=True,  # enable dynamic padding
)

# Supervised fine-tuning on the "text" column of the training split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    max_seq_length = max_seq_length,
    data_collator = data_collator,
    dataset_text_field = "text",
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # NOTE(review): the directory is named "swallow70b" although
        # model_name is set to a Gemma model - confirm this path is intended.
        output_dir = "../finetuned_model/swallow70b",
        report_to = "none", # Use this for WandB etc
    ),
)

# NOTE(review): presumably restricts the loss to the model's response tokens,
# using the two markers below to split each example - confirm against the
# Unsloth documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<start_of_turn>user",  # marker that starts the instruction part
    response_part="<start_of_turn>model"  # marker that starts the response part
)


Unsloth: Already have LoRA adapters! We shall skip this step.


Map (num_proc=2):   0%|          | 0/108 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Map:   0%|          | 0/108 [00:00<?, ? examples/s]

In [9]:
# Run fine-tuning and keep the value returned by train() for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 108 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 20,766,720


  0%|          | 0/30 [00:00<?, ?it/s]

AUTOTUNE bmm(16x912x256, 16x256x912)
  bmm 0.1065 ms 100.0% 
  triton_bmm_52 0.1731 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_48 0.1823 ms 58.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_43 0.1864 ms 57.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_44 0.1925 ms 55.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_51 0.2079 ms 51.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
 

{'loss': 3.7611, 'grad_norm': 30.423423767089844, 'learning_rate': 4e-05, 'epoch': 0.07}
{'loss': 4.6505, 'grad_norm': 34.38920593261719, 'learning_rate': 8e-05, 'epoch': 0.15}
{'loss': 3.8344, 'grad_norm': 27.91519546508789, 'learning_rate': 0.00012, 'epoch': 0.22}
{'loss': 2.0216, 'grad_norm': 20.519309997558594, 'learning_rate': 0.00016, 'epoch': 0.3}
{'loss': 1.0725, 'grad_norm': 11.780780792236328, 'learning_rate': 0.0002, 'epoch': 0.37}
{'loss': 0.5595, 'grad_norm': 6.501875877380371, 'learning_rate': 0.000192, 'epoch': 0.44}
{'loss': 0.7463, 'grad_norm': 9.235169410705566, 'learning_rate': 0.00018400000000000003, 'epoch': 0.52}
{'loss': 0.2783, 'grad_norm': 1.8145989179611206, 'learning_rate': 0.00017600000000000002, 'epoch': 0.59}
{'loss': 0.4445, 'grad_norm': 3.174177646636963, 'learning_rate': 0.000168, 'epoch': 0.67}
{'loss': 0.3669, 'grad_norm': 1.3657704591751099, 'learning_rate': 0.00016, 'epoch': 0.74}
{'loss': 0.2266, 'grad_norm': 1.1856670379638672, 'learning_rate': 0.

In [10]:

# Optional: load the model and tokenizer from a saved checkpoint instead of
# reusing the in-memory model that was just trained.
# model_path = "../finetuned_model/checkpoint-300"  # directory holding the trained model
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     max_seq_length = max_seq_length,
#     dtype = None,
#     load_in_4bit = True,
# )

# Move the model to the device (disabled).
#model = model.to("cuda" if torch.cuda.is_available() else "cpu")
# Switch the fine-tuned model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)
# Run inference on the validation split.
inference_results = inference(eval_dataset, model, tokenizer)

100%|██████████| 54/54 [00:20<00:00,  2.69it/s]


In [11]:

# モデルとトークナイザのロード
# model_path = "../finetuned_model/checkpoint-300"  # トレーニング済みモデルが保存されているディレクトリ
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     max_seq_length = max_seq_length,
#     dtype = None,
#     load_in_4bit = True,
# )


output_csv = f'../model_outputs/submission_{os.path.basename(model_name)}_finetuned.csv'
results = []

# Turn each generation into one (id, t, n, m) row; malformed outputs are
# reported and skipped.
for result in inference_results:
    try:
        # Validate / correct the raw generation with the helper from utils,
        # then split it on whitespace into its components.
        tnm_stage = validate_and_correct_tnm_output(result['generated_output'])
        tnm_parts = tnm_stage.split()
    except Exception as e:
        print(f"エラーが発生しました: {result['id']} - {e}")
        continue

    # Fewer than three components means the T/N/M triple is incomplete.
    if len(tnm_parts) < 3:
        print(f"ファイル {result['id']} のTNM分類の出力形式が正しくありません: {tnm_stage}")
        continue

    t_part, n_part, m_part = tnm_parts[:3]
    results.append({
        "id": result['id'],
        "t": t_part,
        "n": n_part,
        "m": m_part,
    })

# Convert the rows to a DataFrame and write them out as CSV.
results_df = pd.DataFrame(results)
results_df.to_csv(output_csv, index=False)

print(f"結果が{output_csv}に保存されました！")
    


結果が../model_outputs/submission_gemma-2-2b-jpn-it_finetuned.csvに保存されました！
