## finetuning
#### Hugging Face Hub 上の LLM をダウンロードしてファインチューニングする

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from accelerate import Accelerator
from tqdm import tqdm
from utils import validate_and_correct_tnm_output
import pandas as pd
import bitsandbytes as bnb
from unsloth import FastLanguageModel 
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
from utils import validate_and_correct_sub_output


This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
## データセットの準備
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from accelerate import Accelerator
from tqdm import tqdm
from utils import validate_and_correct_tnm_output
import pandas as pd
import bitsandbytes as bnb
from unsloth import FastLanguageModel 
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
from utils import validate_and_correct_sub_output
# Maximum sequence length (in tokens); passed to the model loader and to the SFT trainer below.
max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
# Example LAION dataset loading (disabled; not used in this notebook)
# url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
# dataset = load_dataset("json", data_files = {"train" : url}, split = "train")


# Check whether a GPU is available and report the device in use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Hugging Face model id of the base model that is loaded and fine-tuned
model_name ="tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.1"
# Other candidate model ids: "google/gemma-2-2b-jpn-it", "pfnet/Llama3-Preferred-MedSwallow-70B"

def prepare_dataset(dataset_train,label_train):
    """Build a ``Dataset`` with one record per sentence, including its labels.

    Args:
        dataset_train: DataFrame with ``id``, ``sentence_index`` and ``text``
            columns (one row per sentence).
        label_train: DataFrame with ``id``, ``sentence_index`` and the eight
            label columns listed in ``label_columns`` below.

    Returns:
        ``Dataset`` whose records contain ``prompt`` (the sentence),
        ``context`` (all sentences sharing the same ``id`` joined by a space),
        ``completion`` (the eight labels as a space-separated string), ``id``
        and ``sentence_index``.
    """
    label_columns = [
        "omittable", "measure", "extension", "atelectasis",
        "satellite", "lymphadenopathy", "pleural", "distant",
    ]

    # One space-joined string per id, attached to every sentence as "text_all".
    text_per_id = dataset_train.groupby('id')['text'].apply(' '.join)
    merged = dataset_train.merge(text_per_id, on='id', how='left', suffixes=('', '_all'))
    # Left join keeps every sentence even when no label row matches.
    merged = merged.merge(label_train, on=['id','sentence_index'], how='left')

    def _to_record(row):
        # Convert one merged row into the record layout used downstream.
        return {
            "prompt": row["text"],
            "context": row["text_all"],
            "completion": ' '.join(row[label_columns].astype('str').to_list()),
            "id": row["id"],
            "sentence_index": row["sentence_index"],
        }

    records = [_to_record(merged.iloc[position]) for position in range(len(merged))]
    return Dataset.from_list(records)

# Load the sub-task data: sentences.csv and label.csv for each split,
# then turn every split into a Dataset via prepare_dataset.

# --- train split ---
sentences_path = '../radnlp_2024_train_val_test/ja/sub_task/train/sentences.csv'
label_path = '../radnlp_2024_train_val_test/ja/sub_task/train/label.csv'
label_train = pd.read_csv(label_path)
dataset_train = prepare_dataset(pd.read_csv(sentences_path), label_train)

# --- validation split ---
sentences_path = '../radnlp_2024_train_val_test/ja/sub_task/val/sentences.csv'
label_path = '../radnlp_2024_train_val_test/ja/sub_task/val/label.csv'
label_val = pd.read_csv(label_path)
dataset_val = prepare_dataset(pd.read_csv(sentences_path), label_val)

# Text block that is embedded into every prompt by generate_sub_prompt
with open('../subtask_prompt.txt', 'r', encoding='utf-8') as file:
    subtask_prompt_text = file.read()

def generate_sub_prompt(subtask_prompt_text, context, sentence):
    """Compose the Japanese classification prompt for a single target sentence.

    Args:
        subtask_prompt_text: Text inserted right after the opening instruction.
        context: Full report text, shown under the "文章全体:" heading.
        sentence: Target sentence, shown under the "対象文:" heading.

    Returns:
        The prompt string, ending with "出力：" so the model continues with
        eight space-separated 0/1 flags.
    """
    topics = (
        "omittable", "measure", "extension", "atelectasis",
        "satellite", "lymphadenopathy", "pleural", "distant",
    )
    numbered_topics = "\n".join(
        f"{number}. {topic}" for number, topic in enumerate(topics, start=1)
    )

    # Sections are separated by exactly one blank line in the final prompt.
    sections = [
        "あなたは優秀な医師です。以下の文章に基づき肺癌に関して常に正しい判断ができます。"
        "以下の文章全体を考慮し、指定された対象文について、以下のどのラベルに該当するかを判断してください。",
        f"{subtask_prompt_text}",
        f"文章全体:\n{context}",
        f"対象文:\n{sentence}",
        "次のトピックに該当する場合は「1」、該当しない場合は「0」としてください。\n" + numbered_topics,
        "出力形式: 0または1をスペース区切りで出力してください。\n"
        "例: 1 0 0 0 0 0 0 0\n"
        "出力：",
    ]
    return "\n\n".join(sections)


def preprocess_function(examples):
    """Format a batch of records into training texts and inference prompts.

    Intended for ``Dataset.map(..., batched=True)``: ``examples`` maps column
    names to lists and must provide ``prompt``, ``context``, ``completion``
    and ``id``.

    Returns:
        dict with three list-valued columns, in this order:
        ``text`` (prompt followed by the completion and the closing turn
        marker), ``id`` (copied through) and ``question`` (prompt up to and
        including the opening of the model turn).
    """
    # Prompt part shared by the training text and the inference question.
    questions = [
        f"<start_of_turn>user{generate_sub_prompt(subtask_prompt_text, context, prompt)} <end_of_turn>\n"
        f"<start_of_turn>model "
        for prompt, context in zip(examples['prompt'], examples['context'])
    ]

    # Training text = question + expected answer + end-of-turn marker.
    full_texts = [
        f"{question}{completion} <end_of_turn>"
        for question, completion in zip(questions, examples['completion'])
    ]

    return {
        'text': full_texts,
        "id": list(examples['id']),
        "question": questions,
    }

# Training set: after formatting, drop every column except the generated "text" field.
train_dataset = dataset_train.map(preprocess_function, batched=True)
train_dataset = train_dataset.remove_columns(
    ["prompt", "completion", "id", "question", "context", "sentence_index"]
)

# Evaluation set: keep all columns (inference reads "question", "id",
# "sentence_index" and "completion" from it).
eval_dataset = dataset_val.map(preprocess_function, batched=True)

# Show one formatted training example
print(train_dataset[0])


Using device: cuda


Map:   0%|          | 0/1020 [00:00<?, ? examples/s]

Map:   0%|          | 0/451 [00:00<?, ? examples/s]

{'text': '<start_of_turn>userあなたは優秀な医師です。以下の文章に基づき肺癌に関して常に正しい判断ができます。以下の文章全体を考慮し、指定された対象文について、以下のどのラベルに該当するかを判断してください。\n\n(i) Omttable - 明らかに，陰性所見のみであるか，または肺癌のステージと無関係な内容である区間。ここでの「明らかに」とは，肺癌ステージング基準についての知識を用いなくても容易に判定できる程度を想定しています。\n(ii) Measure - Omittable に該当せず，かつ主に原発巣の大きさについて述べている区間。\n(iii) Estension - Omittable に該当せず，かつ原発巣の進展範囲について述べている区間。\n(iv) Atelectasis - Omittable に該当せず，かつ無気肺もしくは閉塞性肺炎を示している区間。\n(v) Satelliite - Omittable に該当せず，かつ原発巣とは離れた肺内悪性病変 (肺内転移や癌性リンパ管症など) を示している区間。\n(vi) Lymphadenopathy - Omittable に該当せず，かつ領域リンパ節腫大を示している区間。\n(vii) Pleural - Omittable に該当せず，かつ胸水，心嚢水，胸膜播種，心膜播種を示している区間。\n(viii) Distant - Omittable に該当せず，かつ遠隔転移を示している区間。\n\n文章全体:\n左上葉全体が無気肺になっています。 左上葉気管支は閉塞して造影  CT  で増強効果の乏しい 74mm  の腫瘤があります。 肺癌と考えます。 左肺門、同側縦隔リンパ節腫大しリンパ節転移と考えます。 気管右側にもリンパ節腫大があり、こちらもリンパ節転移を疑います。 左下葉気管支も腫瘍により浸潤あり、狭窄しています。 胸水貯留は認めません。 撮影範囲の腹部臓器に粗大な異常を認めません。\n\n対象文:\n左上葉全体が無気肺になっています。\n\n次のトピックに該当する場合は「1」、該当しない場合は「0」としてください。\n1. omittable\n2. measure\n3. extension\n4. atelectasis\n5. satellite\n6

In [3]:
## inferenceのコード

# 推論関数の定義
def inference(dataset, model, tokenizer, max_length=2048):
    """Generate a completion for every example in ``dataset``.

    Args:
        dataset: Iterable of records providing the keys ``question`` (prompt
            text), ``id``, ``sentence_index`` and ``completion`` (reference
            answer).
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum number of prompt tokens; longer prompts are
            truncated by the tokenizer.

    Returns:
        List of dicts with the keys ``id``, ``sentence_index``, ``input``,
        ``generated_output`` (newly generated text only) and
        ``expected_output``.
    """
    results = []
    # Iterate over the dataset that was passed in. The previous version looped
    # over the global eval_dataset and silently ignored this argument.
    for data in tqdm(dataset):
        # Prompt text for this example
        input_text = data["question"]

        # Tokenize, honouring the caller's max_length (previously hard-coded to 2048)
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        ).to(model.device)

        # Greedy decoding without gradient tracking
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                # Pass the mask explicitly so generate() does not have to guess
                # it (pad and EOS tokens may be identical for this tokenizer).
                attention_mask=inputs.get("attention_mask"),
                max_new_tokens=20,
                early_stopping=True,  # kept as-is; documented as a beam-search option
                eos_token_id=tokenizer.eos_token_id,
                do_sample=False,
            )
        input_length = inputs["input_ids"].shape[1]

        # Decode only the tokens generated after the prompt
        output_text = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

        results.append({
            "id": data["id"],
            "sentence_index": data["sentence_index"],
            "input": input_text,
            "generated_output": output_text,
            "expected_output": data["completion"]
        })
    return results

In [4]:
# Load the base model through Unsloth with 4-bit loading enabled; the call also returns a tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)
# Replace the tokenizer returned above with one loaded through AutoTokenizer.
# NOTE(review): attn_implementation looks like a model-loading option and presumably
# has no effect on a tokenizer; also confirm that add_eos_token=True is honoured by
# this tokenizer class and is intended for the inference prompts as well.
tokenizer = AutoTokenizer.from_pretrained(model_name,attn_implementation="eager", add_eos_token=True,)
# If no padding token is configured, fall back to the EOS token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
## Sanity check before fine-tuning
# Put the model into inference mode (as the last expression, its result is echoed by the notebook)
FastLanguageModel.for_inference(model)

`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192


==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.425 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192
`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): L

In [5]:
# Run inference with the base model (before fine-tuning)
inference_results = inference(eval_dataset, model, tokenizer)

output_csv = f'../model_outputs/sentence_submission_{os.path.basename(model_name)}_pretrained.csv'
label_names = [
    "omittable", "measure", "extension", "atelectasis",
    "satellite", "lymphadenopathy", "pleural", "distant",
]
results = []
# Parse each generated string into eight integer flags; rows that fail are reported and skipped
for result in inference_results:
    try:
        # Pass the raw generation through the helper from utils before parsing
        cleaned_output = validate_and_correct_sub_output(result['generated_output'])
        predictions = [int(token) for token in cleaned_output.split()]

        row = {
            "id": result['id'],
            "sentence_index": result['sentence_index'],
        }
        # Indexing (rather than zip) keeps the IndexError when fewer than eight flags are present
        for position, label_name in enumerate(label_names):
            row[label_name] = predictions[position]
        results.append(row)
    except Exception as e:
        print(f"エラーが発生しました: ID={result['id']}, sentence_index={result['sentence_index']} - {e}")
        continue

# Convert to a DataFrame and save it as CSV
results_df = pd.DataFrame(results)
results_df.to_csv(output_csv, index=False)

print(f"結果が{output_csv}に保存されました！")


100%|██████████| 451/451 [06:18<00:00,  1.19it/s]

結果が../model_outputs/sentence_submission_Llama-3.1-Swallow-8B-Instruct-v0.1_pretrained.csvに保存されました！





In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# Switch the model back to training mode (an earlier cell put it into inference mode).
FastLanguageModel.for_training(model)
# Wrap the model with LoRA adapters on the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Collator that pads each batch; label positions are padded with -100
# (presumably the index ignored by the loss — confirm against the Transformers docs).
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    label_pad_token_id=-100,
    padding=True,  # enable dynamic padding
)

# Supervised fine-tuning on the "text" column of train_dataset.
# Effective batch size is 2 x 4 = 8, so max_steps = 90 covers 720 sequences.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    max_seq_length = max_seq_length,
    data_collator = data_collator,
    dataset_text_field = "text",
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 90,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "../finetuned_model/swallow8b",
        report_to = "none", # Use this for WandB etc
    ),
)

# NOTE(review): per its name, train_on_responses_only presumably restricts the loss to the
# response part; the two markers below match the turn markers written by preprocess_function.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<start_of_turn>user",  # marker that starts the instruction part
    response_part="<start_of_turn>model"  # marker that starts the response part
)


Unsloth 2024.11.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Map (num_proc=2):   0%|          | 0/1020 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Map:   0%|          | 0/1020 [00:00<?, ? examples/s]

In [7]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,020 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 90
 "-____-"     Number of trainable parameters = 41,943,040


  0%|          | 0/90 [00:00<?, ?it/s]

{'loss': 0.3708, 'grad_norm': 1.2545955181121826, 'learning_rate': 4e-05, 'epoch': 0.01}
{'loss': 0.3236, 'grad_norm': 1.0530649423599243, 'learning_rate': 8e-05, 'epoch': 0.02}
{'loss': 0.3388, 'grad_norm': 1.117310881614685, 'learning_rate': 0.00012, 'epoch': 0.02}
{'loss': 0.2086, 'grad_norm': 1.027769684791565, 'learning_rate': 0.00016, 'epoch': 0.03}
{'loss': 0.1833, 'grad_norm': 0.8655968308448792, 'learning_rate': 0.0002, 'epoch': 0.04}
{'loss': 0.1, 'grad_norm': 0.6234148144721985, 'learning_rate': 0.00019764705882352942, 'epoch': 0.05}
{'loss': 0.0982, 'grad_norm': 0.45092466473579407, 'learning_rate': 0.00019529411764705883, 'epoch': 0.05}
{'loss': 0.0891, 'grad_norm': 0.8836435675621033, 'learning_rate': 0.00019294117647058825, 'epoch': 0.06}
{'loss': 0.0594, 'grad_norm': 0.3325469195842743, 'learning_rate': 0.00019058823529411766, 'epoch': 0.07}
{'loss': 0.0616, 'grad_norm': 0.25960540771484375, 'learning_rate': 0.00018823529411764707, 'epoch': 0.08}
{'loss': 0.1284, 'grad_

`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192


{'train_runtime': 568.7856, 'train_samples_per_second': 1.266, 'train_steps_per_second': 0.158, 'train_loss': 0.054532570640246074, 'epoch': 0.71}


In [8]:

# Load model and tokenizer from a saved checkpoint (disabled; the in-memory fine-tuned model is used instead)
# model_path = "../finetuned_model/checkpoint-300"  # directory containing the trained model
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     max_seq_length = max_seq_length,
#     dtype = None,
#     load_in_4bit = True,
# )

# Move the model to the device (disabled)
#model = model.to("cuda" if torch.cuda.is_available() else "cpu")
# Switch the fine-tuned model to inference mode
FastLanguageModel.for_inference(model)
# Run inference on the evaluation set
inference_results = inference(eval_dataset, model, tokenizer)

100%|██████████| 451/451 [07:14<00:00,  1.04it/s]


In [9]:

# モデルとトークナイザのロード
# model_path = "../finetuned_model/checkpoint-300"  # トレーニング済みモデルが保存されているディレクトリ
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     max_seq_length = max_seq_length,
#     dtype = None,
#     load_in_4bit = True,
# )

output_csv = f'../model_outputs/sentence_submission_{os.path.basename(model_name)}_finetuned.csv'
label_names = [
    "omittable", "measure", "extension", "atelectasis",
    "satellite", "lymphadenopathy", "pleural", "distant",
]
results = []
# Parse each generated string into eight integer flags; rows that fail are reported and skipped
for result in inference_results:
    try:
        # Pass the raw generation through the helper from utils before parsing
        cleaned_output = validate_and_correct_sub_output(result['generated_output'])
        predictions = [int(token) for token in cleaned_output.split()]

        row = {
            "id": result['id'],
            "sentence_index": result['sentence_index'],
        }
        # Indexing (rather than zip) keeps the IndexError when fewer than eight flags are present
        for position, label_name in enumerate(label_names):
            row[label_name] = predictions[position]
        results.append(row)
    except Exception as e:
        print(f"エラーが発生しました: ID={result['id']}, sentence_index={result['sentence_index']} - {e}")
        continue

# Convert to a DataFrame and save it as CSV
results_df = pd.DataFrame(results)
results_df.to_csv(output_csv, index=False)

print(f"結果が{output_csv}に保存されました！")


結果が../model_outputs/sentence_submission_Llama-3.1-Swallow-8B-Instruct-v0.1_finetuned.csvに保存されました！
