## Imports

In [1]:
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, 
                          TrainingArguments, pipeline, logging)

from accelerate import Accelerator
from huggingface_hub import login
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer
import torch

accelerator = Accelerator()

  from .autonotebook import tqdm as notebook_tqdm
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Login to Hugging Face

In [2]:
login(token = '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')

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/anthony.rahbany/.cache/huggingface/token
Login successful


## Set Paths to Pretrained Model & Tokenizer

In [3]:
pretrained_model = "/blue/azare/anthony.rahbany/NLP/NLP_Cares/Code_Train/finetune1/LlamaModel/"
pretrained_tokenizer = "/blue/azare/anthony.rahbany/NLP/NLP_Cares/Code_Train/finetune1/LlamaTokenizer/"
cache_dir = "/blue/azare/anthony.rahbany/cache/"

## Load Model

In [4]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model,
    quantization_config=quant_config,
    device_map='auto',
    cache_dir=cache_dir
)
model.config.use_cache = False
model.config.pretraining_tp = 1
model = accelerator.prepare(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.51s/it]


## Load Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

## Load Dataset

In [6]:
dataset = load_dataset("dmacres/mimiciii-hospitalcourse-meta", cache_dir=cache_dir)

## Format Data for Supervised Learning

In [7]:
def formatting_supervised_data(data):
    output_text = []

    for i in range(len(data)):
        # ehr = data[i]['extractive_notes_summ']
        # label_summary = data[i]['target_text']
        
        text = f'''Below is an electronic health record for a patient, summarize it with simple terms.
            
            ### Input:
            {data['extractive_notes_summ'][i]}
            
            ### Response:
            {data['target_text'][i]}
            '''
        output_text.append(text)
    
    return output_text

In [8]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [9]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=200,
    logging_steps=5,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
)

In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    peft_config=peft_params,
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
    formatting_func=formatting_supervised_data,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Step,Training Loss
5,2.4127
10,2.3532
15,2.4453
20,2.3277
25,2.1893
30,2.0702
35,2.0028
40,2.0811
45,2.0375
50,2.1981


In [None]:
trainer.save_model("./LlamaModel")
trainer.tokenizer.save_pretrained("./LlamaTokenizer")