In [1]:
import os
# Process-wide environment configuration. This cell runs before the
# torch / transformers / datasets / wandb imports further down the notebook.
os.environ.update(
    {
        # Restrict this process to GPU index 0.
        "CUDA_VISIBLE_DEVICES": "0",
        # wandb service wait setting (presumably seconds -- TODO confirm).
        "WANDB__SERVICE_WAIT": "300",
        # Hugging Face cache locations on the shared filesystem.
        "HF_HOME": "/NS/llm-1/nobackup/afkhan/HF_CACHE/Misc",
        "HF_DATASETS_CACHE": "/NS/llm-1/nobackup/afkhan/HF_CACHE/Datasets",
        "TRANSFORMERS_CACHE": "/NS/llm-1/nobackup/afkhan/HF_CACHE/Models",
    }
)

In [2]:
# Model cache directory as read back from the environment (None when the
# variable is unset).
cache_dir = os.environ.get("TRANSFORMERS_CACHE")

In [3]:
# !pip install transformers datasets bitsandbytes deepspeed accelerate

In [4]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import bitsandbytes as bnb

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# !pip install wandb
import wandb
from utils import print_trainable_parameters

## Load Model

In [6]:
# Short tag used when naming runs/outputs, and the on-disk checkpoint that
# both the weights and the tokenizer are loaded from.
model_name = 'Llama-2-7b-hf'
model_path = "/NS/llm-1/nobackup/vnanda/llm_base_models/Llama-2-7b-hf"

# Load the causal-LM weights with the library's default settings.
model = AutoModelForCausalLM.from_pretrained(model_path)

# Load the matching tokenizer and reuse its EOS token for padding
# (presumably because this tokenizer ships without a pad token -- TODO confirm).
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.70s/it]


## Load Dataset and Preprocess

In [7]:
# `ds_name` is a filesystem/run-name friendly tag for the dataset; it is only
# used to build names, not to load anything.
ds_name = "weyaxi--sci-datasets"

# Load the "alpaca" configuration of the dataset repository.
ds = load_dataset(path="Weyaxi/sci-datasets", name="alpaca")

In [8]:
# Keep only a small subset of the training split for now.
# The requested count is clamped to the split size so this cell does not
# raise when the split holds fewer than NUM_TRAIN_EXAMPLES rows.
NUM_TRAIN_EXAMPLES = 100
ds['train'] = ds['train'].select(range(min(NUM_TRAIN_EXAMPLES, len(ds['train']))))

In [9]:
def merge_columns(example):
    """Add a single prompt-style 'text' field to one example.

    Reads the 'instruction' and 'output' entries of `example`, joins them
    with the '### Instruction: ' / ' ### Answer: ' markers, stores the result
    under 'text' on the same mapping, and returns that mapping.
    """
    prompt_part = '### Instruction: ' + example['instruction']
    answer_part = ' ### Answer: ' + example['output']
    example['text'] = prompt_part + answer_part
    return example

In [10]:
# Apply merge_columns row by row so every training example gains a 'text' field.
ds['train'] = ds['train'].map(function=merge_columns)

In [11]:
# Spot-check the first training row to confirm the merged 'text' field is present.
ds['train'][0]

{'instruction': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
 'output': 'Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of our knowledge, this is the first report of mitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD. Also, for the first time, we have shown the feasibility for the use of CsA in a whole plant system. Overall, our findings implicate the mitochondria as playing a critical and early role in developmentally regulated PCD in the lace plant.',
 'source': 'pubmedqa_filtered',
 'text': '### Instruction: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death? ### Answer: Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and hig

In [12]:
# Tokenize the merged 'text' column in batches; the tokenizer's outputs are
# added as new columns on every split of `ds`.
ds = ds.map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
)

## Setting Hyperparams

In [13]:
## Optimizer Mode

# Toggle between the bitsandbytes 8-bit AdamW and the regular torch AdamW.
use_adamw_bnb_8bit = True

## Naming

# Shared tag for the W&B run, the output directory and the log directory.
# The suffix is parenthesized on purpose: a conditional expression binds more
# loosely than `+`, so `a + b if flag else ""` evaluates to "" (not `a`) when
# the flag is False, which would blank out every name built from it.
RUN_TAG = f"{model_name}-{ds_name}-full-finetune" + (
    "-adamw_bnb_8bit" if use_adamw_bnb_8bit else ""
)

## Wandb Related

WANDB_PROJECT = "FSDP-Analysis"
WANDB_RUN_NAME = RUN_TAG

## Logging Related

REPORT_TO = "wandb"
OUTPUT_DIR = f"./output/{RUN_TAG}"
LOGGING_DIR = f"./logs/{RUN_TAG}"
LOGGING_STRATEGY = "steps"
LOGGING_STEPS = 10

## Training Duration Related

MAX_STEPS = 1000

## Optimizer Related

LEARNING_RATE = 2e-4
LR_SCHEDULER_TYPE = "linear"
WARMUP_RATIO = 0.1
# Follow the flag so the optimizer actually used matches the run/output names.
OPTIMIZER = 'adamw_bnb_8bit' if use_adamw_bnb_8bit else 'adamw_torch'

## Batch Related

PER_DEVICE_TRAIN_BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH_SIZE = 8

In [14]:
# Bundle the constants from the hyperparameter cell into one TrainingArguments
# object; every value here comes from a named constant defined there.
training_args = TrainingArguments(
    # --- logging ---
    output_dir=OUTPUT_DIR,
    logging_dir=LOGGING_DIR,
    logging_strategy=LOGGING_STRATEGY,
    logging_steps=LOGGING_STEPS,
    report_to=REPORT_TO,
    # --- training duration ---
    max_steps=MAX_STEPS,
    # --- optimizer / schedule ---
    optim=OPTIMIZER,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    # --- batch sizes ---
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
)

In [15]:
# Collator for language modeling with mlm=False, i.e. without masked-LM
# masking (matches the causal-LM model class loaded earlier).
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    # NOTE(review): the evaluation set is the training split itself.
    eval_dataset=ds['train'],
    data_collator=lm_collator,
)

max_steps is given, it will override any value given in num_train_epochs


In [16]:
# Start the Weights & Biases run explicitly so it carries the project and run
# name chosen in the hyperparameter cell, then record the training arguments
# in the run's config.
wandb.init(name=WANDB_RUN_NAME, project=WANDB_PROJECT)
wandb.config.update(training_args)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33maflah[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
# Run training with the Trainer built above; the returned TrainOutput is shown
# as this cell's result.
trainer.train()

[2024-07-16 12:18:28,956] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/NS/llm-1/nobackup/afkhan/anaconda3/envs/fsdp_env/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
10,2.0514
20,1.0746
30,0.6147
40,0.4442
50,0.3964
60,0.4497
70,0.4246
80,0.5027
90,0.5001
100,0.5949


TrainOutput(global_step=1000, training_loss=0.21245726135373116, metrics={'train_runtime': 2045.723, 'train_samples_per_second': 3.911, 'train_steps_per_second': 0.489, 'total_flos': 4.432174382363443e+16, 'train_loss': 0.21245726135373116, 'epoch': 76.92307692307692})

In [18]:
# Save Model
# Write the tokenizer next to the weights so the directory can be reloaded on
# its own (the tokenizer's pad token was set in this notebook, not on disk).
save_dir = f'Saves/{model_name}-{ds_name}-full-finetune'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)