# HuggingFace Analysis

In [1]:
import os
import json
from lib.alpaca_data import AlpacaDataset
from transformers import AutoTokenizer, AutoModelForCausalLM, get_cosine_schedule_with_warmup, default_data_collator
import datasets
import torch
from accelerate import Accelerator
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader
import deepspeed
import gc, psutil, threading
from tqdm import tqdm



[2024-01-24 18:43:43,929] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# DeepSpeed single-process bootstrap
# 1. Emulate the environment variables a distributed launcher would export,
#    describing a one-process job (rank 0, world size 1) on localhost.
#    `os` is already imported in the notebook's import cell.
os.environ.update(
    {
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': '9994',
        'RANK': "0",
        'LOCAL_RANK': "0",
        'WORLD_SIZE': "1",
    }
)

# 2. Kept for reference: fix local_rank if it wasn't passed (the HF Trainer passes it):
# if args.local_rank == -1:
#     args.local_rank = 0

# 3. With the environment in place, initialise DeepSpeed's distributed backend.
deepspeed.init_distributed()

[2024-01-24 18:43:44,405] [INFO] [comm.py:616:init_distributed] cdb=None
[2024-01-24 18:43:44,406] [INFO] [comm.py:643:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [3]:
# Converting Bytes to Megabytes
def b2mb(x):
    """Convert a byte count to whole megabytes (1 MB = 2**20 bytes here).

    The quotient is computed with true division and then truncated toward
    zero by ``int``.
    """
    bytes_per_mb = 1 << 20
    megabytes = x / bytes_per_mb
    return int(megabytes)

class TorchTracemalloc:
    """Context manager that records CUDA and CPU memory usage of its ``with`` body.

    On entry it snapshots the currently allocated CUDA memory and the process
    RSS, and starts a background thread that busy-polls RSS to track its peak.
    On exit the following attributes are set (all converted with ``b2mb``):

    * ``used`` / ``peaked``         - CUDA allocation delta and peak delta since entry.
    * ``cpu_used`` / ``cpu_peaked`` - RSS delta and peak RSS delta since entry.

    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        gc.collect()
        torch.cuda.empty_cache()
        # Reset the peak gauge; reset_max_memory_allocated() is deprecated in
        # favour of reset_peak_memory_stats().
        torch.cuda.reset_peak_memory_stats()
        self.begin = torch.cuda.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Seed the peak before the sampler starts so it is always defined,
        # even if the thread has not been scheduled yet.
        self.cpu_peak = self.cpu_begin
        self.peak_monitoring = True
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        self._peak_monitor_thread.daemon = True
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """get resident set size memory for the current process"""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Poll RSS in a tight loop, keeping the maximum in ``cpu_peak``.

        Runs at least one sample, then stops once ``peak_monitoring`` is False.
        """
        # Keep a peak already seeded by __enter__; fall back to -1 otherwise.
        self.cpu_peak = getattr(self, "cpu_peak", -1)

        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        self.peak_monitoring = False
        # Wait for the sampler to observe the flag and finish, so that cpu_peak
        # is final when read below and the busy loop does not keep running.
        monitor_thread = getattr(self, "_peak_monitor_thread", None)
        if monitor_thread is not None:
            monitor_thread.join()

        gc.collect()
        torch.cuda.empty_cache()
        self.end = torch.cuda.memory_allocated()
        self.peak = torch.cuda.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")

In [4]:
# Accelerate handle used later for process synchronisation, `prepare()` and `backward()`.
accelerator = Accelerator()

In [5]:
# Directory holding the pretrained weights and tokenizer files.
# Set the LLAMA_WEIGHTS_PATH environment variable to point elsewhere; the
# fallback is the original machine-specific location.
LLAMA_WEIGHTS_PATH = os.environ.get('LLAMA_WEIGHTS_PATH', '/media/anique/Data/projects/llama-weights/llama2-7B')

In [6]:
# Move up to the parent directory, where the dataset file lives.
# Guarded so that re-running this cell does not climb another directory level.
if not os.path.exists('alpaca_data_cleaned.json'):
    os.chdir('../')

In [7]:
# Show the contents of the current working directory.
os.listdir()

['wandb',
 'evaluate.py',
 'len_dist.png',
 'scratch.py',
 'huggingface',
 'startpod',
 'wikitable_train.json',
 'lib',
 'determine_max_length.py',
 'requirements.txt',
 'compute_accuracy.py',
 '.vscode',
 'scripts',
 '.git',
 'train.sh',
 'parallama',
 '.gitignore',
 'merged_dataset_insta_4chan.json',
 'train.py',
 'merge_lora.py',
 'wikisql_lora',
 'phase2_params',
 'tests',
 'mypy.ini',
 'llama2-7B.pickle',
 'gpt2',
 'generate.py',
 'alpaca_data_cleaned.json',
 '.idea',
 '.github',
 'evaluation',
 'memory.prof',
 'README.md',
 'hf_generate.py',
 'hf_evaluate.py']

In [8]:
# Load the alpaca dataset (parsed JSON) from the working directory.
# The encoding is explicit so parsing does not depend on the platform default.
with open('alpaca_data_cleaned.json', 'r', encoding='utf-8') as f:
    alpaca_data = json.load(f)


In [9]:
# Load the tokenizer from the same directory as the model weights.
tokenizer = AutoTokenizer.from_pretrained(LLAMA_WEIGHTS_PATH)

In [10]:
# Build the project's AlpacaDataset from the JSON file; its items are indexed
# below as (prompt, response).
# NOTE(review): split='train' with split_percentage=0.8 presumably already keeps
# a training subset, and a further 80/20 split is applied later - confirm intended.
dataset = AlpacaDataset(path='alpaca_data_cleaned.json', split='train', tokenizer=tokenizer, split_percentage=0.8, alpaca_mix=0.0)

Processing data...


100%|██████████| 41553/41553 [00:12<00:00, 3456.91it/s]
0it [00:00, ?it/s]


## Create a Hugging Face `datasets`-compatible dataset

In [11]:
# Turn every dataset item into a record dict: element 0 is stored under
# 'prompt' and element 1 under 'response'.
list_dataset = [
    {'prompt': sample[0], 'response': sample[1]}
    for sample in dataset
]

In [12]:
# num_samples = len(list_dataset)
# train_perc = 0.8
# train_end_idx = int(train_perc * num_samples)
# list_dataset_train = list_dataset[:train_end_idx]
# list_dataset_val = list_dataset[train_end_idx:]

In [13]:
# Wrap the list of {'prompt', 'response'} records in a `datasets.Dataset`.
hf_dataset = datasets.Dataset.from_list(list_dataset)

In [14]:
# Hold out 20% of the rows as the 'test' split. A fixed seed makes the shuffle,
# and therefore the split membership, reproducible across kernel restarts.
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

In [15]:
# Display the splits with their features and row counts.
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 33179
    })
    test: Dataset({
        features: ['prompt', 'response'],
        num_rows: 8295
    })
})

In [16]:
# function to tokenize the inputs and outputs 
def preprocess_function(examples, tokenizer=None, max_length=512):
    """Tokenize a batch of prompt/response pairs into fixed-length tensors.

    Each row is built as ``prompt tokens + response tokens + EOS``. Rows
    shorter than ``max_length`` are left-padded with the EOS token id
    (attention mask 0, label -100); longer rows are cut to their first
    ``max_length`` tokens. Prompt positions are labelled -100, so only the
    response tokens and the EOS carry real label ids.

    Args:
        examples: batch mapping with ``'prompt'`` and ``'response'`` lists.
        tokenizer: callable returning a mapping with ``"input_ids"`` (lists of
            ints) and exposing ``eos_token_id``. Required despite the ``None``
            default, which exists only so it can be passed by keyword.
        max_length: length every row is padded or truncated to.

    Returns:
        The tokenizer's encoding of the prompts, with ``input_ids``,
        ``attention_mask`` and ``labels`` replaced by integer tensors of shape
        ``(batch, max_length)``.

    Raises:
        ValueError: if ``tokenizer`` is ``None``.
    """
    if tokenizer is None:
        raise ValueError("preprocess_function requires a tokenizer; got None")

    batch_size = len(examples['prompt'])
    model_inputs = tokenizer(examples['prompt'])
    labels = tokenizer(examples['response'], add_special_tokens=False)
    eos_id = tokenizer.eos_token_id

    input_rows, mask_rows, label_rows = [], [], []
    for i in range(batch_size):
        prompt_ids = model_inputs["input_ids"][i]
        response_ids = labels["input_ids"][i] + [eos_id]
        sequence = prompt_ids + response_ids
        targets = [-100] * len(prompt_ids) + response_ids

        # A negative pad count yields an empty list, so over-long rows get no
        # padding and are simply truncated by the slice.
        n_pad = max_length - len(sequence)
        input_rows.append(([eos_id] * n_pad + sequence)[:max_length])
        mask_rows.append(([0] * n_pad + [1] * len(sequence))[:max_length])
        label_rows.append(([-100] * n_pad + targets)[:max_length])

    # Explicit dtype and shape keep the result well-formed for an empty batch.
    shape = (batch_size, max_length)
    model_inputs['input_ids'] = torch.tensor(input_rows, dtype=torch.long).reshape(shape)
    model_inputs['attention_mask'] = torch.tensor(mask_rows, dtype=torch.long).reshape(shape)
    model_inputs['labels'] = torch.tensor(label_rows, dtype=torch.long).reshape(shape)
    return model_inputs

In [17]:
# Tokenize and pad both splits in batches. The original text columns are removed,
# leaving only the fields returned by preprocess_function.
# The map runs under Accelerate's `main_process_first()` context, and all
# processes are synchronised afterwards with `wait_for_everyone()`.
with accelerator.main_process_first():
    hf_dataset_tk = hf_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer, 'max_length': 512}, remove_columns=hf_dataset['train'].column_names)
accelerator.wait_for_everyone()

Map:   0%|          | 0/33179 [00:00<?, ? examples/s]

Map:   0%|          | 0/8295 [00:00<?, ? examples/s]

In [18]:
# Sanity check: list the fields of one tokenized training row.
hf_dataset_tk['train'][0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [19]:
# Decode one row back to text; the EOS left-padding shows up as repeated `</s>`.
tokenizer.decode(hf_dataset_tk['train'][0]['input_ids'])

"</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s

## Prepare the Hugging Face model, optimizer, and training loop

In [20]:
# LoRA adapter configuration for causal-LM fine-tuning (r=16, lora_alpha=16,
# dropout 0.05), built in training mode (inference_mode=False).
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.05)

In [21]:
# Training hyperparameters.
# LR: learning rate passed to AdamW.
LR = 0.0001
# EPOCHS: number of passes over the training dataloader.
EPOCHS = 7 
# BATCH_SIZE: rows per dataloader batch.
BATCH_SIZE = 1
# n_accumulation_steps: intended gradient-accumulation factor; used below to
# derive n_steps and the scheduler warmup length.
n_accumulation_steps = 8

In [22]:
# Shuffled dataloader over the tokenized training split, collated with
# transformers' default_data_collator.
train_dataloader = DataLoader(hf_dataset_tk['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=default_data_collator)


In [23]:
# Peek at one collated batch to check its contents.
next(iter(train_dataloader))

{'input_ids': tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,    

In [24]:
# Load the base causal LM in bfloat16, then wrap it with the LoRA adapters
# described by peft_config.
model = AutoModelForCausalLM.from_pretrained(LLAMA_WEIGHTS_PATH, torch_dtype=torch.bfloat16)
print("Converting model for PEFT training")
model = get_peft_model(model, peft_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


Converting model for PEFT training


In [25]:
# Report how many parameters are trainable after wrapping with PEFT.
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.12433454005023165


In [26]:
# Number of full groups of n_accumulation_steps batches per epoch; the
# remainder is dropped by the floor division.
n_steps = len(train_dataloader) // n_accumulation_steps

In [27]:
# AdamW over the model parameters, with a cosine learning-rate schedule with warmup.
# The schedule length is n_steps * EPOCHS, so it is sized in accumulation
# groups rather than in batches.
# NOTE(review): warmup length is set to n_accumulation_steps - confirm this is
# the intended number of warmup steps.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=n_accumulation_steps, num_training_steps=n_steps * EPOCHS)

In [28]:
# Hand the model, dataloader, optimizer and scheduler to Accelerate and keep
# the prepared objects it returns under the same names.
model, train_dataloader, optimizer, lr_scheduler = accelerator.prepare(model, train_dataloader, optimizer, lr_scheduler)

In [29]:
# Show the distributed/mixed-precision state Accelerate detected.
accelerator.state

Distributed environment: MULTI_GPU
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda:0

Mixed precision type: no

In [30]:
# True only when a DeepSpeed plugin is attached to the accelerator state and
# it is configured for ZeRO stage 3; False otherwise.
ds_plugin = getattr(accelerator.state, 'deepspeed_plugin', None)
is_ds_zero3 = ds_plugin.zero_stage == 3 if ds_plugin else False
    

In [None]:
# Training loop with gradient accumulation.
# The optimizer and scheduler advance once every n_accumulation_steps batches,
# which matches how n_steps and the scheduler length were derived. Batches left
# over at the end of an epoch (fewer than a full group) do not trigger an update.
for epoch in range(EPOCHS):
    with TorchTracemalloc() as tracemalloc:
        model.train()
        total_loss = 0
        # Start each epoch from clean gradients, discarding any incomplete
        # accumulation group left over from the previous epoch.
        optimizer.zero_grad()
        for step, batch in enumerate(tqdm(train_dataloader)):
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            # Scale so the accumulated gradient is the mean over the group.
            accelerator.backward(loss / n_accumulation_steps)
            if (step + 1) % n_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

    # Report the epoch's mean loss and the memory figures gathered by the tracer.
    n_batches = max(len(train_dataloader), 1)
    print(
        f"epoch {epoch}: mean train loss {float(total_loss) / n_batches:.4f} | "
        f"GPU MB used/peak {tracemalloc.used}/{tracemalloc.peaked} | "
        f"CPU MB used/peak {tracemalloc.cpu_used}/{tracemalloc.cpu_peaked}"
    )

  3%|▎         | 1086/33179 [06:34<3:17:07,  2.71it/s]

In [None]:
# Scratch: grab one batch from the prepared dataloader for inspection.
a = next(iter(train_dataloader))

In [None]:
def test_args(**kwargs):
    """Print the names of the keyword arguments that were passed in."""
    received = kwargs.keys()
    print(received)

In [None]:
# Scratch: unpack the batch as keyword arguments to see which keys it carries.
test_args(**a)



In [None]:
# Scratch: shape of the batch's input_ids tensor.
a['input_ids'].shape

In [None]:
# Scratch: length of input_ids along its first dimension.
len(a['input_ids'])

In [43]:
# Scratch: raw input_ids of the first ten tokenized training rows.
# NOTE(review): this prints ten full-length rows; consider slicing fewer
# rows/tokens to keep the saved output small.
hf_dataset_tk['train'][:10]['input_ids']

[[2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
