In [2]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    AutoModelForSeq2SeqLM,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset

2024-08-08 04:49:39.497531: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-08 04:49:39.497641: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-08 04:49:39.626198: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
base_model = "NousResearch/Llama-2-7b-chat-hf"

In [5]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [6]:
device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": "cpu",
    "transformer.h": 0,
    "transformer.ln_f": 0,
}

quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

model = AutoModelForCausalLM.from_pretrained(base_model,device_map='auto', quantization_config=quantization_config)
#tokenizer = AutoTokenizer.from_pretrained("AlekseyKorshuk/vicuna-7b")

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [8]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [16]:
def tokenize_function(examples):
    inputs = examples['instruction']
    targets = examples['output']
    inputs = [tokenizer.bos_token + inp + tokenizer.eos_token for inp in inputs]
    targets = [tokenizer.bos_token + tgt + tokenizer.eos_token for tgt in targets]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [10]:
dataset1 = load_dataset("TIGER-Lab/MathInstruct", split = "train[:500]")


Downloading readme:   0%|          | 0.00/2.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/212M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/262039 [00:00<?, ? examples/s]

In [11]:
dataset1 = dataset1.train_test_split(test_size=0.25, seed=42)


In [17]:
tokenized_datasets = dataset1.map(tokenize_function, batched=True, remove_columns=['source'])
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/375 [00:00<?, ? examples/s]



Map:   0%|          | 0/125 [00:00<?, ? examples/s]

In [21]:
training_args = TrainingArguments(
    output_dir='./resultsmath',
    evaluation_strategy="epoch",
    learning_rate=2e-9,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=False,
)



In [22]:
from transformers import Trainer, default_data_collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,18.016766
2,17.793800,18.015556


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


TrainOutput(global_step=750, training_loss=17.826335286458335, metrics={'train_runtime': 2307.6714, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.325, 'total_flos': 1.5315426607104e+16, 'train_loss': 17.826335286458335, 'epoch': 2.0})

In [27]:
trainer.push_to_hub()

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

events.out.tfevents.1723093696.dd8deefd772f.34.1:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

events.out.tfevents.1723093628.dd8deefd772f.34.0:   0%|          | 0.00/5.48k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Atharva1244/resultsmath/commit/bab5ba6c14e6431b6326cd0d94b7c956216d8b51', commit_message='End of training', commit_description='', oid='bab5ba6c14e6431b6326cd0d94b7c956216d8b51', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…