In [1]:
import os

import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show which device was selected (plain-text form, e.g. "cuda").
print(str(device))

cuda


In [3]:
# 4-bit NF4 quantization with double quantization; compute runs in float16.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# The Hugging Face access token is read from the HF_TOKEN environment variable
# rather than being committed to the notebook. When the variable is unset,
# `token=None` lets the hub client fall back to a cached `huggingface-cli login`.
# SECURITY: any token that was ever saved in this notebook must be treated as
# leaked and revoked in the Hugging Face account settings.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    cache_dir="/Llama2-hf",
    device_map="auto",
    token=os.environ.get("HF_TOKEN"),
)

Loading checkpoint shards: 100%|██████████| 2/2 [01:30<00:00, 45.30s/it]


In [4]:
# Load the tokenizer for the same base checkpoint. The access token comes from
# the HF_TOKEN environment variable (never hard-code it in the notebook); when
# unset, `token=None` falls back to a cached `huggingface-cli login`.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    use_fast=True,
    add_eos_token=True,  # append EOS to every encoded example
    cache_dir="/Llama2-hf",
    token=os.environ.get("HF_TOKEN"),
)
# Use the unknown token for padding and pad on the left.
# NOTE(review): TRL warns about non-"right" padding with half-precision
# training -- confirm that left padding is intended here.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"

In [5]:
# Read the train and validation splits from their CSV files (in that order).
train_dataset, validation_dataset = (
    Dataset.from_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/validation.csv')
)
# The test split is currently not loaded:
# test_dataset = Dataset.from_csv('dataset/test.csv')

Generating train split: 36315 examples [00:00, 271800.92 examples/s]


In [6]:
# Bundle the loaded splits under their split names.
# (A 'test' entry is intentionally left out for now.)
dataset = DatasetDict(dict(train=train_dataset, validation=validation_dataset))

In [7]:
# Print the split names, feature columns and row counts.
print(str(dataset))

DatasetDict({
    train: Dataset({
        features: ['translations'],
        num_rows: 145257
    })
    validation: Dataset({
        features: ['translations'],
        num_rows: 36315
    })
})


In [8]:
# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 16,
# 5% dropout, no bias terms, adapters attached only to the listed projections.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["down_proj", "up_proj", "gate_proj"],
)

In [9]:
# Hyper-parameters for the fine-tuning run (400 optimizer steps, effective
# batch size 24 * 2 = 48 per device).
training_arguments = TrainingArguments(
    output_dir="working/results/",
    eval_strategy="steps",
    optim="paged_adamw_8bit",
    log_level="debug",
    logging_steps=100,
    eval_steps=100,
    # Checkpoint on the same cadence as evaluation. The library default
    # (save_steps=500) is never reached with max_steps=400, which would leave
    # a multi-hour run without any periodic checkpoint to resume or load from.
    save_strategy="steps",
    save_steps=100,
    learning_rate=1e-4,
    fp16=True,
    do_eval=True,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    max_steps=400,  # overrides num_train_epochs
    lr_scheduler_type="linear",
)

In [10]:
# Shell escape: print the current GPU status (memory in use, utilisation)
# before the trainer is created in the next cell.
!nvidia-smi

Wed Jul  3 20:26:57 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.99                 Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1070      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   43C    P8             10W /  180W |    4711MiB /   8192MiB |      4%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
# Build the supervised fine-tuning trainer: the quantized base model gets the
# LoRA adapters from `peft_config`, and examples are taken from the
# "translations" text column, capped at 24 tokens per sequence.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    dataset_text_field="translations",
    max_seq_length=24,
)

# Start training; as the last expression, the result is shown as cell output.
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 145257/145257 [00:06<00:00, 21112.99 examples/s]
Map: 100%|██████████| 36315/36315 [00:01<00:00, 23960.04 examples/s]
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 24
***** Running training *****
  Num examples = 145,257
  Num Epochs = 1
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 2
  Total optimization steps = 400
  Number of trainable parameters = 23,199,744
  attn_output = torch.nn.functional.scaled_dot_product_attention(
 13%|█▎        | 51/400 [43:58<5:10:06, 53.31s/it]

In [None]:
# Reload a clean 4-bit base model and tokenizer (for attaching a trained adapter).
# NOTE(review): if the training model from the earlier cells is still resident
# on the GPU, loading a second copy may not fit in memory -- consider
# restarting the kernel before running this cell.
base_model = "meta-llama/Llama-2-7b-hf"
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
# Use the same cache directory as the first load so the weights are not
# downloaded a second time, and take the access token from HF_TOKEN
# (None falls back to a cached `huggingface-cli login`).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=bnb_config,
    cache_dir="/Llama2-hf",
    token=os.environ.get("HF_TOKEN"),
)
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    use_fast=True,
    cache_dir="/Llama2-hf",
    token=os.environ.get("HF_TOKEN"),
)

In [None]:
# Attach the trained LoRA adapter to the freshly loaded base model.
# The checkpoint folder is named after the step at which it was saved, and
# training stops at max_steps=400, so a hard-coded "checkpoint-500" can never
# exist. Resolve the newest checkpoint-<step> directory instead.
last_checkpoint = get_last_checkpoint("working/results/")
if last_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint-* directory found in working/results/; run training first."
    )
model = PeftModel.from_pretrained(model, last_checkpoint)