In [3]:
!pip install -q torch datasets peft transformers trl wandb

import os
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import SFTTrainer
import wandb
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
assert torch.cuda.is_available(), "CUDA is not available. Please check your GPU setup."
wandb.finish()
wandb.init(project="huggingface_causal_lm")


VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[34m[1mwandb[0m: Currently logged in as: [33makshay-sk1906[0m ([33makshay-sk1906-psg-college-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
dataset = load_dataset("causal-lm/cot_alpaca")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model.resize_token_embeddings(len(tokenizer))


Embedding(50258, 768)

In [5]:
def convert_message(batch):
    texts = []
    for instruction, input_, output in zip(batch["instruction"], batch["input"], batch["output"]):
        prompt = f"""
Below is a conversation between a human and an AI agent.

Instruction: {instruction}

Input: {input_}

Output: {output}
"""
        texts.append(prompt)
    return {"text": texts}


prompt_converted = dataset["train"].map(
    convert_message, batched=True, batch_size=8, remove_columns=dataset["train"].column_names
)
validation_converted = dataset["validation"].map(
    convert_message, batched=True, batch_size=8, remove_columns=dataset["validation"].column_names
)

In [14]:
training_dataset = prompt_converted.shuffle(seed=42).select(range(5000))
validation_dataset = validation_converted.shuffle(seed=42).select(range(1000))

In [16]:
def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )

training_dataset = prompt_converted.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
validation_dataset = validation_converted.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

lora_config = LoraConfig(
    r=16,
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=32,
    lora_dropout=0.1,
    fan_in_fan_out=True
)
peft_model = get_peft_model(model, lora_config)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

args = TrainingArguments(
    eval_strategy="epoch",
    num_train_epochs=5,
    output_dir="/content/",
    learning_rate=1e-5,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.1,
    logging_steps=100,
    logging_dir="/content/logs/",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",
    max_steps=200
)

trainer = SFTTrainer(
    model=peft_model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator
)


Map:   0%|          | 0/46747 [00:00<?, ? examples/s]

Map:   0%|          | 0/5195 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,3.1626,2.933823




TrainOutput(global_step=200, training_loss=3.1917770385742186, metrics={'train_runtime': 1350.6435, 'train_samples_per_second': 2.369, 'train_steps_per_second': 0.148, 'total_flos': 1683865416499200.0, 'train_loss': 3.1917770385742186, 'epoch': 0.06845212629417301})

In [18]:
training_dataset = prompt_converted.shuffle(seed=42).select(range(5000))
validation_dataset = validation_converted.shuffle(seed=42).select(range(1000))

In [20]:

args = TrainingArguments(
    eval_strategy="epoch",
    num_train_epochs=5,
    output_dir="/content/",
    learning_rate=1e-5,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.1,
    logging_steps=100,
    logging_dir="/content/logs/",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",
)

trainer = SFTTrainer(
    model=peft_model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator
)



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,2.9013,2.649852
2,2.3856,2.166807
4,2.2533,2.087943




TrainOutput(global_step=1560, training_loss=2.5454729618170324, metrics={'train_runtime': 1150.1021, 'train_samples_per_second': 21.737, 'train_steps_per_second': 1.356, 'total_flos': 2179462856491008.0, 'train_loss': 2.5454729618170324, 'epoch': 4.992})

In [25]:
training_dataset = prompt_converted.shuffle(seed=123).select(range(5000))
validation_dataset = validation_converted.shuffle(seed=123).select(range(1000))
args = TrainingArguments(
    eval_strategy="epoch",
    num_train_epochs=5,
    output_dir="/content/",
    learning_rate=3e-5,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.1,
    logging_steps=100,
    logging_dir="/content/logs/",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",
)

trainer = SFTTrainer(
    model=peft_model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator
)



In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,2.0933,1.944037
2,1.9787,1.894977
4,1.9721,1.88358




TrainOutput(global_step=1560, training_loss=2.0199124849759613, metrics={'train_runtime': 1191.343, 'train_samples_per_second': 20.985, 'train_steps_per_second': 1.309, 'total_flos': 2126028084314112.0, 'train_loss': 2.0199124849759613, 'epoch': 4.992})

In [27]:

training_dataset = prompt_converted.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
validation_dataset = validation_converted.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

lora_config = LoraConfig(
    r=16,
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=32,
    lora_dropout=0.1,
    fan_in_fan_out=True
)
peft_model = get_peft_model(model, lora_config)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

args = TrainingArguments(
    eval_strategy="epoch",
    num_train_epochs=5,
    output_dir="/content/",
    learning_rate=1e-5,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.1,
    logging_steps=100,
    logging_dir="/content/logs/",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="wandb",
    max_steps=200
)

trainer = SFTTrainer(
    model=peft_model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=training_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator
)


Map:   0%|          | 0/46747 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
