# LangChain Cookbook 👨‍🍳👩‍🍳

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [2]:
#!pip install accelerate==0.21.0
#!pip install accelerate

#!pip install transformers==4.28.0
!pip install transformers



In [3]:
!pip install datasets



In [5]:
#!pip install torch



In [4]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
from transformers import LlamaForCausalLM
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/test/backup/daily_tokenizer_0612")
model = LlamaForCausalLM.from_pretrained('/content/drive/MyDrive/test/backup/daily_llama_0612')

model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(50257, 512, padding_idx=0)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=1376, bias=False)
          (down_proj): Linear(in_features=1376, out_features=512, bias=False)
          (up_proj): Linear(in_features=512, out_features=1376, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_he

# **get peft model**

In [6]:
!pip install peft



In [7]:
from peft import get_peft_model, LoraConfig, TaskType

In [8]:
list(TaskType)

[<TaskType.SEQ_CLS: 'SEQ_CLS'>,
 <TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>,
 <TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
 <TaskType.TOKEN_CLS: 'TOKEN_CLS'>,
 <TaskType.QUESTION_ANS: 'QUESTION_ANS'>,
 <TaskType.FEATURE_EXTRACTION: 'FEATURE_EXTRACTION'>]

In [10]:
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                        inference_mode=False,
                        r=32,
                        lora_alpha=32,
                        lora_dropout=0.1)

In [11]:
model = get_peft_model(model, peft_config)
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(50257, 512, padding_idx=0)
        (layers): ModuleList(
          (0-3): 4 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=512, out_features=512, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=512, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=512, out_features=512, bias=False)
              (v_proj): Linear(


In [12]:
model.print_trainable_parameters()

trainable params: 262,144 || all params: 64,378,368 || trainable%: 0.40719267689420147


## load dataset

In [13]:
from datasets import load_dataset

dataset_cate = load_dataset('heegyu/news-category-balanced-top10')

In [14]:
categories = dataset_cate['train'].to_pandas().category.unique().tolist()
categories.sort()
categories = categories[:4]

dataset_cate = dataset_cate.filter(lambda element: element['category'] in categories)
dataset_cate

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
        num_rows: 29026
    })
})

In [15]:
categories = [x.split(' ')[0].lower() for x in categories]
int2label_cate = {i: categories[i] for i in range(len(categories))}
label2int_cate = {int2label_cate[key]:key for key in int2label_cate}

In [16]:
def gen_label(element):
    category = element['category'].split(' ')[0].lower()
    return {'label': label2int_cate[category], 'category': category}

dataset_cate = dataset_cate.map(gen_label)
dataset_cate = dataset_cate['train'].train_test_split(test_size=0.1)
dataset_cate

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 26123
    })
    test: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 2903
    })
})

In [17]:
from datasets import DatasetDict
from datasets import concatenate_datasets
import random

prompt_format1_cate = """Given the article, what is the topic of the article? article: %s  answer: %s"""
prompt_format2_cate = """Determine the topic of the news article. article: %s answer: %s"""
prompt_format3_cate = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer: %s"""

prompts_cate = [prompt_format1_cate, prompt_format2_cate, prompt_format3_cate]

def gen_prompt_cate(element):
    prompt_format = prompts_cate[random.randint(0, len(prompts_cate)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'], int2label_cate[element['label']])})

train_cate = dataset_cate['train'].map(gen_prompt_cate, remove_columns=dataset_cate['train'].column_names)
train_dataset = train_cate

Map:   0%|          | 0/26123 [00:00<?, ? examples/s]

In [18]:
def tokenize(element):
    tokenizer.pad_token = tokenizer.eos_token
    outputs = tokenizer(
        element['input'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
        padding=True
    )

    return {"input_ids": outputs["input_ids"]}


context_length=128
tokenized_datasets = train_dataset.map(
    tokenize, batched=True, remove_columns=train_dataset.column_names
)
tokenized_datasets

Map:   0%|          | 0/26123 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 26123
})

## train

In [19]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 54])
attention_mask shape: torch.Size([5, 54])
labels shape: torch.Size([5, 54])


In [21]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="peft_llama",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=4,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=1_000,
    fp16=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

In [22]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=3264, training_loss=3.363852706609988, metrics={'train_runtime': 620.2888, 'train_samples_per_second': 168.457, 'train_steps_per_second': 5.262, 'total_flos': 1425718508544000.0, 'train_loss': 3.363852706609988, 'epoch': 4.0})

## **evaluation**

In [24]:
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/test/backup/daily_tokenizer_0612", padding_side='left')
prompt_format1 = """Given the article, what is the topic of the article? article: %s  answer:"""
prompt_format2 = """Determine the topic of the news article. article: %s answer:"""
prompt_format3 = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer:"""

prompts = [prompt_format1, prompt_format2, prompt_format3]

def gen_valid_prompt_cate(element):
    prompt_format = prompts[random.randint(0, len(prompts)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'])})




valid_dataset = dataset_cate['test'].map(gen_valid_prompt_cate)

context_length=128
valid_dataset = valid_dataset.map(
    tokenize, batched=True, remove_columns=['link', 'headline', 'category', 'short_description', 'authors', 'date', 'input']
)
valid_dataset

Map:   0%|          | 0/2903 [00:00<?, ? examples/s]

Map:   0%|          | 0/2903 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 2903
})

In [25]:
from torch.utils.data import DataLoader

batch_size=4
val_ds = valid_dataset.select(range(100))
val_ds.set_format(type='torch')
val_dl = DataLoader(val_ds, batch_size=batch_size)

In [29]:
import re
from tqdm import tqdm


def acc(pred,label):
  return torch.sum(torch.tensor(pred) == label.squeeze()).item()
model.eval()

val_acc = 0

for step, batch in enumerate(tqdm(val_dl)):
    label = batch['label']

    input_id = batch['input_ids'].to(device)

    pred = model.generate(input_ids=input_id, max_length=70)
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_pred = [re.findall("answer: ([a-z]+)", x)[0] if re.findall("answer: ([a-z]+)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_cate[x] if x in label2int_cate else -1 for x in decoded_pred]

    val_acc += acc(decoded_pred, label)

print("val acc: ", val_acc/len(val_dl.dataset))

100%|██████████| 25/25 [00:04<00:00,  5.10it/s]

val acc:  0.72





In [30]:
model.save_pretrained('/content/drive/MyDrive/test/backup/peft_llama_adapter__')

In [32]:
import os
os.stat('/content/drive/MyDrive/test/backup/peft_llama_adapter__/adapter_model.bin').st_size/(1024*1024)

1.0055246353149414

In [33]:
os.stat('/content/drive/MyDrive/test/backup/daily_llama_0612/pytorch_model.bin').st_size/(1024*1024)

244.59843635559082

In [35]:
from peft import PeftModel
from transformers import LlamaForCausalLM

In [36]:
model_load = LlamaForCausalLM.from_pretrained('/content/drive/MyDrive/test/backup/daily_llama_0612')

In [37]:
model_load = PeftModel.from_pretrained(model_load,'/content/drive/MyDrive/test/backup/peft_llama_adapter__')

In [38]:
model_load.eval()
model_load.to(device)

val_acc = 0

for step, batch in enumerate(tqdm(val_dl)):
    label = batch['label']

    input_id = batch['input_ids'].to(device)

    pred = model_load.generate(input_ids=input_id, max_length=70)
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_pred = [re.findall("answer: ([a-z]+)", x)[0] if re.findall("answer: ([a-z]+)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_cate[x] if x in label2int_cate else -1 for x in decoded_pred]

    val_acc += acc(decoded_pred, label)

print("val acc: ", val_acc/len(val_dl.dataset))

100%|██████████| 25/25 [00:07<00:00,  3.25it/s]

val acc:  0.72



