In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    IntervalStrategy,
)

from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    LoraConfig,
    TaskType,
    PeftConfig,
    PeftModel
)

import bitsandbytes as bnb
from datasets import Dataset
from huggingface_hub import notebook_login

import sys
sys.path.insert(0, '../..')
import utils as u

# --- Notebook-wide configuration -------------------------------------------
# Base seq2seq checkpoint that gets quantized and fine-tuned.
MODEL_SEQ2SEQ = "google/flan-t5-base"
# Name under which the LoRA adapter is saved locally and on the Hub.
PEFT_MODEL_ID = "flan-base-4bit-005-gender-debias-spanish"
# CSV file (inside ../../data/processed/) holding the input/target pairs.
CORPUS_FILE = "20231109_gender_bias_dataset.csv"
# Hugging Face namespace that owns the adapter repository.
HF_USER = "GianniCatBug"

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Interactive Hugging Face Hub login widget.
notebook_login()

cuda


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Download model and tokenizer

In [3]:
# Load the tokenizer that belongs to the base checkpoint named in MODEL_SEQ2SEQ.
tokenizer = AutoTokenizer.from_pretrained(MODEL_SEQ2SEQ)

In [4]:
# 4-bit quantization settings: NF4 quantization type, double quantization
# enabled, and bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the base seq2seq model with the quantization settings above.
# The device map assigns the root module ("") to device index 0.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_SEQ2SEQ,
    quantization_config=quantization_config,
    device_map={"": 0},
)

## Prepare new tokens

In [5]:
# Tokenizer vocabulary size vs. number of rows in the model's input embeddings
# (before any change).
print(len(tokenizer))
print(model.get_input_embeddings().num_embeddings)

# Register the extra seq2seq tokens declared in the project's utils module.
extra_tokens = u.new_tokens["SEQ_2_SEQ"]
num_added_tokens = tokenizer.add_tokens(extra_tokens)
print(num_added_tokens)

# Round-trip check: encode all new tokens (space separated) and decode them
# back, keeping special tokens visible.
encoding_new_t = tokenizer(" ".join(extra_tokens), return_tensors="pt")
new_token_ids = encoding_new_t["input_ids"]
print(new_token_ids)
print(tokenizer.decode(new_token_ids[0], skip_special_tokens=False))

# Make the embedding matrix match the tokenizer length, then show its new size.
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings().num_embeddings)

32100
32128
10
tensor([[32100,     3, 32101,     3, 32102,     3, 32103,     3, 32104,     3,
         32105,     3, 32106,     3, 32107,     3, 32108,     3, 32109,     1]])
í ñ ú ¡ Í ¿ Á Ó Ú Ñ</s>
32110


# Prepare data

In [6]:
# Read the processed corpus and prefix every input with the task instruction
# (in Spanish, followed by a newline) that the model is trained to follow.
corpus_path = f"../../data/processed/{CORPUS_FILE}"
instruction_prefix = "Eliminar sesgo de género del siguiente texto:\n"

df = pd.read_csv(corpus_path).assign(
    input=lambda frame: instruction_prefix + frame["input"]
)
df.head(2)

Unnamed: 0,input,target
0,Eliminar sesgo de género del siguiente texto:\...,"Chilkatufe UChile mew, estudiantes mapuche U. ..."
1,Eliminar sesgo de género del siguiente texto:\...,"Biblioteca Central, FCFM Académicas mapuche, FCFM"


## Get input and output max_seq_length

In [7]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.3)

# Row/column counts of the full corpus and of each split.
df.shape, train_df.shape, val_df.shape

((31195, 2), (21836, 2), (9359, 2))

In [9]:
def _token_counts(texts):
    """Return a Series holding the number of token ids produced for each text."""
    encoded = tokenizer(texts)
    return pd.Series(list(map(len, encoded["input_ids"])))


# Use the 99th percentile of the training-set token lengths as the maximum
# sequence length for inputs and targets respectively.
source_lengths = _token_counts(train_df["input"].to_list())
max_source_length = int(source_lengths.quantile(0.99))

target_lengths = _token_counts(train_df["target"].to_list())
max_target_length = int(target_lengths.quantile(0.99))

max_source_length, max_target_length

(244, 228)

In [10]:
# Summary statistics of the tokenized input and target lengths on the training split.
source_lengths.describe(), target_lengths.describe()

(count    21836.000000
 mean        80.989101
 std         50.720423
 min         20.000000
 25%         48.000000
 50%         70.000000
 75%        101.000000
 max       2629.000000
 dtype: float64,
 count    21836.000000
 mean        63.413217
 std         51.077284
 min          2.000000
 25%         30.000000
 50%         52.000000
 75%         84.000000
 max       2611.000000
 dtype: float64)

## Create datasets

In [11]:
def _tokenize_split(frame):
    """Convert a DataFrame split into a Dataset preprocessed for training.

    The rows are passed in batches through ``u.preprocess_function`` together
    with the tokenizer and the maximum source/target lengths computed earlier;
    the raw ``input`` and ``target`` text columns are removed afterwards.
    """
    return Dataset.from_pandas(frame).map(
        lambda batch: u.preprocess_function(
            batch,
            tokenizer,
            max_source_length=max_source_length,
            max_target_length=max_target_length,
        ),
        batched=True,
        remove_columns=["input", "target"],
    )


train_dataset = _tokenize_split(train_df)
val_dataset = _tokenize_split(val_df)

train_dataset, val_dataset

Map:   0%|          | 0/21836 [00:00<?, ? examples/s]

Map:   0%|          | 0/9359 [00:00<?, ? examples/s]

(Dataset({
     features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 21836
 }),
 Dataset({
     features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 9359
 }))

# Test Raw Model

In [9]:
# Sample instruction + text used to probe the model before fine-tuning.
prompt = """Eliminar sesgo de género del siguiente texto:
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?
"""

# Tokenize to PyTorch tensors, then move them to the selected device.
encoding = tokenizer(prompt, return_tensors="pt")
encoding = encoding.to(device)

In [10]:
%%time
generation_config = model.generation_config
generation_config.max_new_tokens = 1000
generation_config.num_return_sequences = 1
#generation_config.temperature = 0.7
generation_config.do_sample = False
#generation_config.top_p = 0.9

with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config,
  )

CPU times: user 2.01 s, sys: 26.1 ms, total: 2.03 s
Wall time: 2.03 s


In [11]:
%%time
print("Prompt:")
print(prompt)
print("Raw model generation:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Prompt:
Eliminar sesgo de género del siguiente texto:
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?

Raw model generation:
Eliminar sexo del siguiente texto: En el Torneo de Innovación Interfacultades UChile te equivocamos invitar a nuestra primer Taller "soy innovador/a"?
CPU times: user 880 µs, sys: 0 ns, total: 880 µs
Wall time: 875 µs


# PEFT

In [12]:
# Turn on gradient checkpointing on the base model before wrapping it.
model.gradient_checkpointing_enable()

# LoRA adapter configuration for a sequence-to-sequence language model.
lora_config = LoraConfig(
    peft_type="LORA",
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,                       # adapter rank
    lora_alpha=32,
    lora_dropout=0.05,          # regularization against overfitting
    target_modules=["q", "v"],  # modules that receive LoRA adapters
    bias="none",
)

# Prepare the quantized model for k-bit training, then attach the adapters.
kbit_ready_model = prepare_model_for_kbit_training(model)
peft_lora_model = get_peft_model(kbit_ready_model, lora_config)

# Report how many parameters remain trainable after wrapping.
peft_lora_model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 249,319,680 || trainable%: 0.7097201472422875


In [13]:
# Padding value for label positions, chosen so that padded label tokens are
# ignored when the loss is computed.
label_pad_token_id = -100

# Collator that batches the tokenized examples for the seq2seq model, padding
# to a multiple of 8 and using the label padding value above.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=peft_lora_model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [14]:
# Local directory for checkpoints and logs of this adapter. The directory name
# is exactly the adapter id (no trailing dots, which would yield an oddly named
# folder and are not valid directory names on every platform).
output_dir = f"../../models/{PEFT_MODEL_ID}"

# Saving, logging and evaluation share one cadence. With 21,836 training rows
# and an effective batch of 4 x 4 = 16, 682 optimizer steps is roughly half an
# epoch. Keeping save and eval steps identical is what lets
# `load_best_model_at_end` match every evaluation to a saved checkpoint.
steps_between_checkpoints = 682

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    save_strategy=IntervalStrategy.STEPS,
    save_steps=steps_between_checkpoints,
    push_to_hub=True,
    hub_model_id=f"{HF_USER}/{PEFT_MODEL_ID}",
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=steps_between_checkpoints,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    # Stated explicitly instead of relying on the fallback to `logging_steps`.
    eval_steps=steps_between_checkpoints,
)

# Trainer wiring: LoRA-wrapped model, seq2seq collator, and the tokenized
# train/validation datasets.
trainer = Seq2SeqTrainer(
    model=peft_lora_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [15]:
# Dump the fully resolved training arguments (including defaults) for inspection.
print(training_args)

Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=682,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=None,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
greater_is_better=Fal

In [16]:
# Print the module tree of the LoRA-wrapped model to check where adapters were attached.
print(peft_lora_model)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32110, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32110, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear4bit(
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                    (base_layer): Linear4bit(

In [15]:
# Switch off the model's generation cache before training.
# NOTE(review): presumably because caching conflicts with the gradient
# checkpointing enabled earlier — confirm.
peft_lora_model.config.use_cache = False
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
682,0.1304,0.05549
1364,0.0628,0.041384
2046,0.0544,0.036213
2728,0.0468,0.030398
3410,0.0427,0.03038
4092,0.0373,0.027713
4774,0.034,0.025612
5456,0.0337,0.02534
6138,0.03,0.024781
6820,0.0289,0.024001




TrainOutput(global_step=6820, training_loss=0.050108268044211646, metrics={'train_runtime': 10896.2887, 'train_samples_per_second': 10.02, 'train_steps_per_second': 0.626, 'total_flos': 3.647791541256192e+16, 'train_loss': 0.050108268044211646, 'epoch': 5.0})

# Save and load

In [16]:
# Persist the trained adapter and the extended tokenizer to a local directory
# named after the adapter id (relative to the notebook's working directory).
trainer.model.save_pretrained(PEFT_MODEL_ID)
tokenizer.save_pretrained(PEFT_MODEL_ID)

# Upload the adapter to the Hub. The repository is addressed by its fully
# qualified "<user>/<name>" id — the same id used for `hub_model_id` during
# training and for reloading the adapter later — so the push does not depend on
# which account happens to be logged in.
peft_lora_model.push_to_hub(
    f"{HF_USER}/{PEFT_MODEL_ID}", use_auth_token=True
)



CommitInfo(commit_url='https://huggingface.co/GianniCatBug/flan-base-4bit-005-gender-debias-spanish/commit/ebd1fe6637101850fbf28e6d80f939c08d87a40c', commit_message='Upload model', commit_description='', oid='ebd1fe6637101850fbf28e6d80f939c08d87a40c', pr_url=None, pr_revision=None, pr_num=None)

# Test fine-tuned model

In [19]:
# Hub repository holding the trained adapter; used for both its config and weights.
adapter_repo = f"{HF_USER}/{PEFT_MODEL_ID}"

# The adapter config records which base checkpoint it was trained on.
config = PeftConfig.from_pretrained(adapter_repo)
print(PEFT_MODEL_ID, config.base_model_name_or_path)

# Reload the base model in 4-bit (NF4, double quantization, bfloat16 compute).
inference_quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=inference_quantization,
    device_map={"": 0},
)

# Embedding rows of the freshly loaded base model.
print(model.get_input_embeddings().num_embeddings)

# Rebuild the tokenizer from the base checkpoint, add the extra seq2seq tokens
# again, and resize the embeddings to the tokenizer length.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
_ = tokenizer.add_tokens(new_tokens=u.new_tokens["SEQ_2_SEQ"])
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings().num_embeddings)  # 32110

# Attach the LoRA adapter weights on top of the resized base model.
model = PeftModel.from_pretrained(model, adapter_repo, device_map={"": 0})
print(model.get_input_embeddings().num_embeddings)  # 32110

# Inference mode (e.g. dropout disabled).
model.eval()

print("Peft model loaded")

flan-base-4bit-005-gender-debias-spanish google/flan-t5-base
32128
32110
32110
Peft model loaded


In [24]:
# Adjust the reloaded model's generation config in place: deterministic
# (non-sampled) decoding, one returned sequence, at most 1000 new tokens.
# Sampling knobs such as temperature / top_p are deliberately left untouched.
generation_config = model.generation_config
generation_config.do_sample = False
generation_config.max_new_tokens = 1000
generation_config.num_return_sequences = 1

In [25]:
%%time
prompt = """Eliminar sesgo de género del siguiente texto:
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?
"""
encoding = tokenizer(prompt, return_tensors="pt").to(device)

with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config,
  )

CPU times: user 1.53 s, sys: 103 ms, total: 1.63 s
Wall time: 1.63 s


In [26]:
%%time
print("Prompt:")
print(prompt)
print("Fine tuned model generation:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Prompt:
Eliminar sesgo de género del siguiente texto:
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?

Fine tuned model generation:
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿ Soy Innovador/a"? 
CPU times: user 598 µs, sys: 0 ns, total: 598 µs
Wall time: 543 µs
