In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    IntervalStrategy,
    Trainer,
)

from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    LoraConfig,
    TaskType,
    PeftConfig,
    PeftModel
)

import bitsandbytes as bnb
from datasets import Dataset
from huggingface_hub import notebook_login

import sys
sys.path.insert(0, '../..')
import utils as u

MODEL_CAUSAL = "vilsonrodrigues/falcon-7b-instruct-sharded"
PEFT_MODEL_ID = "falcon-7b-4bit-005-gender-debias-spanish"
CORPUS_FILE = "20231109_gender_bias_dataset.csv"
HF_USER = "GianniCatBug"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

notebook_login()

cuda


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Download model and tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CAUSAL)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_CAUSAL,
    device_map="auto",
    quantization_config=quantization_config,
    trust_remote_code=True,
)
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████| 15/15 [01:56<00:00,  7.77s/it]


Embedding(65024, 4544)

# Test Raw Model

In [4]:
prompt = f"""
<human>: ¿Puedes reescribir el siguiente texto sin sesgo de género?
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?
<assistant>:
""".strip()

encoding = tokenizer(prompt, return_tensors="pt").to(device)

In [5]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.num_return_sequences = 1
#generation_config.temperature = 0.7
generation_config.do_sample = False
#generation_config.top_p = 0.9

In [6]:
%%time
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config,
  )

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


CPU times: user 3.93 s, sys: 69 ms, total: 4 s
Wall time: 6.82 s


In [7]:
%%time
print("Prompt:")
print(prompt)
print("\nRaw model generation:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Prompt:
<human>: ¿Puedes reescribir el siguiente texto sin sesgo de género?
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?
<assistant>:

Raw model generation:
<human>: ¿Puedes reescribir el siguiente texto sin sesgo de género?
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?
<assistant>: ¿Puedes reescribir el siguiente texto sin sesgo de género?
Dentro del Torneo de Innovación Interfacultades UChile tequeremos invitar a nuestro primer Taller "¿Soy Innovador/a"?
User 
CPU times: user 349 µs, sys: 209 µs, total: 558 µs
Wall time: 435 µs


# Prepare data

In [8]:
df = pd.read_csv(f"../../data/processed/{CORPUS_FILE}")
df["input_f"] = [
     f"""
    <human>: ¿Puedes reescribir el siguiente texto sin sesgo de género?
    {i}
    <assistant>: {t}
    <|endoftext|>""".strip()
    for i, t in zip(df["input"], df["target"])
]
df.head(2)

Unnamed: 0,input,target,input_f
0,"Chilkatufe UChile mew, estudiantes mapuche U. ...","Chilkatufe UChile mew, estudiantes mapuche U. ...",<human>: ¿Puedes reescribir el siguiente texto...
1,"Biblioteca Central, FCFM Académicas mapuche, FCFM","Biblioteca Central, FCFM Académicas mapuche, FCFM",<human>: ¿Puedes reescribir el siguiente texto...


## Get input and output max_seq_length

In [9]:
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)
df.shape, train_df.shape, val_df.shape

((31195, 3), (21836, 3), (9359, 3))

In [11]:
source_lengths = pd.Series([
    len(i)
    for i in tokenizer(train_df["input_f"].to_list())["input_ids"]
])
max_source_length = int(source_lengths.quantile(0.962))
print(max_source_length, source_lengths.quantile(0.99))
source_lengths.describe()

250 361.0


count    21836.000000
mean       117.391097
std         76.827688
min         31.000000
25%         71.000000
50%        101.000000
75%        143.000000
max       4925.000000
dtype: float64

## Create datasets

In [12]:
train_dataset = Dataset.from_pandas(train_df).map(
    lambda data: tokenizer(
        data["input_f"],
        max_length=max_source_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device),
    batched=True,
    remove_columns=["input", "target", "input_f"]
)

val_dataset = Dataset.from_pandas(val_df).map(
    lambda data: tokenizer(
        data["input_f"],
        max_length=max_source_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device),
    batched=True,
    remove_columns=["input", "target", "input_f"]
)

train_dataset, val_dataset

Map: 100%|████████████████████████████████████████████████████████████████████| 21836/21836 [00:02<00:00, 10123.97 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████| 9359/9359 [00:01<00:00, 9277.55 examples/s]


(Dataset({
     features: ['__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 21836
 }),
 Dataset({
     features: ['__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 9359
 }))

# PEFT

In [13]:
model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "query_key_value",
    ],       
)

peft_lora_model = get_peft_model(
    prepare_model_for_kbit_training(model),
    lora_config,
)

peft_lora_model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 6,926,439,296 || trainable%: 0.06812435363037071


In [14]:
output_dir = f"../../models/{PEFT_MODEL_ID}"

training_args = TrainingArguments(
    output_dir=output_dir,
	per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    save_strategy=IntervalStrategy.STEPS,
    save_steps=682,
    push_to_hub=True,
    hub_model_id=f"{HF_USER}/{PEFT_MODEL_ID}",
    learning_rate=2e-4,
    fp16=True,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=682,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
)

trainer = Trainer(
    model=peft_lora_model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [15]:
print(training_args)

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=682,
evaluation_strategy=steps,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=True,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model

In [16]:
print(peft_lora_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): FalconForCausalLM(
      (transformer): FalconModel(
        (word_embeddings): Embedding(65024, 4544)
        (h): ModuleList(
          (0-31): 32 x FalconDecoderLayer(
            (self_attention): FalconAttention(
              (maybe_rotary): FalconRotaryEmbedding()
              (query_key_value): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4544, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4672, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4544, out_features=4672, bias=False)
              )
           

In [14]:
peft_lora_model.config.use_cache = False
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
682,1.2262,0.99901
1364,0.9641,0.94355
2046,0.8973,0.913257
2728,0.8806,0.88814
3410,0.8338,0.875492
4092,0.8207,0.862699
4774,0.7838,0.859339
5456,0.7849,0.853606
6138,0.7634,0.854031
6820,0.7611,0.854189




TrainOutput(global_step=6820, training_loss=0.8715902557820518, metrics={'train_runtime': 73602.7452, 'train_samples_per_second': 1.483, 'train_steps_per_second': 0.093, 'total_flos': 1.0853572088832e+18, 'train_loss': 0.8715902557820518, 'epoch': 5.0})

# Save and load

In [23]:
trainer.model.save_pretrained(PEFT_MODEL_ID)
tokenizer.save_pretrained(PEFT_MODEL_ID)

peft_lora_model.push_to_hub(
    PEFT_MODEL_ID, use_auth_token=True
)



CommitInfo(commit_url='https://huggingface.co/GianniCatBug/falcon-7b-4bit-005-gender-debias-spanish/commit/10d83fc9c946e87061a52d6876e9542ce692598d', commit_message='Upload model', commit_description='', oid='10d83fc9c946e87061a52d6876e9542ce692598d', pr_url=None, pr_revision=None, pr_num=None)

# Test fine-tuned model

In [2]:
REVISION = "87ae1730160cf7022b4a02584223fa82f3e6fe52"

config = PeftConfig.from_pretrained(f"{HF_USER}/{PEFT_MODEL_ID}", revision=REVISION)
print(PEFT_MODEL_ID, config.base_model_name_or_path)

# load base LLM model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    device_map="auto",
    trust_remote_code=True,
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Load the Lora model
model = PeftModel.from_pretrained(model, f"{HF_USER}/{PEFT_MODEL_ID}", revision=REVISION, device_map="auto")
model.eval()
print("Peft model loaded")

falcon-7b-4bit-005-gender-debias-spanish vilsonrodrigues/falcon-7b-instruct-sharded


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Peft model loaded


In [3]:
generation_config = model.generation_config
generation_config.num_return_sequences = 1
#generation_config.temperature = 0.7
generation_config.do_sample = False
#generation_config.top_p = 0.5
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [6]:
%%time
# Wall time: 3.29 s
prompt = f"""
<human>: ¿Puedes reescribir el siguiente texto sin sesgo de género?"
Estimados estudiantes: Los alumnos que inician las clases este mes, deben inscribirse en alumnos.uchile.cl
<assistant>:
""".strip()

encoding = tokenizer(prompt, return_tensors="pt").to(device)
input_parts = prompt.split("\n")
prompt_len = len(
    tokenizer(
        " ".join(input_parts[1:-1] if "<assistant>" in prompt else input_parts[1:]),
        return_tensors="pt"
    ).to(device).input_ids[0]
)
generation_config.max_new_tokens = int(prompt_len * 1.5)
print(generation_config.max_new_tokens, prompt_len)

with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config,
  )

43 29
CPU times: user 2.46 s, sys: 86 ms, total: 2.55 s
Wall time: 2.54 s


In [7]:
print(len(outputs[0]))
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

99
<human>: ¿Puedes reescribir el siguiente texto sin sesgo de género?"
Estimados estudiantes: Los alumnos que inician las clases este mes, deben inscribirse en alumnos.uchile.cl
<assistant>: Estimad@s estudiantes: Las/os estudiantes que inician las clases este mes, deben inscribirse en alumnos.uchile.cl
<assistant>: Estimad@s estudiantes


In [8]:
output_proc = [
    o
    for o in list(set(
        tokenizer.decode(outputs[0], skip_special_tokens=True).split("<assistant>: ")[1].split("\n")
    ))
    if o
]

In [9]:
%%time
print("Prompt:")
print(prompt)
print("\nFine tuned model generation:")
print(max(output_proc, key=len))

Prompt:
<human>: ¿Puedes reescribir el siguiente texto sin sesgo de género?"
Estimados estudiantes: Los alumnos que inician las clases este mes, deben inscribirse en alumnos.uchile.cl
<assistant>:

Fine tuned model generation:
Estimad@s estudiantes: Las/os estudiantes que inician las clases este mes, deben inscribirse en alumnos.uchile.cl
CPU times: user 28 µs, sys: 15 µs, total: 43 µs
Wall time: 47.4 µs
