In [1]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import prepare_model_for_kbit_training

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/rawat.m/.conda/envs/llm/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda112.so
CUDA SETUP: CUDA runtime path found: /shared/centos7/cuda/11.2/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 112
CUDA SETUP: Loading binary /home/rawat.m/.conda/envs/llm/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda112.so...


  warn(msg)


In [2]:
huggingface_dataset_name = 'knkarthick/dialogsum'
dataset = load_dataset(huggingface_dataset_name)
dataset

Found cached dataset csv (/home/rawat.m/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c07c4cf4362c223c/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 3/3 [00:00<00:00, 610.05it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


In [4]:
model_name = './falcon-7b-instruct'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map='auto'
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.41s/it]


In [6]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [13]:
train_dataset = dataset['train']
train_dataset = train_dataset.remove_columns(['id', 'topic'])
train_dataset

Dataset({
    features: ['dialogue', 'summary'],
    num_rows: 12460
})

In [14]:
# generate prompt
def generate_prompt(data_point):
    prompt = f""" 
    summarize the following conversation:
    
    dialogue:
    {data_point['dialogue']}
    
    summary:
    {data_point['summary']}
    """
    
    return prompt

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt


In [15]:
data = train_dataset.shuffle().map(generate_and_tokenize_prompt)

                                                                   

In [16]:
data

Dataset({
    features: ['dialogue', 'summary', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12460
})

In [17]:
output_dir = "./results"

In [30]:
%load_ext tensorboard
%tensorboard --logdir results/run

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [18]:
from peft import LoraConfig, get_peft_config, get_peft_model

lora_alpha = 32
lora_dropout = 0.05
lora_r = 16

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value"
    ]
)

In [19]:
model = get_peft_model(model, peft_config)

In [20]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [12]:
!pip install -q -U trl

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [33]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [21]:
from transformers import Trainer
import transformers
trainer = Trainer(model = model,
                  train_dataset = data, 
                  args=training_arguments,
                  data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False))

In [22]:
model.config.use_cache = False
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/rawat.m/.netrc


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.9987
20,1.8374
30,1.6329
40,1.453
50,1.4802
60,1.7571
70,1.5758
80,1.4583
90,1.4174
100,1.3591




TrainOutput(global_step=500, training_loss=1.494054765701294, metrics={'train_runtime': 1489.4019, 'train_samples_per_second': 5.371, 'train_steps_per_second': 0.336, 'total_flos': 4.17149134966487e+16, 'train_loss': 1.494054765701294, 'epoch': 0.64})

In [27]:
#load train model
from peft import PeftConfig, PeftModel
peft_model = "./results/checkpoint-500"
config = PeftConfig.from_pretrained(peft_model)

model = AutoModelForCausalLM.from_pretrained('./falcon-7b-instruct',
                                            return_dict=True,
                                             quantization_config=bnb_config,
                                             device_map='auto',
                                             trust_remote_code=True
                                            )

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.53s/it]


In [28]:
tokenizer = AutoTokenizer.from_pretrained('./falcon-7b-instruct/')
tokenizer.pad_token = tokenizer.eos_token

In [30]:
model = PeftModel.from_pretrained(model, peft_model)

In [31]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.2
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id


In [39]:
test_dialog = dataset['test']['dialogue'][1]

In [40]:
%%time
prompt = f"""
Summarize the following conversaton:

conversation:
{test_dialog}

summary:
"""
device = torch.device('cuda')
encoding = tokenizer(prompt, return_tensors='pt').to(device)

with torch.inference_mode():
    outputs = model.generate(input_ids = encoding.input_ids,
                            attention_mask = encoding.attention_mask,
                            generation_config=generation_config)

CPU times: user 18 s, sys: 8.19 s, total: 26.2 s
Wall time: 26.2 s


In [41]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Summarize the following conversaton:

conversation:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes 