In [2]:
import torch
import time

Installing the required libraries

In [None]:
pip uninstall -y transformers

In [None]:
pip install transformers

In [None]:
pip install tensorflow --upgrade

In [None]:
pip install rouge-score

In [None]:
pip install evaluate

In [None]:
pip install datasets

Newer version of peft are generate adapter_model.safetensors instead of adapter_model.bin, which is not suitable for loading the trained model in most systems, so I am uninstalling the old peft module if it exists and downloading the 0.6.2 version

In [9]:
pip uninstall -y peft

[0m

In [None]:
pip install peft==0.6.2 

Some systems have an older version of accelerate installed, so I am uninstalling it and reinstalling the newer version below

In [None]:
pip uninstall -y accelerate 

In [None]:
pip install accelerate

Loading the base model and creating a tokenizer from the AutoTokenizer object

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

model_name = 'google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name)



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

# NOTE:
In the **tokenize function**, it is advised to name the input ids as **'input_ids'** and output or the label as **'labels'** because the model is able to process only these two parameters, and it gave me errors when I try to name it something else, but this may not be same issue in the future, as there may by updates.

In [14]:
def tokenize_function(example):
    start_prompt = 'Summarize the following\n'
    end_prompt = "\n Answer:"
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["output"]]

    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True,
                                     return_tensors='pt').input_ids
    example['labels'] = tokenizer(example['instruction'], padding="max_length", truncation=True,
                                  return_tensors='pt').input_ids



    return example

In [15]:
from datasets import load_dataset

In [16]:
dataset = load_dataset("causal-lm/finance")

Downloading metadata:   0%|          | 0.00/807 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/21.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/62020 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6892 [00:00<?, ? examples/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 62020
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 6892
    })
})

In [18]:
dataset = dataset.remove_columns(['input'])

In [19]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/62020 [00:00<?, ? examples/s]

Map:   0%|          | 0/6892 [00:00<?, ? examples/s]

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input_ids', 'labels'],
        num_rows: 62020
    })
    validation: Dataset({
        features: ['instruction', 'output', 'input_ids', 'labels'],
        num_rows: 6892
    })
})

In [21]:
tokenized_datasets = tokenized_datasets.remove_columns(['instruction', 'output'])

In [22]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"], # q and v are the important attention layers to train on, this information can be extracted by printing the pre trained model
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)


In [23]:
peft_model = get_peft_model(original_model,
                            lora_config)

In [24]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [25]:
from transformers import TrainingArguments, Trainer

In [28]:
output_dir = f'/content/gdrive/MyDrive/finance_peft_training -{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=6, # if the batch size is too big, then 'cuda out of memory error' may arise for machines with small memory
    learning_rate=1e-3,
    num_train_epochs=40,
    logging_steps=1,
    max_steps=40,
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [29]:
peft_trainer.train()

Step,Training Loss
1,41.75
2,40.75
3,38.75
4,36.0
5,35.0
6,33.0
7,28.375
8,26.0
9,23.5
10,21.875


TrainOutput(global_step=40, training_loss=13.0875, metrics={'train_runtime': 70.8041, 'train_samples_per_second': 3.39, 'train_steps_per_second': 0.565, 'total_flos': 166950957219840.0, 'train_loss': 13.0875, 'epoch': 0.0})

Saving the Model locally

In [30]:
peft_model_path="/content/gdrive/MyDrive/peft-finance-checkpoint"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('/content/gdrive/MyDrive/peft-finance-checkpoint/tokenizer_config.json',
 '/content/gdrive/MyDrive/peft-finance-checkpoint/special_tokens_map.json',
 '/content/gdrive/MyDrive/peft-finance-checkpoint/tokenizer.json')

Loading and running the original model and peft trained model in cpu

In [31]:
from peft import PeftModel, PeftConfig


model_name = 'google/flan-t5-base'

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       '/content/gdrive/MyDrive/peft-finance-checkpoint',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [32]:
from transformers import GenerationConfig

In [33]:
peft_model = peft_model.to('cpu')
original_model = original_model.to('cpu')

In [35]:
content = 'The market is currently navigating through a phase of uncertainty. Despite a persistent dominance by growth trades, overall economic growth expectations remain subdued. The forthcoming economic data, particularly jobless claims and nonfarm payroll figures, are anticipated to be critical in shaping market sentiments.'


In [39]:
prompt = f"""
Summarize the following\n

{content}

\nAnswer:"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=350, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

print('--------------------------------------------------')
print(f'BASELINE HUMAN SUMMARY:\n{baseline_context}')
print('--------------------------------------------------')


peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=350, num_beams=1, do_sample=True, temperature=0.5))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)


print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print('--------------------------------------------------')
print(f'PEFT MODEL: {peft_model_text_output}')

--------------------------------------------------
BASELINE TEXT:
There have been a number of significant technological advancements in the last year. In particular, artificial intelligence and robots have gained much attention as their capabilities have become increasingly sophisticated. Virtual assistants, autonomous cars and home automation systems are just a few of the recent advances that have made an impact on our daily lives. Additionally, quantum computing is another rapidly emerging technology that has the potential to revolutionize the computing industry.
--------------------------------------------------
ORIGINAL MODEL:
The outlook for the market is largely unchanged.
--------------------------------------------------
PEFT MODEL: Despite the recent strong performance, the market remains a little jittery in the wake of the recent economic growth.
