# prefixtune: default program

In [1]:
from default import *
import os, sys

  from .autonotebook import tqdm as notebook_tqdm


## Run the default solution on small

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# Decode the small input file with the pretrained base model loaded as-is
# (no adapter is attached to it in this cell).
basemodel = 'distilgpt2'
table_to_text = TableToText("peft", basemodel=basemodel)
model = AutoModelForCausalLM.from_pretrained(basemodel).to(device)
decoder_output = table_to_text.decode(model, '../data/input/small.txt')

# decoder_output is a sequence of strings; show one per line.
decoded_text = "\n".join(decoder_output)
print(decoded_text)

10it [00:02,  3.46it/s]

0||  ___________________________________________________________________________    This is the latest in a series of articles on the evolution of the internet in the United States and Europe.  In the United States, the United States has become the world's largest internet service provider. 
1||  __________________________________________________________________________    We’ve been working hard to improve the quality of our products for the past few years. We’ve been working hard to improve the quality of our products for the past few years. We
2||  _______________________________________________   The following is a blog post on my blog. The following is a blog post on my blog. The following is a blog post on my blog. The following is a blog post on my blog. 
3||  中国自己自己自己自己自己自己自己自己自己�
4||  _____________________   The following is a list of the most common and most common reasons why you should buy a laptop or laptop.  1. The first thing you need to know is that you need to buy a la




Ignore the warnings from the transformers library. They are expected to occur.

## Evaluate the default output

In [6]:
import sacrebleu

# Module-level BLEU scorer; compute_bleu() reads this global and calls its
# sentence_score() method once per output line.
# NOTE(review): effective_order=True is the setting sacreBLEU's documentation
# recommends for sentence-level scoring — confirm against the installed version.
bleu = sacrebleu.metrics.BLEU(effective_order=True)

def compute_bleu(references, output_data):
    """Return the mean sentence-level BLEU score of the system output.

    Uses the module-level ``bleu`` scorer.

    Args:
        references: Mapping from source id to the list of reference strings
            for that source.
        output_data: Sized sequence of records whose element 0 is a source id
            (a key of ``references``) and whose element 1 is the hypothesis
            text.

    Returns:
        The average of ``bleu.sentence_score(...).score`` over all records, or
        ``0.0`` when there is nothing to score or when the number of records
        differs from the number of entries in ``references``.

    Raises:
        KeyError: If a record's source id is missing from ``references``.
    """
    record_count = len(output_data)
    # Guard the empty case explicitly: with zero records the length check
    # alone would pass for empty references and the average would divide by
    # zero.
    if record_count == 0 or len(references) != record_count:
        return 0.0

    sentence_scores = [
        bleu.sentence_score(record[1], references[record[0]]).score
        for record in output_data
    ]
    return sum(sentence_scores) / record_count

# Flatten the decoder output into one newline-separated string so it is parsed
# the same way as the reference file below.
output = "\n".join(decoder_output)

# references maps each source id to every reference text listed for it.
references = {}
ref_data = []
# Decode explicitly as UTF-8 instead of relying on the platform default, which
# is not UTF-8 everywhere.
# NOTE(review): assumes the reference file is UTF-8 encoded — confirm.
with open('../data/reference/small.out', 'r', encoding='utf-8') as ref:
    ref_data = [line for line in ref.read().splitlines() if line]
    for line in ref_data:
        # Each non-empty line has three '||'-separated fields; only the first
        # (source id) and last (reference text) are used. maxsplit=2 keeps a
        # literal '||' inside the reference text from breaking the unpacking.
        src_id, _, suggested_reference = line.split('||', 2)
        references.setdefault(src_id, []).append(suggested_reference)

# Each output line is "<id>||<text>"; maxsplit=1 keeps the whole hypothesis
# even if the generated text itself contains '||'.
output_data = [line.split('||', 1) for line in output.splitlines() if line]
# Never score more output records than there are reference lines.
output_data = output_data[:len(ref_data)]

print(f"bleu score: {compute_bleu(references, output_data)}")

bleu score: 1.0794768613240795


See `bleu.py`

## Documentation

We used the `PeftModel` class from the Hugging Face `peft` library which, according to the documentation (https://huggingface.co/docs/peft/package_reference/peft_model), is the base model class for specifying the base Transformer model and the configuration used to apply a PEFT method to it.
PEFT stands for Parameter-Efficient Fine-Tuning, a framework that supports several fine-tuning methods in this category. The implementation is as follows:

In [None]:
from peft import get_peft_model, PrefixTuningConfig, PeftModel,  TaskType, PeftConfig

def train(self):
    """Prefix-tune ``self.basemodel`` on the training split, saving after every epoch.

    The pretrained causal LM is wrapped with a PEFT prefix-tuning configuration
    built from ``self.virtualtokens`` and ``self.prefixprojection``. It is then
    optimized with AdamW (learning rate ``self.lr``) under a linear schedule
    with zero warmup steps for ``self.epochs`` epochs.

    ``save_pretrained`` is called at the end of every epoch. Intermediate
    checkpoints are named ``modelfile + <epoch index> + modelsuffix``; the
    final epoch omits the index.

    Relies on the module-level ``device``.
    """
    data_loaders = self.get_data(splits=("train", ))
    base_model = AutoModelForCausalLM.from_pretrained(self.basemodel)

    # Wrap the base model so that prefix tuning is applied to it.
    prefix_config = PrefixTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        num_virtual_tokens=self.virtualtokens,
        prefix_projection=self.prefixprojection,
    )
    model = get_peft_model(base_model, prefix_config)

    optimizer = torch.optim.AdamW(model.parameters(), lr=self.lr)
    train_loader = data_loaders["train"]
    # The schedule spans every optimizer step across all epochs.
    total_steps = len(train_loader) * self.epochs
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    model = model.to(device)
    # Training mode is set once, before the loop, rather than at every step.
    model.train()

    final_epoch = self.epochs - 1
    for epoch in range(self.epochs):
        for batch in tqdm(train_loader):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            model(**batch).loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # The last epoch is saved without an epoch number in the filename.
        epoch_str = '' if epoch == final_epoch else str(epoch)
        model.save_pretrained(self.modelfile + epoch_str + self.modelsuffix)

Since the model does not need to be put into training mode at every step, the `model.train()` call has been moved outside the loop.

We load the pretrained model with the `AutoModelForCausalLM.from_pretrained` method, then create a suitable config object by calling `PrefixTuningConfig` with the appropriate parameters. With these we can instantiate (via `get_peft_model`) a `PeftModel`, which is a wrapper around the loaded model. This wrapper freezes the model weights and adds some layers depending on the config passed in, which in this case implements prefix tuning.

Also, the model output contains the loss computed with cross-entropy, which we can backpropagate through the network.

Finally, after training, related information about additional layers and structures created around the original model is saved in a predefined directory. We can reload this information by calling the following methods:


In [None]:
# Path of the saved adapter: the model file name plus the configured suffix.
adapter_path = modelfile + opts.modelsuffix

# Read the stored PEFT configuration, then attach the saved adapter to a
# freshly loaded copy of the base model.
config = PeftConfig.from_pretrained(adapter_path)
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(opts.basemodel),
    adapter_path,
)

Also, because of limited GPU memory, we reduced the batch size to 12.

## Analysis

Just implementing prefix fine-tuning is not enough: the model is still not able to produce all the relevant information, as we can see in the results on the small test set:

    0||  The Alimentum is located in the riverside area near the riverside area near the riverside area near the riverside area near the riverside area near the riverside area near the riverside area near the riverside area near the riverside
    
By implementing prefix fine-tuning, the BLEU score improved to 14.234. Unlike with the default settings, the output is no longer irrelevant, but it suffers from repeated n-grams and incomplete information.
So we changed `prefixprojection` to True.
This mildly improved the BLEU score to 14.79, but the repeating n-grams still existed. So, without retraining, we added `no_repeat_ngram_size=2` to the `generate` method; the BLEU score changed to 8.44 on the small test set, and irrelevant content came back into the output. For example:

    0||  Alimentums in the city center is located near the riverside. It is a child friendly place with a customer rating of 5.  The children friendly atmosphere with the price range of £20-£ range. Located near to the river, Al

So, we thought that maybe 50 new tokens were too many. In the next step we decreased `max_new_tokens` and tested again. This almost doubled the BLEU score, to 15.18.

The next step was to increase the beam width: we increased it to 10 and, at the same time, increased the temperature so that the model could generate more combinations. This also increased the BLEU score, up to 21%.
We also tested different values of `virtualtokens`, but this did not make much of a difference.

Furthermore, when the number of new tokens is large and we reduce the probability of repeating n-grams to 0, the model starts to generate meaningful but unrelated text, so the question is which combination of these two settings gives a good result. After some trial and error, 30 new tokens with no repeated 6-grams (`no_repeat_ngram_size=6`) seems to be a good operating point: it resulted in a BLEU score of 40 on the dev set and 33 on the small test set.

### Here are the final `generate` method parameters:

In [None]:
# Tunable decoding settings, grouped so the values chosen during the analysis
# are visible in one place.
decoding_options = dict(
    max_new_tokens=30,
    do_sample=True,
    num_beams=10,
    top_p=0.9,
    temperature=1.5,
    no_repeat_ngram_size=6,
)

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    eos_token_id=self.tokenizer.eos_token_id,
    pad_token_id=self.tokenizer_pad_token_id,
    num_return_sequences=num_sequences,
    **decoding_options,
)