# prefixtune: default program

In [1]:
from default import *
import os, sys

  from .autonotebook import tqdm as notebook_tqdm


## Run the default solution on small

In [2]:
# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

basemodel = 'distilgpt2'
table_to_text = TableToText("peft", basemodel=basemodel)

# Plain pretrained base model, moved to the selected device in the same step.
model = AutoModelForCausalLM.from_pretrained(basemodel).to(device)

# Decode the small input file and print the generated lines, one per row.
decoder_output = table_to_text.decode(model, '../data/input/small.txt')
print(*decoder_output, sep="\n")

10it [00:07,  1.30it/s]

0||  ___________   A new book is coming out.   A new book is coming out.  A new book is coming out. A new book is coming out. A new book is coming out. A new
1||  ____________    I’ve been working on this project for a while now, and I’ve been working on this project for a while now, and I’ve been working on this project for a while now,
2||  __________________________________________   A few years ago, I was going to make a list of the most important things you will need to know to know about Bitcoin and Bitcoin. Bitcoin is the world's first digital currency.  Bitcoin is a
3||  __________________________________________________________________________    The U.S. Department of Homeland Security (DHS) announced today that the Department of Homeland Security (DHS) has approved the release of a new report released by the Department of Homeland Security (DHS
4||  ____________________________________________   If you've been in the news for the past few years, you've been wondering what's 




Ignore the warnings from the transformers library. They are expected to occur.

## Evaluate the default output

In [3]:
import sacrebleu

# Module-level scorer that compute_bleu calls once per output sentence.
# NOTE(review): effective_order=True is the setting sacrebleu recommends for
# sentence-level BLEU -- confirm against the sacrebleu documentation.
bleu = sacrebleu.metrics.BLEU(effective_order=True)

def compute_bleu(references, output_data):
    """Return the mean sentence-level BLEU score of the outputs against their references.

    Args:
        references: mapping from a source id to the list of reference strings
            for that id.
        output_data: sequence of rows in which ``row[0]`` is the source id and
            ``row[1]`` is the generated text to score.

    Returns:
        The average of ``bleu.sentence_score(...).score`` over all rows.
        0.0 is returned when there are no rows to score, or when the number of
        rows differs from the number of reference ids.

    Raises:
        KeyError: if a row's source id is not a key of ``references``.
    """
    # An empty output would otherwise pass the length check against an empty
    # reference set and divide by zero; a length mismatch keeps its original
    # meaning of "cannot be scored" and is reported as 0.0 rather than raised.
    if len(output_data) == 0 or len(references) != len(output_data):
        return 0.0

    total_score = sum(
        (bleu.sentence_score(row[1], references[row[0]]).score for row in output_data),
        0.0,
    )
    return total_score / len(output_data)

In [4]:
# Join the decoded lines into one text blob; it is split back into lines below.
output = "\n".join(decoder_output)

# Collect, for every source id, all reference sentences listed for it.
# Each non-empty line of the reference file must split into exactly three
# '||'-separated fields: the id, a field that is ignored, and the reference text.
references = {}
with open('../data/reference/small.out', 'r') as ref:
    ref_data = [row for row in ref.read().splitlines() if row]
for line in ref_data:
    src_id, _, suggested_reference = line.split('||')
    references.setdefault(src_id, []).append(suggested_reference)

# Split every non-empty output line on '||' and keep at most as many rows as
# there are reference lines.
output_data = [row.split('||') for row in output.splitlines() if row][:len(ref_data)]

print(f"bleu score: {compute_bleu(references, output_data)}")

bleu score: 0.9298078296758284


See `bleu.py`

## Documentation

We used the `PeftModel` class from the Hugging Face `peft` library which, according to the documentation (https://huggingface.co/docs/peft/package_reference/peft_model), is the base model class for specifying the base Transformer model and the configuration used to apply a PEFT method to it.
PEFT stands for Parameter-Efficient Fine-Tuning; the `peft` library is a framework that supports the different fine-tuning methods in this category. The implementation is as follows:

In [5]:
from peft import get_peft_model, PrefixTuningConfig, PeftModel,  TaskType, PeftConfig

def train(self):
    """Prefix-tune the base model on the training split, saving after every epoch.

    The pretrained causal LM named by ``self.basemodel`` is wrapped with a PEFT
    prefix-tuning configuration built from ``self.virtualtokens`` and
    ``self.prefixprojection``. It is optimised with AdamW at ``self.lr`` under a
    linear schedule with zero warmup steps for ``self.epochs`` epochs, and
    ``save_pretrained`` is called at the end of each epoch. Intermediate saves
    carry the epoch index in their name; the save for the last epoch does not.
    """
    loaders = self.get_data(splits=("train", ))
    base_model = AutoModelForCausalLM.from_pretrained(self.basemodel)

    # Wrap the base model for prefix tuning in training (non-inference) mode.
    prefix_config = PrefixTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        prefix_projection=self.prefixprojection,
        inference_mode=False,
        num_virtual_tokens=self.virtualtokens,
    )
    model = get_peft_model(base_model, prefix_config)
    # model.print_trainable_parameters() is useful while debugging, but leave it
    # disabled for real runs: it pollutes the output produced for dev and test.

    optimizer = torch.optim.AdamW(model.parameters(), lr=self.lr)
    train_batches = loaders["train"]
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_batches) * self.epochs,
    )
    model = model.to(device)

    # Training mode is switched on once, before the first epoch.
    model.train()
    final_epoch = self.epochs - 1
    for epoch in range(self.epochs):
        for batch in tqdm(train_batches):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            loss = model(**on_device).loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # The last epoch is saved without an epoch number in the filename.
        epoch_tag = '' if epoch == final_epoch else str(epoch)
        model.save_pretrained(self.modelfile + epoch_tag + self.modelsuffix)

Since the model does not need to be switched into training mode at every step, the `model.train()` call has been moved outside the loop.

We load the pretrained model with the `AutoModelForCausalLM.from_pretrained` method, then create the appropriate config object by calling `PrefixTuningConfig` with the relevant parameters. With these we can instantiate a `PeftModel` (via `get_peft_model`), which is a wrapper around the loaded model. This wrapper freezes the model weights and adds some layers depending on the passed config, which in this case implements prefix tuning.

The model output also contains the cross-entropy loss, which can be backpropagated through the network.

Because of limited GPU memory, we also reduced the batch size to 12.

Finally, after training, the information about the additional layers and the structure created around the original model is saved in a predefined directory. We can reload it by calling the following methods:

In [6]:
basemodel = 'distilgpt2'
modelfile, modelsuffix = '../data/peft', '.pt'
# Path that train() passed to save_pretrained for the final epoch.
peft_checkpoint = modelfile + modelsuffix

# PEFT configuration stored alongside the checkpoint.
config = PeftConfig.from_pretrained(peft_checkpoint)

# Load the base model again and wrap it with the saved prefix-tuning weights.
model = AutoModelForCausalLM.from_pretrained(basemodel)
model = PeftModel.from_pretrained(model, peft_checkpoint)
model.to(device)

PeftModelForCausalLM(
  (base_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-5): 6 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (prompt_encoder): Modu

## Analysis

Just implementing prefix fine-tuning is not enough: the model is not able to produce relevant information, as we can see in the results on the small test set:

    0||  The Alimentum is located in the riverside area near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city
and the BLEU score improved to 13.648.

In [10]:
# Decode the small input file with the current `model` and print the generated lines.
decoder_output = table_to_text.decode(model, '../data/input/small.txt')
print(*decoder_output, sep="\n")

10it [00:05,  1.96it/s]

0||  The Alimentum is located in the riverside area near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city
1||  Alimentum is a restaurant located near the riverside near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city
2||  Alimentum is a place in the city centre near the riverside near the riverside near the riverside near the riverside near the riverside near the riverside near the riverside near the riverside near the riverside near the riverside
3||  Alimentum is located near Burger King in the riverside area near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city centre near the city ce




In [11]:
# Join the decoded lines into one text blob; it is split back into lines below.
output = "\n".join(decoder_output)

# Collect, for every source id, all reference sentences listed for it.
# Each non-empty line of the reference file must split into exactly three
# '||'-separated fields: the id, a field that is ignored, and the reference text.
references = {}
with open('../data/reference/small.out', 'r') as ref:
    ref_data = [row for row in ref.read().splitlines() if row]
for line in ref_data:
    src_id, _, suggested_reference = line.split('||')
    references.setdefault(src_id, []).append(suggested_reference)

# Split every non-empty output line on '||' and keep at most as many rows as
# there are reference lines.
output_data = [row.split('||') for row in output.splitlines() if row][:len(ref_data)]

print(f"bleu score: {compute_bleu(references, output_data)}")

bleu score: 13.648864452792433


Compared to the default settings, the output is no longer irrelevant, but it contains repeating n-grams and incomplete information.
So we changed `prefixprojection` to True.
This mildly improved the BLEU score to 14.79, but the repeating n-grams were still there. So, without retraining, we added `no_repeat_ngram_size=2` to the `generate` call; the BLEU score dropped to 8.44 on the small test set, and irrelevant content came back into the output. For example:
    0||  Alimentums in the city center is located near the riverside. It is a child friendly place with a customer rating of 5.  The children friendly atmosphere with the price range of £20-£ range. Located near to the river, Al

So we thought that 50 new tokens might be too many. In the next step we decreased `max_new_tokens` and tested again. This almost doubled the BLEU score, to 15.18.

The next step was to increase the beam width: we increased it to 10 and, at the same time, increased the temperature so that the model could generate more combinations. This also increased the BLEU score, up to 21.
We also tested different values of `virtualtokens`, but it does not make much of a difference.

Furthermore, when we allow a large number of new tokens and forbid repeated n-grams entirely, the model starts to generate meaningful but unrelated text, so the question is which combination of these two settings gives a good result. After some trial and error, 30 new tokens with no repeated 6-grams (`no_repeat_ngram_size=6`) seems to be a good operating point, which resulted in a BLEU score of 40 on the dev set and 33 on the small test set.

### Here are the final `generate` method parameters:

In [None]:
## Do not run this cell: it is an excerpt shown for illustration only.
## It references `self`, `inputs` and `num_sequences`, none of which is
## defined in the notebook cells shown here.
outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens= 30,  # generation length settled on in the analysis above
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer_pad_token_id,
            do_sample=True,  # sampling is combined with beam search here
            num_beams=10,  # beam width chosen in the analysis above
            top_p=0.9,
            temperature= 1.5,  # raised together with the beam width, per the analysis above
            no_repeat_ngram_size= 6,  # the "no repeated 6-gram" setting from the analysis above
            num_return_sequences=num_sequences
        )

In [12]:
from prefixtune import TableToText
# Rebuild the decoding helper from the TableToText class imported from
# prefixtune, then decode the small input file again and print the result.
table_to_text = TableToText("peft", basemodel=basemodel)
decoder_output = table_to_text.decode(model, '../data/input/small.txt')
print(*decoder_output, sep="\n")

10it [00:04,  2.22it/s]

0||  The Alimentum is located in the city centre. It is located in the riverside area near the riverside area. It is located near the
1||  Alimentum is located near Burger King in the riverside area near the riverside area near Burger King. It is located near Burger King. It
2||  The Alimentum is located in the city centre near the riverside near the riverside. Located near the riverside, Alimentum is a
3||  Alimentum is located near Burger King in the riverside area near Burger King. It is located near Burger King and has a customer rating of 5
4||  The Alimentum is located in the riverside area near the riverside near the riverside. The Alimentum is located near the riverside
5||  Alimentum is located near Burger King in the riverside near Burger King. It is located near Burger King, located near the riverside near Burger
6||  Alimentum is located in the riverside near the riverside near the city centre. Alimentum is located near the city centre near the city
7||  Alimentum is locate




In [13]:
# Join the decoded lines into one text blob; it is split back into lines below.
output = "\n".join(decoder_output)

# Collect, for every source id, all reference sentences listed for it.
# Each non-empty line of the reference file must split into exactly three
# '||'-separated fields: the id, a field that is ignored, and the reference text.
references = {}
with open('../data/reference/small.out', 'r') as ref:
    ref_data = [row for row in ref.read().splitlines() if row]
for line in ref_data:
    src_id, _, suggested_reference = line.split('||')
    references.setdefault(src_id, []).append(suggested_reference)

# Split every non-empty output line on '||' and keep at most as many rows as
# there are reference lines.
output_data = [row.split('||') for row in output.splitlines() if row][:len(ref_data)]

print(f"bleu score: {compute_bleu(references, output_data)}")

bleu score: 33.85715455043608
