In [1]:
!nvidia-smi

Mon Dec 19 00:21:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    33W /  70W |      0MiB / 15360MiB |     10%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [None]:
#Dataset class for CodeT5 APPS training data
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from torch.utils import data
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

class APPSDataset(data.Dataset):
    """APPS training data expanded to one item per (problem, solution) pair.

    Problems whose prompt tokenizes to more than ``MAX_TOKENS`` ids are dropped,
    as are individual solutions that exceed the same limit.
    """

    # Longest prompt / solution (in tokenizer ids) that is kept.
    MAX_TOKENS = 512

    @staticmethod
    def _build_prompt(question, starter_code):
        """Return the model prompt for one problem.

        The two prompt variants differ only in the format hint: problems without
        starter code read from standard input, the others are call-based.
        """
        if starter_code == '':
            format_hint = "\nUse Standard Input Format\n\nANSWER:\n"
        else:
            format_hint = "\nUse Call-Based Format\n\nANSWER:\n"
        return "\nQUESTION:\n" + question + "\n" + starter_code + "\n" + format_hint

    def __init__(self, data, tokenizer):
        """
        Args:
            data: Iterable of problem records. Each record is read through the
                keys "question", "starter_code", "solutions", "input_output"
                and "difficulty". "solutions" is a JSON-encoded list of
                solution strings (or an empty string when there are none); an
                already-decoded list is accepted as well.
            tokenizer: Callable used as ``tokenizer(text, return_tensors="pt")``;
                the result must expose ``input_ids`` (and ``attention_mask``
                for ``__getitem__``).
        """
        self.tokenizer = tokenizer
        expanded_data = []
        for sample in data:
            input_string = self._build_prompt(sample["question"], sample["starter_code"])
            if len(self.tokenizer(input_string, return_tensors="pt").input_ids[0]) > self.MAX_TOKENS:
                continue

            # Decode into a local variable: writing the decoded list back into
            # `sample` would mutate the caller's records and make a second
            # construction from the same records fail inside json.loads.
            solutions = sample["solutions"]
            if isinstance(solutions, str) and len(solutions) > 0:
                solutions = json.loads(solutions)

            for solution in solutions:
                if len(self.tokenizer(solution, return_tensors="pt").input_ids[0]) > self.MAX_TOKENS:
                    continue
                expanded_data.append({
                    "model_input": input_string,
                    "question": sample["question"],
                    "solution": solution,
                    "starter_code": sample["starter_code"],
                    "input_output": sample["input_output"],
                    "difficulty": sample["difficulty"],
                })
        self.data = expanded_data

    def __len__(self):
        """Number of (problem, solution) pairs kept after filtering."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return its source/target ids and masks.

        Both texts are tokenized with ``padding='max_length'``; no truncation is
        requested because over-long texts were already filtered in ``__init__``.
        """
        if torch.is_tensor(idx):
            # Convert a scalar tensor index to a plain Python int.
            idx = idx.tolist()

        record = self.data[idx]

        ids_object = self.tokenizer(record["model_input"], return_tensors="pt", padding='max_length')
        input_ids = ids_object.input_ids[0]
        attention_mask = ids_object.attention_mask[0]

        output_ids_object = self.tokenizer(record["solution"], return_tensors="pt", padding='max_length')
        output_ids = output_ids_object.input_ids[0]
        output_attention_mask = output_ids_object.attention_mask[0]

        return {
            "source_ids": input_ids,
            "source_mask": attention_mask,
            "target_ids": output_ids,
            "target_mask": output_attention_mask,
        }

In [16]:
#Dataset class for CodeT5 APPS test data
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from torch.utils import data
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

class APPSTestDataset(data.Dataset):
    """APPS test problems of a single difficulty, as CodeT5 prompts."""

    def __init__(self, data, tokenizer, problem_type):
        """
        Args:
            data: Iterable of problem records read through the keys
                "problem_id", "question", "starter_code" and "difficulty".
            tokenizer: Callable used as ``tokenizer(text, return_tensors="pt")``.
            problem_type: Only records whose "difficulty" equals this value
                are kept.
        """
        self.tokenizer = tokenizer
        kept = []
        for record in data:
            # The format hint depends on whether starter code is provided.
            if record["starter_code"] == '':
                format_hint = "\nUse Standard Input Format\n\nANSWER:\n"
            else:
                format_hint = "\nUse Call-Based Format\n\nANSWER:\n"
            prompt = "\nQUESTION:\n" + record["question"] + "\n" + record["starter_code"] + "\n" + format_hint

            token_count = len(self.tokenizer(prompt, return_tensors="pt").input_ids[0])
            # Keep only prompts of at most 512 ids that match the difficulty.
            if token_count <= 512 and record["difficulty"] == problem_type:
                kept.append({
                    "problem_id": record["problem_id"],
                    "model_input": prompt,
                    "question": record["question"],
                    "starter_code": record["starter_code"],
                    "difficulty": record["difficulty"],
                })
        self.data = kept

    def __len__(self):
        """Number of problems kept after filtering."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize prompt ``idx`` (padded to max length); return ids and mask."""
        if torch.is_tensor(idx):
            idx = idx.numpy()

        encoded = self.tokenizer(self.data[idx]["model_input"], return_tensors="pt", padding='max_length')
        return {"source_ids": encoded.input_ids[0], "source_mask": encoded.attention_mask[0]}

In [17]:
#Dataset class for GPT-2 APPS test data
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from torch.utils import data
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

class GPTAPPSTestDataset(data.Dataset):
    """APPS test problems of a single difficulty, encoded for a GPT-2 model.

    ``tokenizer`` is used only to filter out long prompts during construction;
    items are encoded with ``gpt_tokenizer``.
    """

    def __init__(self, data, tokenizer, gpt_tokenizer, problem_type):
        """
        Args:
            data: Iterable of problem records read through the keys
                "problem_id", "question", "starter_code" and "difficulty".
            tokenizer: Callable used as ``tokenizer(text, return_tensors="pt")``
                to measure the prompt length for the 512-id filter.
            gpt_tokenizer: Object whose ``encode(text, verbose=False)`` is used
                in ``__getitem__``; stored as ``self.tokenizer``.
            problem_type: Only records whose "difficulty" equals this value
                are kept.
        """
        kept = []
        for record in data:
            # The format hint depends on whether starter code is provided.
            if record["starter_code"] == '':
                format_hint = "\nUse Standard Input Format\n\nANSWER:\n"
            else:
                format_hint = "\nUse Call-Based Format\n\nANSWER:\n"
            prompt = "\nQUESTION:\n" + record["question"] + "\n" + record["starter_code"] + "\n" + format_hint

            token_count = len(tokenizer(prompt, return_tensors="pt").input_ids[0])
            if token_count <= 512 and record["difficulty"] == problem_type:
                kept.append({
                    "problem_id": record["problem_id"],
                    "model_input": prompt,
                    "question": record["question"],
                    "starter_code": record["starter_code"],
                    "difficulty": record["difficulty"],
                })
        self.data = kept
        # From here on only the GPT tokenizer is needed.
        self.tokenizer = gpt_tokenizer

    def __len__(self):
        """Number of problems kept after filtering."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode prompt ``idx`` and return it as a (1, n_tokens) LongTensor."""
        if torch.is_tensor(idx):
            idx = idx.numpy()

        token_ids = self.tokenizer.encode(self.data[idx]["model_input"], verbose=False)
        return {"source_ids": torch.LongTensor(token_ids).unsqueeze(0)}

In [None]:
# Load the APPS training split and expand it into (prompt, solution) pairs.
from datasets import load_dataset
ds = load_dataset("codeparrot/apps", split="train")
# The CodeT5 checkpoint is tokenized through RobertaTokenizer in this notebook.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
# APPSDataset tokenizes every prompt and solution once here to filter by length.
train_data = APPSDataset(ds, tokenizer)

In [None]:
# Load the APPS test split and keep only the "interview"-difficulty problems
# whose prompt fits the length filter applied by APPSTestDataset.
from datasets import load_dataset
test_ds = load_dataset("codeparrot/apps", split="test")
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
test_data = APPSTestDataset(test_ds, tokenizer, "interview")

In [None]:
# Same "interview" test problems, but items are encoded with the GPT-2 tokenizer.
from datasets import load_dataset
from transformers import GPT2Tokenizer
gpt_test_ds = load_dataset("codeparrot/apps", split="test")
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# NOTE(review): `tokenizer` is not defined in this cell; it must already exist
# from an earlier cell (the CodeT5 tokenizer) and is used only for length filtering.
gpt_test_data = GPTAPPSTestDataset(gpt_test_ds, tokenizer, gpt_tokenizer, "interview")

In [25]:
len(test_data)

1294

In [17]:
len(gpt_test_data)

664

In [7]:
len(train_data)

86338

The T5 fine-tuner implementation below is adapted in part from: https://www.kaggle.com/code/parthplc/t5-fine-tuning-tutorial/notebook

In [None]:
import pytorch_lightning as pl
class T5FineTuner(pl.LightningModule):
  """Lightning wrapper that fine-tunes a seq2seq model on a tokenized dataset.

  Batches are expected to be dicts with the keys "source_ids", "source_mask",
  "target_ids" and "target_mask" (as produced by APPSDataset).
  """

  def __init__(self, hparams, train_dataset, model):
    """
    Args:
      hparams: Namespace-like object; its attributes are copied into
        ``self.hparams``. The attributes read by this class are
        ``tokenizer_name_or_path``, ``weight_decay``, ``learning_rate``,
        ``adam_epsilon``, ``warmup_steps``, ``train_batch_size``,
        ``eval_batch_size``, ``n_gpu``, ``gradient_accumulation_steps`` and
        ``num_train_epochs``.
      train_dataset: Dataset served by both the train and the val dataloader.
      model: The model to fine-tune; called with input_ids / attention_mask /
        decoder_* / labels keyword arguments.
    """
    super(T5FineTuner, self).__init__()
    self.hparams.update(vars(hparams))
    self.train_dataset = train_dataset
    self.model = model
    self.tokenizer = RobertaTokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    """Always True; queried by LoggingCallback before it logs metrics."""
    return True

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    """Delegate to the wrapped model and return its output unchanged."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Run one forward pass on ``batch`` and return the loss."""
    # Work on a copy so the caller's batch["target_ids"] keeps its real pad ids.
    labels = batch["target_ids"].clone()
    # -100 marks the padding positions as ignored in the loss.
    labels[labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    # With labels supplied, the first element of the model output is the loss.
    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    """Return the training loss (plus a "log" entry kept from the original tutorial)."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def training_epoch_end(self, outputs):
    """Log the mean training loss of the epoch as "avg_train_loss".

    This is the metric the notebook's ModelCheckpoint monitors.
    """
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    self.log("avg_train_loss", avg_train_loss)

  def validation_step(self, batch, batch_idx):
    """Return the loss of one validation batch under the key "val_loss"."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the per-batch validation losses."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    # Kept on the module because train_dataloader() builds the LR scheduler from it.
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None
                     ):
    """Step the optimizer, clear gradients, then advance the LR scheduler."""
    optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    # NOTE(review): self.lr_scheduler only exists after train_dataloader() has run.
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Progress-bar values: running loss and the most recent learning rate."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training loader and, as a side effect, the LR scheduler.

    NOTE(review): relies on configure_optimizers() having set ``self.opt`` first.
    """
    dataloader = data.DataLoader(self.train_dataset, batch_size=self.hparams.train_batch_size, shuffle=True, num_workers=8, pin_memory=True)
    # Optimizer steps per epoch = batches per epoch // accumulation steps.
    steps_per_epoch = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
    )
    # The scheduler expects a step count, so pass an int rather than a float.
    t_total = int(steps_per_epoch * float(self.hparams.num_train_epochs))
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Validation loader.

    NOTE(review): this iterates the *training* dataset, so "val_loss" is not a
    held-out metric — confirm this is intended.
    """
    return data.DataLoader(self.train_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [6]:
import logging
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Writes the trainer's callback metrics to the module logger.

  After a test run the same metrics are also saved to
  ``<output_dir>/test_results.txt``.
  """

  def on_validation_end(self, trainer, pl_module):
    """Log every callback metric once validation has finished."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return

    results = trainer.callback_metrics
    for name in sorted(results):
      # "log" and "progress_bar" are containers, not metrics.
      if name in ["log", "progress_bar"]:
        continue
      logger.info("{} = {}\n".format(name, str(results[name])))

  def on_test_end(self, trainer, pl_module):
    """Log every callback metric after testing and save them to a text file."""
    logger.info("***** Test results *****")
    if not pl_module.is_logger():
      return

    results = trainer.callback_metrics
    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for name in sorted(results):
        if name in ["log", "progress_bar"]:
          continue
        line = "{} = {}\n".format(name, str(results[name]))
        logger.info(line)
        writer.write(line)

In [7]:
# Hyperparameters and paths for the CodeT5 fine-tuning run.
args_dict = {
    # Paths and checkpoints
    "data_dir": "",
    "output_dir": "./NLP_Gen_Sum_Project",
    "model_name_or_path": 'Salesforce/codet5-base',
    "tokenizer_name_or_path": 'Salesforce/codet5-base',
    "max_seq_length": 512,
    # Optimizer and schedule
    "learning_rate": 1e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    # Batching and training length
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 1,
    "gradient_accumulation_steps": 32,
    # Hardware, precision and reproducibility
    "n_gpu": 1,
    "fp_16": False,
    "max_grad_norm": 1.0,
    "seed": 42,
}

In [8]:
import argparse
# Expose the hyperparameter dict with attribute access (args.learning_rate, ...).
args = argparse.Namespace(**args_dict)

# Keep the 5 checkpoints with the lowest "avg_train_loss", the value that
# T5FineTuner.training_epoch_end logs.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="avg_train_loss", mode="min", save_top_k=5
)

# Keyword arguments forwarded to pl.Trainer in the training cell.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # 16-bit precision only when fp_16 is enabled in args_dict.
    precision= 16 if args.fp_16 else 32,
    gradient_clip_val=args.max_grad_norm,
    callbacks=[LoggingCallback(), checkpoint_callback],
)

In [None]:
# Train the CodeT5 model and save its weights for future use.
model_base = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')
trainer = pl.Trainer(**train_params)
model = T5FineTuner(args, train_data, model_base)
# Training resumes from previously saved fine-tuner weights, not from the base checkpoint.
# NOTE(review): this loads from ".../Models/model_codet5_base_APPS.pt" but the save
# below writes to "./NLP_Gen_Sum_Project/model_codet5_base_APPS.pt" — confirm the
# two different paths are intended.
model.load_state_dict(torch.load("./NLP_Gen_Sum_Project/Models/model_codet5_base_APPS.pt"))
trainer.fit(model)
torch.save(model.state_dict(), "./NLP_Gen_Sum_Project/model_codet5_base_APPS.pt")

In [9]:
# Rebuild the fine-tuner for evaluation and restore the saved weights.
model_base = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')
# test_data is passed in the constructor's `train_dataset` slot.
model = T5FineTuner(args, test_data, model_base)
# NOTE(review): this path is relative to the working directory and differs from
# the path the training cell saves to — confirm which file is meant.
model.load_state_dict(torch.load("model_codet5_base_APPS.pt"))

<All keys matched successfully>

In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

START: COPIED FROM https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/generation/utils.py

In [11]:
from transformers import BeamScorer
from typing import List, Optional, Tuple
from collections import UserDict
import warnings
import gc

class CodeBeamSearchScorer(BeamScorer):
    r"""
    [`BeamScorer`] implementing beam search decoding with a code-aware hypothesis score.

    This is the Hugging Face `BeamSearchScorer` with four extra arguments (`lambd`, `gamma`, `new_line_token`,
    `loop_tokens`) that are forwarded to `BeamHypotheses`, where finished hypotheses are scored as
    `sum_logprobs * (1 + lambd * n_new_line_tokens + gamma * n_loop_tokens) / length ** length_penalty`.

    Adapted in part from [Facebook's XLM beam search
    code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
    Reference for the diverse beam search algorithm and implementation [Ashwin Kalyan's DBS
    implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua)
    Args:
        batch_size (`int`):
            Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
        num_beams (`int`):
            Number of beams for beam search. Must be an integer greater than 1.
        device (`torch.device`):
            Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
            allocated.
        length_penalty (`float`, *optional*, defaults to 1.0):
            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
            `length_penalty` < 0.0 encourages shorter sequences.
        lambd (`float`, *optional*, defaults to 0):
            Weight applied to the number of `new_line_token` occurrences in a finished hypothesis.
        gamma (`float`, *optional*, defaults to 0):
            Weight applied to the number of `loop_tokens` occurrences in a finished hypothesis.
        do_early_stopping (`bool`, *optional*, defaults to `False`):
            Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
        num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
            The number of beam hypotheses that shall be returned upon calling
            [`~transformer.BeamSearchScorer.finalize`].
        num_beam_groups (`int`):
            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
            See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
        new_line_token (`int`, *optional*, defaults to 0):
            Token id counted for the `lambd` term.
        loop_tokens (`list`, *optional*):
            Token ids counted for the `gamma` term. `None` (the default) means no loop tokens.
        kwargs:
            Only `max_length` is inspected; passing it triggers a deprecation warning and has no other effect.
    """
    @torch.no_grad()
    def __init__(
        self,
        batch_size: int,
        num_beams: int,
        device: torch.device,
        length_penalty: Optional[float] = 1.0,
        lambd: Optional[float] = 0,
        gamma: Optional[float] = 0,
        do_early_stopping: Optional[bool] = False,
        num_beam_hyps_to_keep: Optional[int] = 1,
        num_beam_groups: Optional[int] = 1,
        new_line_token: Optional[int] = 0,
        loop_tokens: Optional[list] = None,
        **kwargs,
    ):
        # Validate before the values are used, so bad arguments produce the
        # intended ValueError instead of an arithmetic error further down.
        if not isinstance(num_beams, int) or num_beams <= 1:
            raise ValueError(
                f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1,"
                " one should make use of `greedy_search` instead."
            )

        if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0):
            raise ValueError(
                "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be"
                f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}."
            )

        self.num_beams = num_beams
        self.device = device
        self.length_penalty = length_penalty
        self.lambd = lambd
        self.gamma = gamma
        self.do_early_stopping = do_early_stopping
        self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
        self.num_beam_groups = num_beam_groups
        self.group_size = self.num_beams // self.num_beam_groups
        self.new_line_token = new_line_token
        # A fresh list per instance; a `[]` default would be shared by all scorers.
        self.loop_tokens = [] if loop_tokens is None else loop_tokens

        self._is_init = False
        # One n-best container per batch element.
        self._beam_hyps = [
            BeamHypotheses(
                num_beams=self.num_beams,
                length_penalty=self.length_penalty,
                lambd = self.lambd,
                gamma = self.gamma,
                early_stopping=self.do_early_stopping,
                new_line_token = self.new_line_token,
                loop_tokens = self.loop_tokens
            )
            for _ in range(batch_size)
        ]
        self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device)

        if "max_length" in kwargs:
            warnings.warn(
                "Passing `max_length` to BeamSearchScorer is deprecated and has no effect. "
                "`max_length` should be passed directly to `beam_search(...)`, `beam_sample(...)`"
                ", or `group_beam_search(...)`."
            )

    @property
    def is_done(self) -> bool:
        """True once every batch element has been marked as finished."""
        return self._done.all()

    @torch.no_grad()
    def process(
        self,
        input_ids: torch.LongTensor,
        next_scores: torch.FloatTensor,
        next_tokens: torch.LongTensor,
        next_indices: torch.LongTensor,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        beam_indices: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor]:
        """Select the beams that continue to the next step.

        Candidates that end in `eos_token_id` (and rank within the top `group_size`) are stored as finished
        hypotheses; the remaining candidates fill the next step's beams. Returns a `UserDict` with the flattened
        `next_beam_scores`, `next_beam_tokens` and `next_beam_indices`.

        Raises:
            ValueError: if `input_ids` does not match the expected beam size, if a finished batch element is
                inconsistent, or if too few non-EOS candidates are available to fill the beams.
        """
        cur_len = input_ids.shape[-1]
        batch_size = len(self._beam_hyps)
        if not (batch_size == (input_ids.shape[0] // self.group_size)):
            if self.num_beam_groups > 1:
                raise ValueError(
                    f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam "
                    f"size of {self.group_size} is expected by the beam scorer."
                )
            else:
                raise ValueError(
                    f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of "
                    f"{self.group_size} is expected by the beam scorer."
                )

        device = input_ids.device
        next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device)
        next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device)
        next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device)

        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
            if self._done[batch_idx]:
                if self.num_beams < len(beam_hyp):
                    raise ValueError(f"Batch can only be done if at least {self.num_beams} beams have been generated")
                if eos_token_id is None or pad_token_id is None:
                    raise ValueError("Generated beams >= num_beams -> eos_token_id and pad_token have to be defined")
                # pad the batch
                next_beam_scores[batch_idx, :] = 0
                next_beam_tokens[batch_idx, :] = pad_token_id
                next_beam_indices[batch_idx, :] = 0
                continue

            # next tokens for this sentence
            beam_idx = 0
            for beam_token_rank, (next_token, next_score, next_index) in enumerate(
                zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx])
            ):
                batch_beam_idx = batch_idx * self.group_size + next_index
                # add to generated hypotheses if end of sentence
                if (eos_token_id is not None) and (next_token.item() == eos_token_id):
                    # if beam_token does not belong to top num_beams tokens, it should not be added
                    is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size
                    if is_beam_token_worse_than_top_num_beams:
                        continue
                    if beam_indices is not None:
                        beam_index = beam_indices[batch_beam_idx]
                        beam_index = beam_index + (batch_beam_idx,)
                    else:
                        beam_index = None

                    beam_hyp.add(
                        input_ids[batch_beam_idx].clone(),
                        next_score.item(),
                        beam_indices=beam_index
                    )
                    # Memory clean-up kept from the original notebook code.
                    gc.collect()
                    torch.cuda.empty_cache()
                else:
                    # add next predicted token since it is not eos_token
                    next_beam_scores[batch_idx, beam_idx] = next_score
                    next_beam_tokens[batch_idx, beam_idx] = next_token
                    next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
                    beam_idx += 1

                # once the beam for next step is full, don't add more tokens to it.
                if beam_idx == self.group_size:
                    break

            if beam_idx < self.group_size:
                raise ValueError(
                    f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:"
                    f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected."
                )

            # Check if we are done so that we can save a pad step if all(done)
            self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
                next_scores[batch_idx].max().item(), cur_len
            )

        return UserDict(
            {
                "next_beam_scores": next_beam_scores.view(-1),
                "next_beam_tokens": next_beam_tokens.view(-1),
                "next_beam_indices": next_beam_indices.view(-1),
            }
        )

    @torch.no_grad()
    def finalize(
        self,
        input_ids: torch.LongTensor,
        final_beam_scores: torch.FloatTensor,
        final_beam_tokens: torch.LongTensor,
        final_beam_indices: torch.LongTensor,
        max_length: int,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        beam_indices: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.LongTensor]:
        """Close all open beams and return the best hypotheses per batch element.

        Returns a `UserDict` with `sequences` (padded with `pad_token_id` when lengths differ, with
        `eos_token_id` appended where it fits), `sequence_scores` and `beam_indices` (or `None`).
        """
        batch_size = len(self._beam_hyps)

        # finalize all open beam hypotheses and add to generated hypotheses
        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
            if self._done[batch_idx]:
                continue

            # all open beam hypotheses are added to the beam hypothesis
            # beam hypothesis class automatically keeps the best beams
            for beam_id in range(self.num_beams):
                batch_beam_idx = batch_idx * self.num_beams + beam_id
                final_score = final_beam_scores[batch_beam_idx].item()
                final_tokens = input_ids[batch_beam_idx]
                beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
                beam_hyp.add(final_tokens, final_score, beam_indices=beam_index)

        # select the best hypotheses
        sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep)
        best = []
        best_indices = []
        best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32)

        # retrieve best hypotheses
        for i, beam_hyp in enumerate(self._beam_hyps):
            # Ascending by score, so pop() yields the best remaining hypothesis.
            sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
            for j in range(self.num_beam_hyps_to_keep):
                best_hyp_tuple = sorted_hyps.pop()
                best_score = best_hyp_tuple[0]
                best_hyp = best_hyp_tuple[1]
                best_index = best_hyp_tuple[2]
                sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)

                # append hyp to lists
                best.append(best_hyp)

                # append indices to list
                best_indices.append(best_index)

                best_scores[i * self.num_beam_hyps_to_keep + j] = best_score

        # prepare for adding eos
        sent_lengths_max = sent_lengths.max().item() + 1
        sent_max_len = min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max
        decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)

        if len(best_indices) > 0 and best_indices[0] is not None:
            indices: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
        else:
            indices = None

        # shorter batches are padded if needed
        if sent_lengths.min().item() != sent_lengths.max().item():
            assert pad_token_id is not None, "`pad_token_id` has to be defined"
            decoded.fill_(pad_token_id)

        if indices is not None:
            indices.fill_(-1)

        # fill with hypotheses and eos_token_id if the latter fits in
        for i, (hypo, best_idx) in enumerate(zip(best, best_indices)):
            decoded[i, : sent_lengths[i]] = hypo

            if indices is not None:
                indices[i, : len(best_idx)] = torch.tensor(best_idx)

            if sent_lengths[i] < sent_max_len:
                decoded[i, sent_lengths[i]] = eos_token_id

        # `self._done` is deliberately kept: deleting it would make `is_done`
        # raise AttributeError after finalize().
        torch.cuda.empty_cache()
        return UserDict(
            {
                "sequences": decoded,
                "sequence_scores": best_scores,
                "beam_indices": indices,
            }
        )


class BeamHypotheses:
    """N-best list of finished hypotheses for one batch element.

    Each hypothesis is scored as
    ``sum_logprobs * (1 + lambd * n_new_line + gamma * n_loop) / length ** length_penalty``
    where ``n_new_line`` counts occurrences of ``new_line_token`` and ``n_loop``
    counts occurrences of any token in ``loop_tokens``.
    """

    def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool, lambd: float, gamma: float,
                 new_line_token: int, loop_tokens: list):
        """
        Initialize n-best list of hypotheses.

        Args:
            num_beams: Maximum number of hypotheses kept.
            length_penalty: Exponent applied to the hypothesis length in the score.
            early_stopping: If True, `is_done` reports True as soon as the list is full.
            lambd: Weight of the new-line token count in the score.
            gamma: Weight of the loop token count in the score.
            new_line_token: Token id counted for the `lambd` term.
            loop_tokens: Token ids counted for the `gamma` term.
        """
        self.length_penalty = length_penalty
        self.lambd = lambd
        self.gamma = gamma
        self.early_stopping = early_stopping
        self.num_beams = num_beams
        self.beams = []
        # Sentinel larger than any real score; replaced by the first added score.
        self.worst_score = 1e9
        self.new_line_token = new_line_token
        self.loop_tokens = loop_tokens

    def __len__(self):
        """
        Number of hypotheses in the list.
        """
        return len(self.beams)

    @torch.no_grad()
    def add(self, hyp: torch.LongTensor, sum_logprobs: float, beam_indices: Optional[torch.LongTensor] = None):
        """
        Add a new hypothesis to the list.

        The hypothesis is stored when the list is not yet full or when its score
        beats the current worst score; in the latter case the worst stored
        hypothesis is evicted.
        """
        # Count matches on the boolean mask instead of copying the matching elements.
        new_line_count = int((hyp == self.new_line_token).sum())
        loop_counts = sum(int((hyp == tok).sum()) for tok in self.loop_tokens)

        weight = 1 + (self.lambd * new_line_count) + (self.gamma * loop_counts)
        score = (sum_logprobs * weight) / (hyp.shape[-1] ** self.length_penalty)

        if len(self) < self.num_beams or score > self.worst_score:
            self.beams.append((score, hyp, beam_indices))
            if len(self) > self.num_beams:
                # Over capacity: drop the lowest score; the runner-up becomes the new worst.
                sorted_next_scores = sorted((s, idx) for idx, (s, _, _) in enumerate(self.beams))
                del self.beams[sorted_next_scores[0][1]]
                self.worst_score = sorted_next_scores[1][0]
            else:
                self.worst_score = min(score, self.worst_score)

        # One clean-up pass per call, after any eviction; the original ran two.
        gc.collect()
        torch.cuda.empty_cache()

    @torch.no_grad()
    def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
        """
        If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst
        one in the heap, then we are done with this sentence.

        NOTE(review): the bound below uses only the length penalty, not the
        `lambd`/`gamma` weighting applied in `add` — confirm this is intended.
        """

        if len(self) < self.num_beams:
            return False
        elif self.early_stopping:
            return True
        else:
            cur_score = best_sum_logprobs / cur_len**self.length_penalty
            ret = self.worst_score >= cur_score
            return ret

In [12]:
from transformers.utils import ModelOutput
from transformers.generation_beam_constraints import Constraint
from transformers import LogitsProcessorList, StoppingCriteriaList
from transformers import BeamSearchScorer
import inspect
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
@torch.no_grad()
def generate_beam_search(
    model,
    inputs: Optional[torch.Tensor] = None,
    max_length: Optional[int] = None,
    min_length: Optional[int] = None,
    early_stopping: Optional[bool] = None,
    num_beams: Optional[int] = None,
    repetition_penalty: Optional[float] = None,
    bad_words_ids: Optional[Iterable[int]] = None,
    force_words_ids: Optional[Union[Iterable[int], Iterable[Iterable[int]]]] = None,
    bos_token_id: Optional[int] = None,
    pad_token_id: Optional[int] = None,
    eos_token_id: Optional[int] = None,
    length_penalty: Optional[float] = None,
    no_repeat_ngram_size: Optional[int] = None,
    encoder_no_repeat_ngram_size: Optional[int] = None,
    num_return_sequences: Optional[int] = None,
    max_time: Optional[float] = None,
    max_new_tokens: Optional[int] = None,
    decoder_start_token_id: Optional[int] = None,
    use_cache: Optional[bool] = None,
    num_beam_groups: Optional[int] = None,
    diversity_penalty: Optional[float] = None,
    prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
    logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
    renormalize_logits: Optional[bool] = None,
    stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(),
    constraints: Optional[List[Constraint]] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_scores: Optional[bool] = None,
    return_dict_in_generate: Optional[bool] = None,
    forced_bos_token_id: Optional[int] = None,
    forced_eos_token_id: Optional[int] = None,
    remove_invalid_values: Optional[bool] = None,
    synced_gpus: Optional[bool] = False,
    exponential_decay_length_penalty: Optional[Tuple[int, float]] = None,
    suppress_tokens: Optional[List[int]] = None,
    begin_suppress_tokens: Optional[List[int]] = None,
    forced_decoder_ids: Optional[List[int]] = None,
    beam_scorer: Optional[BeamScorer] = None,
    **model_kwargs,
):
    r"""
    Prepare generation inputs for `model` and run `model.beam_search`, optionally with a caller-supplied
    `beam_scorer`.

    Unlike a general-purpose `generate`, this function always ends in `model.beam_search`; no other decoding
    mode is selected. `force_words_ids` and `constraints` are accepted in the signature but are not used by
    this function.

    Parameters:
        model:
            The model to generate with. Its `config` supplies the default for every generation parameter left
            as `None`.
        inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
            The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
            method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
            should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
            `input_ids`, `input_values`, `input_features`, or `pixel_values`.
        max_length (`int`, *optional*, defaults to `model.config.max_length`):
            The maximum length the generated tokens can have. Corresponds to the length of the input prompt +
            `max_new_tokens`. In general, prefer the use of `max_new_tokens`, which ignores the number of tokens in
            the prompt.
        max_new_tokens (`int`, *optional*):
            The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
        min_length (`int`, *optional*, defaults to `model.config.min_length` or 10 if the config does not set any value):
            The minimum length of the sequence to be generated.
        early_stopping (`bool`, *optional*, defaults to `False`):
            Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
        num_beams (`int`, *optional*, defaults to `model.config.num_beams` or 1 if the config does not set any value):
            Number of beams for beam search. 1 means no beam search.
        repetition_penalty (`float`, *optional*, defaults to `model.config.repetition_penalty` or 1.0 if the config does not set any value):
            The parameter for repetition penalty. 1.0 means no penalty. See [this
            paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
        pad_token_id (`int`, *optional*, defaults to `model.config.pad_token_id`):
            The id of the *padding* token.
        bos_token_id (`int`, *optional*, defaults to `model.config.bos_token_id`):
            The id of the *beginning-of-sequence* token.
        eos_token_id (`int`, *optional*, defaults to `model.config.eos_token_id`):
            The id of the *end-of-sequence* token.
        length_penalty (`float`, *optional*, defaults to `model.config.length_penalty` or 1.0 if the config does not set any value):
            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
            to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
            the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
            while `length_penalty` < 0.0 encourages shorter sequences.
        no_repeat_ngram_size (`int`, *optional*, defaults to `model.config.no_repeat_ngram_size` or 0 if the config does not set any value):
            If set to int > 0, all ngrams of that size can only occur once.
        encoder_no_repeat_ngram_size (`int`, *optional*, defaults to `model.config.encoder_no_repeat_ngram_size` or 0 if the config does not set any value):
            If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
            `decoder_input_ids`.
        bad_words_ids(`List[List[int]]`, *optional*, defaults to `model.config.bad_words_ids`):
            List of token ids that are not allowed to be generated. In order to get the token ids of the words that
            should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True,
            add_special_tokens=False).input_ids`.
        force_words_ids(`List[List[int]]` or `List[List[List[int]]]`, *optional*):
            List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple
            list of words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`,
            this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081),
            where one can allow different forms of each word. Not used by this function.
        num_return_sequences(`int`, *optional*, defaults to `model.config.num_return_sequences` or 1 if the config does not set any value):
            The number of independently computed returned sequences for each element in the batch.
        max_time(`float`, *optional*):
            The maximum amount of time you allow the computation to run for in seconds. generation will still
            finish the current pass after allocated time has been passed.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens
            that are not masked, and 0 for masked tokens. If not provided, will default to a tensor the same shape
            as `input_ids` that masks the pad token. [What are attention masks?](../glossary#attention-mask)
            Passed through `model_kwargs`.
        decoder_start_token_id (`int`, *optional*):
            If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should use the past last key/values attentions (if applicable to the model) to
            speed up decoding.
        num_beam_groups (`int`, *optional*, defaults to `model.config.num_beam_groups` or 1 if the config does not set any value):
            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
            beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
        diversity_penalty (`float`, *optional*, defaults to `model.config.diversity_penalty` or 0.0 if the config does not set any value):
            This value is subtracted from a beam's score if it generates a token same as any beam from other group
            at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is
            enabled.
        prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
            If provided, this function constraints the beam search to allowed tokens only at each step. If not
            provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
            `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
            on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
            for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
            Retrieval](https://arxiv.org/abs/2010.00904).
        logits_processor (`LogitsProcessorList`, *optional*):
              Custom logits processors that complement the default logits processors built from arguments and a
              model's config. If a logit processor is passed that is already created with the arguments or a model's
              config an error is thrown. This feature is intended for advanced users. `None` is treated as an
              empty list.
        renormalize_logits: (`bool`, *optional*, defaults to `False`):
            Whether to renormalize the logits after applying all the logits processors or warpers (including the
            custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the
            score logits are normalized but some logit processors or warpers break the normalization.
        stopping_criteria (`StoppingCriteriaList`, *optional*):
              Custom stopping criteria that complement the default stopping criteria built from arguments and a
              model's config. If a stopping criteria is passed that is already created with the arguments or a
              model's config an error is thrown. This feature is intended for advanced users. `None` is treated
              as an empty list.
        constraints (`List[Constraint]`, *optional*):
              Custom constraints that can be added to the generation to ensure that the output will contain the use
              of certain tokens as defined by `Constraint` objects, in the most sensible way possible. Not used by
              this function.
        output_attentions (`bool`, *optional*, defaults to `model.config.output_attentions` or `False` if the config does not set any value):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more details.
        output_hidden_states (`bool`, *optional*, defaults to `model.config.output_hidden_states` or `False` if the config does not set any value):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
            for more details.
        output_scores (`bool`, *optional*, defaults to `model.config.output_scores` or `False` if the config does not set any value):
            Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
        return_dict_in_generate (`bool`, *optional*, defaults to `model.config.return_dict_in_generate` or `False` if the config does not set any value):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        forced_bos_token_id (`int`, *optional*, defaults to `model.config.forced_bos_token_id`):
            The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
            for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be
            the target language token.
        forced_eos_token_id (`int`, *optional*, defaults to `model.config.forced_eos_token_id`):
            The id of the token to force as the last generated token when `max_length` is reached.
        remove_invalid_values (`bool`, *optional*, defaults to `model.config.remove_invalid_values`):
            Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to
            crash. Note that using `remove_invalid_values` can slow down generation.
        synced_gpus (`bool`, *optional*, defaults to `False`):
            Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
        exponential_decay_length_penalty (`tuple(int, float)`, *optional*, defaults to `model.config.exponential_decay_length_penalty`):
            This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
            generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates
            where penalty starts and `decay_factor` represents the factor of exponential decay
        suppress_tokens  (`List[int]`, *optional*, defaults to `model.config.suppress_tokens`):
            A list of tokens that will be suppressed at generation. The `SupressTokens` logit processor will set
            their log probs to `-inf` so that they are not sampled.
        begin_suppress_tokens  (`List[int]`, *optional*, defaults to `model.config.begin_suppress_tokens`):
            A list of tokens that will be suppressed at the beginning of the generation. The `SupressBeginTokens`
            logit processor will set their log probs to `-inf` so that they are not sampled.
        forced_decoder_ids (`List[int]`, *optional*, defaults to `model.config.forced_decoder_ids`):
            A list of tokens that will be forced as beginning tokens, before sampling.
        beam_scorer (`BeamScorer`, *optional*):
            Scorer to use for beam search generation. If `None`, a `BeamSearchScorer` is built from the batch
            size, `num_beams`, `length_penalty`, `early_stopping` and `num_return_sequences`.
        model_kwargs:
            Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model
            is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific kwargs
            should be prefixed with *decoder_*.
    Return:
        The value returned by `model.beam_search(...)`. Per the upstream Transformers documentation this is a
        [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`)
        such as [`~generation_utils.BeamSearchDecoderOnlyOutput`] or
        [`~generation_utils.BeamSearchEncoderDecoderOutput`], or otherwise a tensor of generated token ids.
    Raises:
        ValueError: if both `max_new_tokens` and `max_length` are set, if `min_length` exceeds `max_length`,
            if `num_return_sequences` exceeds `num_beams`, or if the stopping criteria carry no `max_length`.
    """
    # Imported locally so the warning branch below does not depend on another cell having imported it.
    import warnings

    # 0. Validate the `.generate()` call
    model._validate_model_class()
    model._validate_model_kwargs(model_kwargs.copy())

    # 1. Set generation parameters if not already defined
    bos_token_id = bos_token_id if bos_token_id is not None else model.config.bos_token_id
    num_beams = num_beams if num_beams is not None else model.config.num_beams
    length_penalty = length_penalty if length_penalty is not None else model.config.length_penalty
    early_stopping = early_stopping if early_stopping is not None else model.config.early_stopping
    num_beam_groups = num_beam_groups if num_beam_groups is not None else model.config.num_beam_groups
    num_return_sequences = (
        num_return_sequences if num_return_sequences is not None else model.config.num_return_sequences
    )
    # Tolerate an explicit `None` for the custom processor / criteria lists.
    logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
    stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()

    pad_token_id = pad_token_id if pad_token_id is not None else model.config.pad_token_id
    eos_token_id = eos_token_id if eos_token_id is not None else model.config.eos_token_id

    if eos_token_id is None and hasattr(model.config, "decoder"):
        eos_token_id = model.config.decoder.eos_token_id

    # Fall back to EOS as the padding token when the model defines none.
    if pad_token_id is None and eos_token_id is not None:
        pad_token_id = eos_token_id

    output_scores = output_scores if output_scores is not None else model.config.output_scores
    output_attentions = output_attentions if output_attentions is not None else model.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else model.config.output_hidden_states
    )
    return_dict_in_generate = (
        return_dict_in_generate if return_dict_in_generate is not None else model.config.return_dict_in_generate
    )

    # 2. Define model inputs
    # inputs_tensor has to be defined
    # model_input_name is defined if model-specific keyword input is passed
    # otherwise model_input_name is None
    # all model-specific keyword inputs are removed from `model_kwargs`
    inputs_tensor, model_input_name, model_kwargs = model._prepare_model_inputs(inputs, bos_token_id, model_kwargs)
    batch_size = inputs_tensor.shape[0]

    # 3. Define other model kwargs
    model_kwargs["output_attentions"] = output_attentions
    model_kwargs["output_hidden_states"] = output_hidden_states
    model_kwargs["use_cache"] = use_cache

    accepts_attention_mask = "attention_mask" in set(inspect.signature(model.forward).parameters.keys())
    requires_attention_mask = "encoder_outputs" not in model_kwargs

    if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask:
        model_kwargs["attention_mask"] = model._prepare_attention_mask_for_generation(
            inputs_tensor, pad_token_id, eos_token_id
        )

    # decoder-only models should use left-padding for generation
    # NOTE(review): `logger` is not defined in this cell -- confirm it is created elsewhere in the notebook.
    if not model.config.is_encoder_decoder:
        if pad_token_id is not None and torch.sum(inputs_tensor[:, -1] == pad_token_id) > 0:
            logger.warning(
                "A decoder-only architecture is being used, but right-padding was detected! For correct "
                "generation results, please set `padding_side='left'` when initializing the tokenizer."
            )

    if model.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
        # if model is encoder decoder encoder_outputs are created
        # and added to `model_kwargs`
        model_kwargs = model._prepare_encoder_decoder_kwargs_for_generation(
            inputs_tensor, model_kwargs, model_input_name
        )

    # 4. Prepare `input_ids` which will be used for auto-regressive generation
    if model.config.is_encoder_decoder:
        input_ids = model._prepare_decoder_input_ids_for_generation(
            batch_size,
            decoder_start_token_id=decoder_start_token_id,
            bos_token_id=bos_token_id,
            model_kwargs=model_kwargs,
            device=inputs_tensor.device,
        )
    else:
        # if decoder-only then inputs_tensor has to be `input_ids`
        input_ids = inputs_tensor

    # 5. Prepare `max_length` depending on other stopping criteria.
    input_ids_seq_length = input_ids.shape[-1]
    if max_length is None and max_new_tokens is None:
        # This is a free function: the config lives on `model`, there is no `self` here.
        warnings.warn(
            "Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to "
            f"{model.config.max_length} (`model.config.max_length`). Controlling `max_length` via the config is "
            "deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend "
            "using `max_new_tokens` to control the maximum length of the generation.",
            UserWarning,
        )
    elif max_length is None and max_new_tokens is not None:
        max_length = max_new_tokens + input_ids_seq_length
    elif max_length is not None and max_new_tokens is not None:
        raise ValueError(
            "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a"
            " limit to the generated output length. Remove one of those arguments. Please refer to the"
            " documentation for more information. "
            "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
        )
    # default to config if still None
    max_length = max_length if max_length is not None else model.config.max_length
    min_length = min_length if min_length is not None else model.config.min_length

    if min_length is not None and min_length > max_length:
        raise ValueError(
            f"Unfeasible length constraints: the minimum length ({min_length}) is larger than the maximum "
            f"length ({max_length})"
        )
    if input_ids_seq_length >= max_length:
        input_ids_string = "decoder_input_ids" if model.config.is_encoder_decoder else "input_ids"
        logger.warning(
            f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
            f" {max_length}. This can lead to unexpected behavior. You should consider increasing "
            "`max_new_tokens`."
        )

    # 7. prepare distribution pre_processing samplers
    logits_processor = model._get_logits_processor(
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
        input_ids_seq_length=input_ids_seq_length,
        encoder_input_ids=inputs_tensor,
        bad_words_ids=bad_words_ids,
        min_length=min_length,
        max_length=max_length,
        eos_token_id=eos_token_id,
        forced_bos_token_id=forced_bos_token_id,
        forced_eos_token_id=forced_eos_token_id,
        prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        diversity_penalty=diversity_penalty,
        remove_invalid_values=remove_invalid_values,
        exponential_decay_length_penalty=exponential_decay_length_penalty,
        logits_processor=logits_processor,
        renormalize_logits=renormalize_logits,
        suppress_tokens=suppress_tokens,
        begin_suppress_tokens=begin_suppress_tokens,
        forced_decoder_ids=forced_decoder_ids,
    )

    # 8. prepare stopping criteria
    stopping_criteria = model._get_stopping_criteria(
        max_length=max_length, max_time=max_time, stopping_criteria=stopping_criteria
    )

    if num_return_sequences > num_beams:
        raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")

    if stopping_criteria.max_length is None:
        raise ValueError("`max_length` needs to be a stopping_criteria for now.")

    # 10. prepare beam search scorer (only when the caller did not supply one)
    if beam_scorer is None:
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=num_beams,
            device=inputs_tensor.device,
            length_penalty=length_penalty,
            do_early_stopping=early_stopping,
            num_beam_hyps_to_keep=num_return_sequences,
        )
    # 11. interleave input_ids with `num_beams` additional sequences per batch
    input_ids, model_kwargs = model._expand_inputs_for_generation(
        input_ids, expand_size=num_beams, is_encoder_decoder=model.config.is_encoder_decoder, **model_kwargs
    )

    # 12. run beam search
    return model.beam_search(
        input_ids,
        beam_scorer,
        logits_processor=logits_processor,
        stopping_criteria=stopping_criteria,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        output_scores=output_scores,
        return_dict_in_generate=return_dict_in_generate,
        synced_gpus=synced_gpus,
        **model_kwargs,
    )

END: COPIED FROM https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/generation/utils.py

In [13]:
from transformers import GPT2LMHeadModel
# Load the GPT-2 LM-head model from the "1.5B" checkpoint identifier with float16 weights.
gpt_model = GPT2LMHeadModel.from_pretrained(
    "1.5B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

In [None]:
# CodeT5 Beam Search Generations
# For every (lambda, gamma) pair, decode the first 400 batches of `test_data` with the
# custom beam scorer and write all returned beams to one JSON file per setting.
model.model.eval()
model.model.to(device)
loader = data.DataLoader(test_data, batch_size=1, num_workers=4)
num_beams = 4
new_line_token = tokenizer.encode("\n", add_special_tokens=False)[0]
loop_tokens = [
    tokenizer.encode("for", add_special_tokens=False)[0],
    tokenizer.encode("while", add_special_tokens=False)[0],
]
lambdas = [0, 0.25, 0.75]
gammas = [0, 1, 2]

# open(..., "w") does not create missing directories; make sure the output folder
# exists up front so a full generation pass is not lost at write time.
os.makedirs("results", exist_ok=True)

for l in lambdas:
    for g in gammas:
        # Fresh container per setting so results of one run can never carry over into the next file.
        output_codes = {}
        for batch_idx, batch in enumerate(loader):
            if batch_idx >= 400:
                break
            print(batch_idx)
            # A new scorer per example: it is constructed with batch_size=1 and the current lambda/gamma.
            beam_scorer = CodeBeamSearchScorer(
                batch_size=1,
                num_beams=num_beams,
                num_beam_hyps_to_keep=num_beams,
                lambd=l,
                gamma=g,
                device=model.model.device,
                new_line_token=new_line_token,
                loop_tokens=loop_tokens,
            )
            outs = generate_beam_search(
                model.model,
                batch["source_ids"].to(device),
                attention_mask=batch['source_mask'].to(device),
                max_length=400,
                num_beams=num_beams,
                beam_scorer=beam_scorer,
            )

            dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
            output_codes[batch_idx] = list(dec)

        out_path = (
            "results/all_codes_nl_interview_lambda_"
            + str(l).replace(".", "")
            + "_gamma_"
            + str(g).replace(".", "")
            + ".json"
        )
        with open(out_path, "w") as outfile:
            output_json = json.dumps(output_codes)
            outfile.write(output_json)

In [17]:
#CodeT5 Nucleus sampling generations
# Samples 6 completions (top_p=0.95) for each of the first 400 batches of `test_data`
# and writes them to a single JSON file.
model.model.eval()
model.model.to(device)
loader = data.DataLoader(test_data, batch_size=1, num_workers=4)

# open(..., "w") does not create missing directories; create the output folder before
# sampling so the whole run is not lost when the file is finally written.
os.makedirs("results", exist_ok=True)

output_codes = {}
for batch_idx, batch in enumerate(loader):
    if batch_idx >= 400:
        break
    outs = model.model.generate(
        batch["source_ids"].to(device),
        attention_mask=batch['source_mask'].to(device),
        do_sample=True,
        max_length=400,
        top_p=0.95,
        num_return_sequences=6,
    )
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    # Keep only the text after the last "ANSWER:\n" marker (the whole string if the marker is absent).
    output_codes[batch_idx] = [entry.split("ANSWER:\n")[-1] for entry in dec]

with open("results/all_codes_nucleus_interview_095.json", "w") as outfile:
    output_json = json.dumps(output_codes)
    outfile.write(output_json)

In [None]:
#GPT2 Beam Search Generations
# For every (lambda, gamma) pair, decode the first 400 batches of `gpt_test_data` with the
# custom beam scorer and write the text after "ANSWER:\n" of each beam to one JSON file per setting.
gpt_model.eval()
gpt_model.to(device=device, dtype=torch.float16, non_blocking=False, memory_format=torch.preserve_format)
loader = data.DataLoader(gpt_test_data, batch_size=1, num_workers=2)
num_beams = 4
loop_tokens = [
    gpt_tokenizer.encode("for", add_special_tokens=False)[0],
    gpt_tokenizer.encode("while", add_special_tokens=False)[0],
]
new_line_token = gpt_tokenizer.encode("\n", add_special_tokens=False)[0]
lambd_vals = [0, 0.25, 0.75]
gamma_vals = [0, 1, 2]

# open(..., "w") does not create missing directories; make sure the output folder
# exists up front so a full generation pass is not lost at write time.
os.makedirs("results", exist_ok=True)

for l in lambd_vals:
    for g in gamma_vals:
        output_codes = {}
        for batch_idx, batch in enumerate(loader):
            if batch_idx >= 400:
                break
            beam_scorer = CodeBeamSearchScorer(
                batch_size=1,
                num_beams=num_beams,
                num_beam_hyps_to_keep=num_beams,
                lambd=l,
                gamma=g,
                device=gpt_model.device,
                new_line_token=new_line_token,
                loop_tokens=loop_tokens,
            )
            print(batch_idx)

            # Move the prompt to the device once and reuse it for both the call and the length.
            prompt_ids = batch["source_ids"][0].to(device)
            # The budget is 400 tokens on top of the prompt length.
            outs = generate_beam_search(
                gpt_model,
                prompt_ids,
                max_length=400 + len(prompt_ids[0]),
                num_beams=num_beams,
                beam_scorer=beam_scorer,
            )

            dec = [gpt_tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
            # Keep only the text after the last "ANSWER:\n" marker.
            output_codes[batch_idx] = [entry.split("ANSWER:\n")[-1] for entry in dec]

            # Drop per-example references before emptying the CUDA cache.
            del batch
            del beam_scorer
            del prompt_ids
            del outs
            torch.cuda.empty_cache()

        out_path = (
            "results/all_codes_gpt2_nl_interview_lambda_"
            + str(l).replace(".", "")
            + "_gamma_"
            + str(g).replace(".", "")
            + ".json"
        )
        with open(out_path, "w") as outfile:
            output_json = json.dumps(output_codes)
            outfile.write(output_json)

In [None]:
#GPT2 Nucleus sampling generations
# Samples 6 completions (top_p=0.95) for each of the first 400 batches of `gpt_test_data`
# and writes the text after "ANSWER:\n" of each sample to a single JSON file.
gpt_model.eval()
gpt_model.to(device)
loader = data.DataLoader(gpt_test_data, batch_size=1, num_workers=4)

# open(..., "w") does not create missing directories; create the output folder before
# sampling so the whole run is not lost when the file is finally written.
os.makedirs("results", exist_ok=True)

output_codes = {}
for batch_idx, batch in enumerate(loader):
    if batch_idx >= 400:
        break
    # Move the prompt to the device once and reuse it for both the call and the length.
    prompt_ids = batch["source_ids"][0].to(device)
    # The budget is 400 tokens on top of the prompt length.
    outs = gpt_model.generate(
        prompt_ids,
        do_sample=True,
        max_length=400 + len(prompt_ids[0]),
        top_p=0.95,
        num_return_sequences=6,
    )
    dec = [gpt_tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    # Keep only the text after the last "ANSWER:\n" marker.
    output_codes[batch_idx] = [entry.split("ANSWER:\n")[-1] for entry in dec]

with open("results/all_codes_gpt2_nucleus_interview_095.json", "w") as outfile:
    output_json = json.dumps(output_codes)
    outfile.write(output_json)

In [21]:
def read_constraints(file_name):
    """
    Load constraints from a file holding one JSON document per line.

    Each line is parsed with `json.loads` and iterated as a sequence of concepts; every
    item of a concept is formatted into a string with a single leading space.

    Args:
        file_name: path of the file to read.

    Returns:
        A list with one entry per line; each entry is a list of concepts, and each concept
        is a list of space-prefixed strings.
    """
    with open(file_name, 'r') as handle:
        return [
            [[f' {word}' for word in concept] for concept in json.loads(raw_line)]
            for raw_line in handle
        ]

In [None]:
#GPT2 Neurologic A* Ablation Over Heuristic Hyperparameter
from neurologic_astar.generate import generate
from neurologic_astar.utils import tokenize_constraints
from neurologic_astar.lexical_constraints import init_batch

gpt_model.eval()
gpt_model.to(device=device, dtype=torch.float16, non_blocking=False, memory_format=torch.preserve_format)
loader = data.DataLoader(gpt_test_data, batch_size=1, num_workers=2)
# First token id of each loop keyword, in the order ["for", "while"].
loop_tokens = [gpt_tokenizer.encode(keyword, add_special_tokens=False)[0] for keyword in ("for", "while")]
num_beams = 4
new_line_token = gpt_tokenizer.encode("\n", add_special_tokens=False)[0]
alpha_vals = [.1, .25, .5, .75]

# One full pass over the first 75 problems per alpha value; lambd and gamma
# are held at 0 so only alpha varies between runs.
for val in alpha_vals:
    output_codes = {}
    for batch_idx, batch in enumerate(loader):
        if batch_idx >= 75:
            break
        print(batch_idx)
        output_codes[batch_idx] = []

        # The constraint state is rebuilt from the dummy file for every problem.
        constraints_list = tokenize_constraints(
            gpt_tokenizer,
            read_constraints("neurologic_astar/dummy_constraint.json"),
        )
        key_constraints_list = constraints_list
        eos_ids = [gpt_tokenizer.eos_token_id]
        constraints = init_batch(
            raw_constraints=constraints_list,
            key_constraints=key_constraints_list,
            beam_size=num_beams,
            eos_id=eos_ids,
        )

        prompt_ids = batch["source_ids"][0].to(device)
        outs = generate(
            gpt_model,
            prompt_ids,
            # Allow up to 400 tokens on top of the prompt length.
            max_length=400 + len(prompt_ids[0]),
            num_beams=num_beams,
            lambd=0,
            gamma=0,
            new_line_token=new_line_token,
            loop_tokens=loop_tokens,
            look_ahead_step=5,
            look_ahead_width=1,
            constraints=constraints,
            prune_factor=50,
            alpha=val,
            sat_tolerance=0,
            num_return_sequences=num_beams,
        )

        # Keep only the text after the final "ANSWER:\n" marker.
        output_codes[batch_idx].extend(
            gpt_tokenizer.decode(ids, skip_special_tokens=True).split("ANSWER:\n")[-1]
            for ids in outs
        )

        # Drop references to the per-problem tensors before clearing the CUDA cache.
        del batch
        del prompt_ids
        del outs
        torch.cuda.empty_cache()

    # The alpha value is embedded in the file name with its "." removed (0.25 -> "025").
    alpha_tag = str(val).replace(".", "")
    with open("results/all_codes_gpt2_astar_nl_interview_ablation_" + alpha_tag + ".json", "w") as outfile:
        json.dump(output_codes, outfile)

In [8]:
# Summary statistics for the GPT-2 Neurologic A* alpha-ablation runs.

# Test-case result files, one per alpha value.
results_list = ["all_codes_gpt2_astar_nl_interview_ablation_01_results.json", "all_codes_gpt2_astar_nl_interview_ablation_025_results.json", "all_codes_gpt2_astar_nl_interview_ablation_05_results.json",
              "all_codes_gpt2_astar_nl_interview_ablation_075_results.json"]

# Generated-program files, index-aligned with results_list.
codes_list = ["all_codes_gpt2_astar_nl_interview_ablation_01.json", "all_codes_gpt2_astar_nl_interview_ablation_025.json", "all_codes_gpt2_astar_nl_interview_ablation_05.json",
              "all_codes_gpt2_astar_nl_interview_ablation_075.json"]

dirs = []
with open("test_dirs.json", "r") as apps_file:
    dirs = json.load(apps_file)

for result, code in zip(results_list, codes_list):
    with open('results/' + result, 'r') as file, open('results/' + code, 'r') as code_file:
        # Deliberately NOT named `data`: that name is the torch.utils.data
        # module imported at the top of the notebook, and rebinding it here
        # broke every later `data.DataLoader(...)` cell.
        results_data = json.load(file)
        code_data = json.load(code_file)

    total_tests = 0
    total_evals = 0
    total_runtime_err = 0
    total_compile_err = 0
    for i, key in enumerate(results_data):
        # The i-th key of the results file is paired with the i-th directory
        # in test_dirs.json, so both files must list problems in the same order.
        with open("APPS/test/" + dirs[i] + "/input_output.json", "r") as curr_file:
            test_info = json.load(curr_file)
        candidate_results = results_data[key]
        total_tests += len(test_info["inputs"])
        total_evals += sum([len(x) for x in candidate_results])

        # -1 entries are tallied as runtime errors, -2 entries as compile errors.
        total_runtime_err += sum([x.count(-1) for x in candidate_results])
        total_compile_err += sum([x.count(-2) for x in candidate_results])

    print(result)
    print("-- % Runtime Errors: " + str(total_runtime_err/(total_evals)))
    print("-- % Compile Errors: " + str(total_compile_err/(total_evals)))

    # For the first 1, 2 and 4 candidates of every problem, take the candidate
    # that passes the most test cases; report passed tests over all tests and
    # the average newline count of those best candidates.
    for beam in [1,2,4]:
        total_passed = 0
        total_nls = []
        for key in results_data:
            seqs = results_data[key][:beam]
            true_counts = [x.count(True) for x in seqs]
            max_true = max(true_counts)
            best_code = code_data[key][true_counts.index(max_true)]
            total_nls.append(best_code.count("\n"))
            total_passed += max_true
        print("-- Pass @ " + str(beam) + ": " + str(total_passed/total_tests))
        print("----- Avg lines: " + str(sum(total_nls)/len(total_nls)))
        

all_codes_gpt2_astar_nl_interview_ablation_01_results.json
-- % Runtime Errors: 0.8777609682299546
-- % Compile Errors: 0.0007060010085728693
-- Pass @ 1: 0.006921099466086613
----- Avg lines: 13.56
-- Pass @ 2: 0.007118845165117659
----- Avg lines: 13.546666666666667
-- Pass @ 4: 0.007712082262210797
----- Avg lines: 13.586666666666666
all_codes_gpt2_astar_nl_interview_ablation_025_results.json
-- % Runtime Errors: 0.8773575491467918
-- % Compile Errors: 0.0006985330805308851
-- Pass @ 1: 0.006921099466086613
----- Avg lines: 13.56
-- Pass @ 2: 0.007118845165117659
----- Avg lines: 13.546666666666667
-- Pass @ 4: 0.007712082262210797
----- Avg lines: 13.586666666666666
all_codes_gpt2_astar_nl_interview_ablation_05_results.json
-- % Runtime Errors: 0.8773575491467918
-- % Compile Errors: 0.0006985330805308851
-- Pass @ 1: 0.006921099466086613
----- Avg lines: 13.56
-- Pass @ 2: 0.007118845165117659
----- Avg lines: 13.546666666666667
-- Pass @ 4: 0.007712082262210797
----- Avg lines: 1

In [None]:
#GPT2 Neurologic A* Beam Search Generations
from neurologic_astar.generate import generate
from neurologic_astar.utils import tokenize_constraints
from neurologic_astar.lexical_constraints import init_batch

gpt_model.eval()
gpt_model.to(device=device, dtype=torch.float16, non_blocking=False, memory_format=torch.preserve_format)
loader = data.DataLoader(gpt_test_data, batch_size=1, num_workers=2)
num_beams = 4
# First token id of each loop keyword, in the order ["for", "while"].
loop_tokens = [gpt_tokenizer.encode(keyword, add_special_tokens=False)[0] for keyword in ("for", "while")]
new_line_token = gpt_tokenizer.encode("\n", add_special_tokens=False)[0]
lambd_vals = [0, 0.25, 0.75]
gamma_vals = [0, 1, 2]

# Grid over (lambda, gamma): every pair gets its own pass over the first 400
# problems and its own output file.
for l in lambd_vals:
    for g in gamma_vals:
        output_codes = {}
        for batch_idx, batch in enumerate(loader):
            if batch_idx >= 400:
                break
            print(batch_idx)
            output_codes[batch_idx] = []

            # The constraint state is rebuilt from the dummy file for every problem.
            constraints_list = tokenize_constraints(
                gpt_tokenizer,
                read_constraints("neurologic_astar/dummy_constraint.json"),
            )
            key_constraints_list = constraints_list
            eos_ids = [gpt_tokenizer.eos_token_id]
            constraints = init_batch(
                raw_constraints=constraints_list,
                key_constraints=key_constraints_list,
                beam_size=num_beams,
                eos_id=eos_ids,
            )

            prompt_ids = batch["source_ids"][0].to(device)
            outs = generate(
                gpt_model,
                prompt_ids,
                # Allow up to 400 tokens on top of the prompt length.
                max_length=400 + len(prompt_ids[0]),
                num_beams=num_beams,
                lambd=l,
                gamma=g,
                new_line_token=new_line_token,
                loop_tokens=loop_tokens,
                look_ahead_step=5,
                look_ahead_width=1,
                constraints=constraints,
                prune_factor=50,
                alpha=.5,
                sat_tolerance=0,
                num_return_sequences=num_beams,
            )

            # Keep only the text after the final "ANSWER:\n" marker.
            output_codes[batch_idx].extend(
                gpt_tokenizer.decode(ids, skip_special_tokens=True).split("ANSWER:\n")[-1]
                for ids in outs
            )

            # Drop references to the per-problem objects before clearing the CUDA cache.
            del batch
            del prompt_ids
            del outs
            del constraints
            torch.cuda.empty_cache()

        # Both hyperparameters are embedded in the file name with "." removed (0.25 -> "025").
        lambda_tag = str(l).replace(".", "")
        gamma_tag = str(g).replace(".", "")
        with open("results/all_codes_gpt2_astar_nl_interview_lambda_" + lambda_tag + "_gamma_" + gamma_tag + ".json", "w") as outfile:
            json.dump(output_codes, outfile)

In [None]:
#Save json file containing directory names of test data to be utilized
import json

# Each directory name is the sample's problem id rendered as a string and
# left-padded with zeros to at least four characters.
test_dirs = [str(sample["problem_id"]).zfill(4) for sample in test_data.data]

with open("test_dirs.json", "w") as outfile:
    json_test_dirs = json.dumps(test_dirs)
    outfile.write(json_test_dirs)

In [None]:
#Run test cases for solutions provided in file list and save results to json files
import testing_util
import test_one_solution
import argparse

# Base names (no extension) of the generated-code files under results/.
# "<name>.json" is evaluated and the outcome is written to "<name>_results.json".
file_list = ["all_codes_nl_interview_0", "all_codes_nl_interview_025", "all_codes_nl_interview_075", "all_codes_nucleus_interview_095",
              "all_codes_gpt2_nl_interview_0", "all_codes_gpt2_nl_interview_025", "all_codes_gpt2_nl_interview_075", "all_codes_gpt2_nucleus_interview_095",
              "all_codes_gpt2_astar_nl_interview_0", "all_codes_gpt2_astar_nl_interview_025", "all_codes_gpt2_astar_nl_interview_075",
              "all_codes_nl_interview_lambda_0_gamma_1", "all_codes_nl_interview_lambda_0_gamma_2", "all_codes_gpt2_nl_interview_lambda_0_gamma_1",
              "all_codes_gpt2_nl_interview_lambda_0_gamma_2", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_1", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_2"]

# The parser does not depend on the file being evaluated, so it is built once
# instead of once per loop iteration.
parser = argparse.ArgumentParser(description="Testing a Language Model on Python Code")
parser.add_argument("-t","--test_loc", default="../data_split/test.json", type=str, help="path to the json containing problem paths to be evaluated.")
parser.add_argument("-r","--root", default="../", type=str, help="where the data is stored.")
parser.add_argument("-s","--start", default=0, type=int)
parser.add_argument("-e","--end", default=None, type=int, help="If you want to evaluate a subset of problems specify start and ending index. File with start and ending prefix must exist typically used with batch evaluation.")
parser.add_argument("-i", "--index", default=0, type=int)
parser.add_argument("-p", "--print_results", action="store_true", help="If you have already evaluated the results and only want to print them.")
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("--save", type=str, default="./results", help="Where the evaluated data is loaded from and results saved to.")
parser.add_argument("--stop-early", default=None, type=int)
parser.add_argument("-c", "--codes_file", default="all_codes.json", type=str)
parser.add_argument("-f", "--save_file", default="all_results.json", type=str)

for f in file_list:
    code_f = f + ".json"
    save_f = f + "_results.json"
    args = parser.parse_args(['--test_loc=test_dirs.json', '--root=APPS/test','--end=400', '--save=results', '--codes_file='+code_f, '--save_file='+save_f])
    try:
        test_one_solution.main(args)
    except (Exception, SystemExit) as err:
        # Evaluation stays best-effort (one bad file must not stop the rest),
        # but the failure is now reported instead of silently dropped.
        # KeyboardInterrupt is intentionally not caught so the run can be stopped.
        print(f"[skipped] {code_f}: {type(err).__name__}: {err}")
        continue

In [14]:
#Gather summary statistics for model generated solutions

#List of all json files containing test case results
results_list = ["all_codes_nl_interview_0_results.json", "all_codes_nl_interview_025_results.json", "all_codes_nl_interview_075_results.json", "all_codes_nucleus_interview_095_results.json",
                "all_codes_gpt2_nl_interview_0_results.json", "all_codes_gpt2_nl_interview_025_results.json", "all_codes_gpt2_nl_interview_075_results.json", "all_codes_gpt2_nucleus_interview_095_results.json",
                "all_codes_gpt2_astar_nl_interview_0_results.json", "all_codes_gpt2_astar_nl_interview_025_results.json", "all_codes_gpt2_astar_nl_interview_075_results.json",
                "all_codes_nl_interview_lambda_0_gamma_1_results.json", "all_codes_nl_interview_lambda_0_gamma_2_results.json", "all_codes_gpt2_nl_interview_lambda_0_gamma_1_results.json",
                "all_codes_gpt2_nl_interview_lambda_0_gamma_2_results.json", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_1_results.json", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_2_results.json"]

#List of all json files containing model generated programs (index-aligned with results_list)
codes_list = ["all_codes_nl_interview_0.json", "all_codes_nl_interview_025.json", "all_codes_nl_interview_075.json", "all_codes_nucleus_interview_095.json",
              "all_codes_gpt2_nl_interview_0.json", "all_codes_gpt2_nl_interview_025.json", "all_codes_gpt2_nl_interview_075.json", "all_codes_gpt2_nucleus_interview_095.json",
              "all_codes_gpt2_astar_nl_interview_0.json", "all_codes_gpt2_astar_nl_interview_025.json", "all_codes_gpt2_astar_nl_interview_075.json",
              "all_codes_nl_interview_lambda_0_gamma_1.json", "all_codes_nl_interview_lambda_0_gamma_2.json", "all_codes_gpt2_nl_interview_lambda_0_gamma_1.json",
              "all_codes_gpt2_nl_interview_lambda_0_gamma_2.json", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_1.json", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_2.json"]

dirs = []
with open("test_dirs.json", "r") as apps_file:
    dirs = json.load(apps_file)

for result, code in zip(results_list, codes_list):
    with open('results/' + result, 'r') as file, open('results/' + code, 'r') as code_file:
        # Deliberately NOT named `data`: that name is the torch.utils.data
        # module imported at the top of the notebook, and rebinding it here
        # broke every later `data.DataLoader(...)` cell.
        results_data = json.load(file)
        code_data = json.load(code_file)

    total_tests = 0
    total_evals = 0
    total_runtime_err = 0
    total_compile_err = 0
    for i, key in enumerate(results_data):
        # The i-th key of the results file is paired with the i-th directory
        # in test_dirs.json, so both files must list problems in the same order.
        with open("APPS/test/" + dirs[i] + "/input_output.json", "r") as curr_file:
            test_info = json.load(curr_file)
        candidate_results = results_data[key]
        total_tests += len(test_info["inputs"])
        total_evals += sum([len(x) for x in candidate_results])

        # -1 entries are tallied as runtime errors, -2 entries as compile errors.
        total_runtime_err += sum([x.count(-1) for x in candidate_results])
        total_compile_err += sum([x.count(-2) for x in candidate_results])

    print(result)
    #Print percentage of runtime and compile errors from current json file
    print("-- % Runtime Errors: " + str(total_runtime_err/(total_evals)))
    print("-- % Compile Errors: " + str(total_compile_err/(total_evals)))

    #Calculate the percentage of test cases passed, as well as avg number of lines and loops
    #for candidate solution sets of size 1,2,4
    for beam in [1,2,4]:
        total_passed = 0
        total_nls = []
        total_loops = []
        for key in results_data:
            seqs = results_data[key][:beam]
            true_counts = [x.count(True) for x in seqs]
            max_true = max(true_counts)
            # The best candidate is the first one reaching the highest pass count.
            best_code = code_data[key][true_counts.index(max_true)]
            total_nls.append(best_code.count("\n"))
            # NOTE(review): str.count matches substrings, so identifiers such as
            # "format" also add to the loop tally -- confirm this is intended.
            total_loops.append(best_code.count("for") + best_code.count("while"))
            total_passed += max_true
        print("-- Pass @ " + str(beam) + ": " + str(total_passed/total_tests))
        print("----- Avg lines: " + str(sum(total_nls)/len(total_nls)))
        print("----- Avg loops: " + str(sum(total_loops)/len(total_loops)))

    print("\n********************\n")

all_codes_nl_interview_0_results.json
-- % Runtime Errors: 0.9194505878244674
-- % Compile Errors: 0.005587242463042719
-- Pass @ 1: 0.004188753382511028
----- Avg lines: 13.1325
----- Avg loops: 1.175
-- Pass @ 2: 0.004893057048596953
----- Avg lines: 13.1325
----- Avg loops: 1.175
-- Pass @ 4: 0.008970604589094414
----- Avg lines: 13.1375
----- Avg loops: 1.175

********************

all_codes_nl_interview_025_results.json
-- % Runtime Errors: 0.9091266277503368
-- % Compile Errors: 0.0034238886394252358
-- Pass @ 1: 0.005189605960633132
----- Avg lines: 6.3525
----- Avg loops: 0.5575
-- Pass @ 2: 0.007117173888868295
----- Avg lines: 6.3525
----- Avg loops: 0.5575
-- Pass @ 4: 0.010416280535270786
----- Avg lines: 6.3725
----- Avg loops: 0.56

********************

all_codes_nl_interview_075_results.json
-- % Runtime Errors: 0.9095208776446883
-- % Compile Errors: 0.0032743759095488637
-- Pass @ 1: 0.005189605960633132
----- Avg lines: 6.295
----- Avg loops: 0.5475
-- Pass @ 2: 0.00

In [11]:
#Calculate the CodeBLEU scores for generated solutions, using sample solutions from APPS data as references
import CodeBLEU.bleu as bleu
import CodeBLEU.weighted_ngram_match as weighted_ngram_match
import CodeBLEU.syntax_match as syntax_match
import CodeBLEU.dataflow_match as dataflow_match
import json

lang = "python"
# Weights of the four components, in order: n-gram match, weighted n-gram
# match, syntax match, dataflow match.
alpha,beta,gamma,theta = [0.1, 0.1, 0.4, 0.4]

codes_list = ["all_codes_nl_interview_0.json", "all_codes_nl_interview_025.json", "all_codes_nl_interview_075.json", "all_codes_nucleus_interview_095.json",
              "all_codes_gpt2_nl_interview_0.json", "all_codes_gpt2_nl_interview_025.json", "all_codes_gpt2_nl_interview_075.json", "all_codes_gpt2_nucleus_interview_095.json",
              "all_codes_gpt2_astar_nl_interview_0.json", "all_codes_gpt2_astar_nl_interview_025.json", "all_codes_gpt2_astar_nl_interview_075.json",
              "all_codes_nl_interview_lambda_0_gamma_1.json", "all_codes_nl_interview_lambda_0_gamma_2.json", "all_codes_gpt2_nl_interview_lambda_0_gamma_1.json",
              "all_codes_gpt2_nl_interview_lambda_0_gamma_2.json", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_1.json", "all_codes_gpt2_astar_nl_interview_lambda_0_gamma_2.json"]

# The keyword list is the same for every candidate, so it is read once here
# (and the file is closed) rather than reopened for each scored program.
with open('CodeBLEU/keywords/'+lang+'.txt', 'r', encoding='utf-8') as keyword_file:
    keywords = [x.strip() for x in keyword_file.readlines()]

def make_weights(reference_tokens, key_word_list):
    """Map each reference token to weight 1 if it is a keyword, else 0.2."""
    return {token:1 if token in key_word_list else 0.2 \
            for token in reference_tokens}

for code in codes_list:
    with open("test_dirs.json") as test_json:
        with open("results/" + code) as code_json:
            json_data = json.load(test_json)
            code_data = json.load(code_json)

    best_scores = 0
    total_tests = 400
    for i in range(400):
        with open("APPS/test/" + json_data[i] + "/solutions.json") as sol_json:
            solution_data = json.load(sol_json)
        generations = code_data[str(i)]

        # Problems without any reference solution are left out of the average.
        if len(solution_data) == 0:
            total_tests -= 1
            continue

        best_score = 0
        # Score at most the first four candidates. Slicing, unlike indexing
        # with range(4), does not fail when fewer than four were generated.
        for candidate in generations[:4]:
            hypothesis = [candidate]

            # One hypothesis per call, scored against every reference solution
            # of this problem.
            references = [[reference for reference in solution_data]]

            # calculate ngram match (BLEU)
            tokenized_hyps = [x.split() for x in hypothesis]
            tokenized_refs = [[x.split() for x in reference] for reference in references]

            ngram_match_score = bleu.corpus_bleu(tokenized_refs,tokenized_hyps)

            # calculate weighted ngram match
            tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)]\
                        for reference_tokens in reference] for reference in tokenized_refs]
            weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights,tokenized_hyps)

            # calculate syntax match
            syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang)

            # calculate dataflow match
            dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang)

            code_bleu_score = alpha*ngram_match_score\
                            + beta*weighted_ngram_match_score\
                            + gamma*syntax_match_score\
                            + theta*dataflow_match_score
            if code_bleu_score > best_score:
                best_score = code_bleu_score

        # Only the best-scoring candidate of each problem counts.
        best_scores += best_score

    print(code + ": " + str(best_scores/total_tests))
                
                        
                    
        

all_codes_nl_interview_0.json: 0.2373582374276232
all_codes_nl_interview_025.json: 0.2346343947542429
all_codes_nl_interview_075.json: 0.23406250887693003
all_codes_nucleus_interview_095.json: 0.2885924421515881
all_codes_gpt2_nl_interview_0.json: 0.2901704285958109
all_codes_gpt2_nl_interview_025.json: 0.2901704285958109
all_codes_gpt2_nl_interview_075.json: 0.2901704285958109
all_codes_gpt2_nucleus_interview_095.json: 0.31954536236792686
all_codes_gpt2_astar_nl_interview_0.json: 0.2906287576009002
all_codes_gpt2_astar_nl_interview_025.json: 0.2894565783988229
all_codes_gpt2_astar_nl_interview_075.json: 0.2894326501847852
all_codes_nl_interview_lambda_0_gamma_1.json: 0.23522113290925048
all_codes_nl_interview_lambda_0_gamma_2.json: 0.2351692226766906
all_codes_gpt2_nl_interview_lambda_0_gamma_1.json: 0.2889940632644226
all_codes_gpt2_nl_interview_lambda_0_gamma_2.json: 0.2889250361321371
all_codes_gpt2_astar_nl_interview_lambda_0_gamma_1.json: 0.289503791498633
all_codes_gpt2_astar_nl

In [None]:
#Generate sample programs from CodeT5 and GPT-2
from neurologic_astar.generate import generate
from neurologic_astar.utils import tokenize_constraints
from neurologic_astar.lexical_constraints import init_batch

model.model.eval()
model.model.to(device)

gpt_model.eval()
gpt_model.to(device=device, dtype=torch.float16, non_blocking=False, memory_format=torch.preserve_format)

loader = data.DataLoader(test_data, batch_size=1, num_workers=4)
gpt_loader = data.DataLoader(gpt_test_data, batch_size=1, num_workers=2)

# First token id of "\n", "for" and "while" under each model's tokenizer.
new_line_token = tokenizer.encode("\n", add_special_tokens=False)[0]
loop_tokens = [tokenizer.encode(keyword, add_special_tokens=False)[0] for keyword in ("for", "while")]

gpt_loop_tokens = [gpt_tokenizer.encode(keyword, add_special_tokens=False)[0] for keyword in ("for", "while")]
gpt_new_line_token = gpt_tokenizer.encode("\n", add_special_tokens=False)[0]

codet5_sample = None
gpt2_sample_beam = None
gpt2_sample_astar = None

# CodeT5: one nucleus-sampled program for the batch at index 318.
for batch_idx, batch in enumerate(loader):
    if batch_idx != 318:
        continue

    outs = model.model.generate(
        batch["source_ids"].to(device),
        attention_mask=batch['source_mask'].to(device),
        do_sample=True,
        max_length=400,
        top_p=0.95,
        num_return_sequences=1,
    )
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    codet5_sample = dec[0]
    break

# GPT-2: the same batch index decoded with Neurologic A* and with the custom
# beam search; the candidate at index 3 of each result is kept.
for batch_idx, batch in enumerate(gpt_loader):
    if batch_idx != 318:
        continue

    constraints_list = tokenize_constraints(
        gpt_tokenizer,
        read_constraints("neurologic_astar/dummy_constraint.json"),
    )
    key_constraints_list = constraints_list
    eos_ids = [gpt_tokenizer.eos_token_id]
    constraints = init_batch(
        raw_constraints=constraints_list,
        key_constraints=key_constraints_list,
        beam_size=4,
        eos_id=eos_ids,
    )

    outs = generate(
        gpt_model,
        batch["source_ids"][0].to(device),
        max_length=400 + len(batch["source_ids"][0][0].to(device)),
        num_beams=4,
        lambd=0.75,
        gamma=0,
        new_line_token=gpt_new_line_token,
        loop_tokens=gpt_loop_tokens,
        look_ahead_step=5,
        look_ahead_width=1,
        constraints=constraints,
        prune_factor=50,
        alpha=.5,
        sat_tolerance=0,
        num_return_sequences=4,
    )

    beam_scorer = CodeBeamSearchScorer(
        batch_size=1,
        num_beams=4,
        num_beam_hyps_to_keep=4,
        lambd=0,
        gamma=0,
        device=gpt_model.device,
        new_line_token=gpt_new_line_token,
        loop_tokens=gpt_loop_tokens,
    )

    outs_beam = generate_beam_search(
        gpt_model,
        batch["source_ids"][0].to(device),
        max_length=400 + len(batch["source_ids"][0][0].to(device)),
        num_beams=4,
        beam_scorer=beam_scorer,
    )

    # Keep only the text after the final "ANSWER:\n" marker of every candidate.
    dec = [gpt_tokenizer.decode(ids, skip_special_tokens=True).split("ANSWER:\n")[-1] for ids in outs]
    dec_beam = [gpt_tokenizer.decode(ids, skip_special_tokens=True).split("ANSWER:\n")[-1] for ids in outs_beam]

    gpt2_sample_astar = dec[3]
    gpt2_sample_beam = dec_beam[3]
    break

In [27]:
# Show the GPT-2 program kept from the generate_beam_search call in the sample-generation cell.
print(gpt2_sample_beam)

import sys

def main():
	n, m = map(int, sys.stdin.readline().split())
	ww = [list(map(int, sys.stdin.readline().split())) for _ in range(n)]
	ww.sort(key=lambda x: x[0], reverse=True)
	m = ww[0][0]
	ans = 0
	for w, c in ww:
		if c > m:
			break
		ans += c
	print(ans)


def __starting_point():
	main()

__starting_point()



In [28]:
# Show the GPT-2 program kept from the Neurologic A* generate call in the sample-generation cell.
print(gpt2_sample_astar)

import sys

def main():
	n, m = map(int, sys.stdin.readline().split())
	ww = [list(map(int, sys.stdin.readline().split())) for _ in range(n)]
	ww.sort(key=lambda x: x[0], reverse=True)
	ans = 0
	for w, c in ww:
		if c > m:
			break
		ans += c
	print(ans)

main()



In [29]:
# Show the CodeT5 program produced by top-p sampling in the sample-generation cell.
print(codet5_sample)

n, m = map(int, input().split())
a = [int(x) for x in input().split()]
d = [[0] * n for x in range(m)]
b = [0] * n
d[0][n] = sum(a)
#print(d[1])
#print(d[1][0])
for i in range(1, n+1):
    a[i][i] = max(d[i][0], a[i-1][i])
for i in range(n):
    a[i][i] += d[i-1][i-1]
print(b[n][m])
