# Infilling with Incoder

*Authored by Arjun Guha with some guidance from Carolyn Jane Anderson and Daniel Fried.*
 
This is not a BigCode model, but BigCode does infilling similar to Incoder.

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

Left-side padding is essential for batching.

In [2]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "facebook/incoder-6B"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
# Pad on the left so that every prompt in a batch ends in the last column,
# which is where generation continues from.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
# Register "<pad>" as the padding token. Kept as the last expression of the
# cell so its return value is displayed as the cell output.
tokenizer.add_special_tokens({
    "pad_token": "<pad>"
})

0

In [3]:
def strip_left_padding(output_tensor, pad_token_id=None, bos_token_id=2):
    """
    Drop the leading padding / beginning-of-string tokens from one generated sequence.

    Since we are not using skip_special_tokens, when batching results of varying length,
    the output will contain <pad> tokens on the left. This code strips those out. It also
    strips out the <|endoftext|> token that marks the beginning of strings.

    Args:
        output_tensor: 1-D tensor of token ids for a single sequence.
        pad_token_id: id of the padding token. Defaults to `tokenizer.pad_token_id`.
        bos_token_id: id of the <|endoftext|> token that starts a string (2 by default).

    Returns:
        The slice of `output_tensor` starting at the first token that is neither padding
        nor the beginning-of-string token. The slice is empty when every token is one of
        those two.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    skipped_ids = (pad_token_id, bos_token_id)

    length = output_tensor.size(0)
    start_index = 0
    # The bound check keeps an all-padding sequence from indexing past the end.
    while start_index < length and output_tensor[start_index].item() in skipped_ids:
        start_index += 1
    return output_tensor[start_index:]

def extract_fim_part(s: str, prompt):
    """
    Return the generated infill from a decoded model output.

    This skips the prompt and extracts code up to <|endofmask|> or the end of string.

    Args:
        s: the decoded output; its first len(prompt) characters are skipped, so it is
            expected to start with the prompt.
        prompt: the prompt that was fed to the model.

    Returns:
        The text between the end of the prompt and the first <|endofmask|> that follows
        it, or everything after the prompt when no such marker is present.
    """
    start_index = len(prompt)
    # Search only after the prompt: a marker that happens to appear inside the
    # prompt itself must not truncate the result.
    stop_index = s.find("<|endofmask|>", start_index)
    if stop_index == -1:
        stop_index = len(s)
    return s[start_index:stop_index]

def infill(prefix_suffix_tuples, max_tokens: int = 50, temperature: float = 0.2, top_p: float = 0.95):
    """
    Generate the missing middle for one or more (prefix, suffix) pairs.

    Args:
        prefix_suffix_tuples: a single (prefix, suffix) tuple, or an iterable of such
            tuples that are processed together as one batch.
        max_tokens: maximum number of tokens generated beyond the padded prompt.
        temperature: sampling temperature passed to `model.generate`.
        top_p: nucleus-sampling threshold passed to `model.generate`.

    Returns:
        A list with one infilled string per input pair, in input order. An empty
        batch yields an empty list.
    """
    # isinstance rather than an exact type comparison, so tuple subclasses
    # (e.g. namedtuples) are also treated as a single pair.
    if isinstance(prefix_suffix_tuples, tuple):
        prefix_suffix_tuples = [prefix_suffix_tuples]

    prompts = [f"{prefix}<|mask:0|>{suffix}<|mask:1|><|mask:0|>" for prefix, suffix in prefix_suffix_tuples]
    if not prompts:
        # Nothing to generate, and there would be no row to measure the prompt length from.
        return []

    # `return_token_type_ids=False` is essential, or we get nonsense output.
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, return_token_type_ids=False).to(DEVICE)
    # Padding gives every row the same length, so one bound serves the whole batch.
    max_length = inputs.input_ids.size(1) + max_tokens
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            max_length=max_length,
        )

    # WARNING: cannot use skip_special_tokens, because it blows away the FIM special tokens.
    results = []
    for tensor, prompt in zip(outputs, prompts):
        decoded = tokenizer.decode(
            strip_left_padding(tensor),
            clean_up_tokenization_spaces=False,
            skip_special_tokens=False,
        )
        results.append(extract_fim_part(decoded, prompt))
    return results

In the example below, we infill a batch with two problems:
1.  The return type of factorial, and
2.  A problem from OpenAI HumanEval.

In [10]:
# Problem 1: the text before and after the hole. The hole is the return type
# annotation of `fac`.
prefix = """def fac(n) -> """

suffix = """:
    if n == 0:
        return 1
    else:
        return n * fac(n - 1)"""

# Problem 2: the prefix ends inside the body of the `for` loop and the suffix
# is the final `return`, so the hole is whatever belongs between them.
prefix1 =  "def int_to_mini_roman(number: int) -> str:\n    \"\"\"\n    Given a positive integer, obtain its roman numeral equivalent as a string,\n    and return it in lowercase.\n    Restrictions: 1 <= num <= 1000\n\n    Examples:\n    >>> int_to_mini_roman(19)\n    'xix'\n    >>> int_to_mini_roman(152)\n    'clii'\n    >>> int_to_mini_roman(426)\n    'cdxxvi'\n    \"\"\"\n    roman_numerals = {\n        1000: 'm',\n        900: 'cm',\n        500: 'd',\n        400: 'cd',\n        100: 'c',\n        90: 'xc',\n        50: 'l',\n        40: 'xl',\n        10: 'x',\n        9: 'ix',\n        5: 'v',\n        4: 'iv',\n        1: 'i'\n    }\n    roman_numeral = ''\n    for key in sorted(roman_numerals.keys(), reverse=True):\n        roman_numeral += roman_numerals[key] * (number // key)"
suffix1 =  "    return roman_numeral\n\n"

# Infill both problems in one batch; results come back in the same order as the inputs.
[middle, middle1] = infill([(prefix, suffix), (prefix1, suffix1)])

The result is valid Python, but it is a little repetitive. :)

In [11]:
# Show the first problem with the generated middle fenced by >>> and <<< markers.
print(prefix, ">>>>>>>>>>>>", middle, "<<<<<<<<<<<<<<<", suffix, sep="")

def fac(n) -> >>>>>>>>>>>>int:
def fac(n: int) -> int:
    if n == 0:
        return 1
    else:
        return n * fac(n - 1)
    
def fac(n: int) -> int:
    if n == 0:
        return 1
    else:
        return n * fac(n - 1)
    
def fac<<<<<<<<<<<<<<<:
    if n == 0:
        return 1
    else:
        return n * fac(n - 1)


For the second problem, the infilled code is exactly one line.

In [13]:
# Show the second problem with the generated middle fenced by >>> and <<< markers.
print(prefix1, ">>>>>>>>>>>>", middle1, "<<<<<<<<<<<<<<<", suffix1, sep="")

def int_to_mini_roman(number: int) -> str:
    """
    Given a positive integer, obtain its roman numeral equivalent as a string,
    and return it in lowercase.
    Restrictions: 1 <= num <= 1000

    Examples:
    >>> int_to_mini_roman(19)
    'xix'
    >>> int_to_mini_roman(152)
    'clii'
    >>> int_to_mini_roman(426)
    'cdxxvi'
    """
    roman_numerals = {
        1000: 'm',
        900: 'cm',
        500: 'd',
        400: 'cd',
        100: 'c',
        90: 'xc',
        50: 'l',
        40: 'xl',
        10: 'x',
        9: 'ix',
        5: 'v',
        4: 'iv',
        1: 'i'
    }
    roman_numeral = ''
    for key in sorted(roman_numerals.keys(), reverse=True):
        roman_numeral += roman_numerals[key] * (number // key)>>>>>>>>>>>>
        number %= key
<<<<<<<<<<<<<<<    return roman_numeral


