In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

# The tokenizer and the masked-LM weights are loaded from the same checkpoint
# name, so it is written once and reused for both.
MODEL_NAME = "neulab/codebert-python"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

# `fill_mask(text)` is the callable every later cell uses to query the model.
fill_mask = pipeline(task="fill-mask", tokenizer=tokenizer, model=model)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single-mask query: which token should join the two parenthesised conditions?
outputs = fill_mask("if (x is not None) <mask> (x > 0)")
# One prediction per line; `end=""` avoids an extra trailing newline.
print("".join(f"{prediction}\n" for prediction in outputs), end="")

{'score': 0.875249445438385, 'token': 8, 'token_str': ' and', 'sequence': 'if (x is not None) and (x > 0)'}
{'score': 0.017183667048811913, 'token': 50, 'token_str': ' or', 'sequence': 'if (x is not None) or (x > 0)'}
{'score': 0.013177888467907906, 'token': 463, 'token_str': 'and', 'sequence': 'if (x is not None)and (x > 0)'}
{'score': 0.012697670608758926, 'token': 671, 'token_str': ' return', 'sequence': 'if (x is not None) return (x > 0)'}
{'score': 0.010224265046417713, 'token': 48200, 'token_str': ' &&', 'sequence': 'if (x is not None) && (x > 0)'}


In [3]:
# Two masks in one query: the result is iterated as one group of predictions
# per mask position.
outputs = fill_mask("if var1 <mask> <mask> None:")
for mask_predictions in outputs:
    # Every prediction ends its own line; print()'s own newline then supplies
    # the blank separator line between groups.
    print("".join(f"{prediction}\n" for prediction in mask_predictions))

{'score': 0.9669309854507446, 'token': 16, 'token_str': ' is', 'sequence': '<s>if var1 is<mask> None:</s>'}
{'score': 0.00770614156499505, 'token': 328, 'token_str': '!', 'sequence': '<s>if var1!<mask> None:</s>'}
{'score': 0.002733553759753704, 'token': 28696, 'token_str': ' <', 'sequence': '<s>if var1 <<mask> None:</s>'}
{'score': 0.002088442211970687, 'token': 50118, 'token_str': '\n', 'sequence': '<s>if var1\n<mask> None:</s>'}
{'score': 0.0018904786556959152, 'token': 35, 'token_str': ':', 'sequence': '<s>if var1:<mask> None:</s>'}

{'score': 0.9925784468650818, 'token': 45, 'token_str': ' not', 'sequence': '<s>if var1<mask> not None:</s>'}
{'score': 0.001338448142632842, 'token': 5214, 'token_str': '=', 'sequence': '<s>if var1<mask>= None:</s>'}
{'score': 0.0013240614207461476, 'token': 16, 'token_str': ' is', 'sequence': '<s>if var1<mask> is None:</s>'}
{'score': 0.0009433833765797317, 'token': 49333, 'token_str': '!=', 'sequence': '<s>if var1<mask>!= None:</s>'}
{'score': 0.000

In [4]:
# Mask the callee itself: what name does the model expect in front of "( x )"?
outputs = fill_mask("<mask> ( x )")
# One prediction per line; `end=""` avoids an extra trailing newline.
print("".join(f"{prediction}\n" for prediction in outputs), end="")

{'score': 0.1390775889158249, 'token': 17265, 'token_str': 'print', 'sequence': 'print ( x )'}
{'score': 0.09198562055826187, 'token': 5780, 'token_str': ' print', 'sequence': ' print ( x )'}
{'score': 0.03168381378054619, 'token': 41975, 'token_str': 'import', 'sequence': 'import ( x )'}
{'score': 0.01466455589979887, 'token': 1423, 'token_str': ' y', 'sequence': ' y ( x )'}
{'score': 0.013982338830828667, 'token': 37131, 'token_str': ' eval', 'sequence': ' eval ( x )'}


In [5]:
from collections import defaultdict
from tokenize import generate_tokens

def get_best_output(outputs):
    """Return the most probable replacement token and its total score.

    Predictions whose ``token_str`` differ only by surrounding whitespace
    (e.g. ``" and"`` and ``"and"``) are treated as the same token and their
    scores are added together.

    Args:
        outputs: iterable of dicts, each with a ``"token_str"`` string and a
            numeric ``"score"``.

    Returns:
        A ``(token, total_score)`` tuple for the token with the highest summed
        score. On a tie the token seen first wins.

    Raises:
        ValueError: if ``outputs`` is empty.
    """
    totals = {}
    for candidate in outputs:
        label = candidate["token_str"].strip()
        totals[label] = totals.get(label, 0.0) + candidate["score"]
    # max() walks keys in insertion order, so the earliest token wins ties.
    winner = max(totals, key=totals.get)
    return winner, totals[winner]

In [6]:
from collections import defaultdict

# Mask each token of one source line in turn and report the tokens for which
# the model confidently prefers something else.
text = """if (x is not None) or (x > 0):
    return x
"""

line = 0  # center of attention: 0-based index of the line to inspect
MIN_CONFIDENCE = 0.8  # a change is only suggested when the summed score exceeds this

lines = [raw + "\n" for raw in text.split("\n")]
N = len(lines)

# generate_tokens expects a readline-style callable that returns "" at end of
# input, so the iterator is drained with an explicit default instead of
# letting StopIteration escape.
line_iter = iter(lines)
tokens = list(generate_tokens(lambda: next(line_iter, "")))

# Bucket tokens by the 0-based row on which they start.
filtered_tokens = defaultdict(list)
for token in tokens:
    filtered_tokens[token.start[0] - 1].append(token)

# Line lengths come from the raw lines, not from token columns: a row on which
# no token starts (e.g. the inside of a triple-quoted string) has no tokens to
# measure, and a token spanning several rows reports the column of its last row.
line_lengths = [len(raw) for raw in lines]

# cumulative_lengths[i] is the character offset in `text` where line i begins.
cumulative_lengths = []
offset = 0
for length in line_lengths:
    cumulative_lengths.append(offset)
    offset += length

curr_tokens = filtered_tokens[line]
curr_token_strings = [tok.string for tok in curr_tokens]
print(" ".join(curr_token_strings).strip())

# Each entry is ((original_token, start_offset, end_offset), replacement).
suggestions = []
for i, token in enumerate(curr_tokens):
    prev = token.string
    if not prev.strip():
        # Whitespace-only tokens (NEWLINE, INDENT, ...) carry no text that
        # could be replaced, so no model call is spent on them.
        continue

    # Build the query from a copy so the real token list is never disturbed.
    masked_strings = list(curr_token_strings)
    masked_strings[i] = "<mask>"
    string = " ".join(masked_strings).strip()

    outputs = fill_mask(string)
    best_output, best_prob = get_best_output(outputs)
    if best_output.strip() != prev.strip() and best_prob > MIN_CONFIDENCE:
        print("CHANGE {", prev, "} to {", best_output, "}")
        # Translate (row, column) token positions into offsets within `text`.
        start = cumulative_lengths[token.start[0] - 1] + token.start[1]
        end = cumulative_lengths[token.end[0] - 1] + token.end[1]
        suggestions.append(((prev, start, end), best_output))

print("DONE")

if ( x is not None ) or ( x > 0 ) :
CHANGE { or } to { and }
DONE
