# **GraphCodeBERT Experiments**

1. [x] load GraphCodeBERT model
2. [x] run inference / get embeddings
3. [x] invert embeddings (decode them back to tokens with the LM head)
4. [x] edit embeddings
5. [x] invert edited embeddings

## References

* https://huggingface.co/microsoft/graphcodebert-base


## Environment

In [1]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.7.10


In [2]:
!pip install tokenizers
!pip install transformers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 6.0MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.2
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 4.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 40.3MB/s 
Installing collected packages: sacremoses, transformers
Successfully installed sacremoses-0.0.45 transformers-4.5.1


## Dependencies

In [3]:
import torch
import numpy as np
from scipy.special import softmax
from transformers import pipeline
from transformers import RobertaTokenizer, RobertaForMaskedLM

## Settings

In [4]:
# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
model_name = 'microsoft/graphcodebert-base'
# Use the first GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## 1. Load GraphCodeBERT model

In [5]:
%time model = RobertaForMaskedLM.from_pretrained(model_name)
%time tokenizer = RobertaTokenizer.from_pretrained(model_name)

model.to(device)
model.eval()

%time fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=630.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=498845934.0, style=ProgressStyle(descri…


CPU times: user 17.2 s, sys: 2.38 s, total: 19.6 s
Wall time: 21.9 s


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…


CPU times: user 484 ms, sys: 65.8 ms, total: 550 ms
Wall time: 7.79 s
CPU times: user 84 µs, sys: 15 µs, total: 99 µs
Wall time: 103 µs


## 2. Run inference

In [6]:
# Turn autograd off globally: the cells below only run forward passes.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f0637d72050>

In [7]:
# Two snippets that differ only in the arithmetic operator.
text1, text2 = 'x = a + b', 'x = a - b'
texts = [text1, text2]

In [8]:
# Tokenize every snippet into PyTorch tensors, keyed by the source text.
tokens = {snippet: tokenizer(snippet, return_tensors='pt') for snippet in texts}

# Print each field of the encoding; for the ids, also show the token strings
# and the text obtained by decoding them again.
for text, tokens_pt in tokens.items():
  print(f'text: >{text}<')
  for key, value in tokens_pt.items():
    print(f'\t{key}: {value}')
    if key != 'input_ids':
      continue
    token_strings = [tokenizer.convert_ids_to_tokens(s) for s in value]
    print(f'\t\ttokens (str): {token_strings}')
    decoded = [tokenizer.decode(v) for v in value]
    print(f'\t\t#decoding: {decoded}')

text: >x = a + b<
	input_ids: tensor([[   0, 1178, 5457,   10, 2055,  741,    2]])
		tokens (str): [['<s>', 'x', 'Ġ=', 'Ġa', 'Ġ+', 'Ġb', '</s>']]
		#decoding: ['<s>x = a + b</s>']
	attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1]])
text: >x = a - b<
	input_ids: tensor([[   0, 1178, 5457,   10,  111,  741,    2]])
		tokens (str): [['<s>', 'x', 'Ġ=', 'Ġa', 'Ġ-', 'Ġb', '</s>']]
		#decoding: ['<s>x = a - b</s>']
	attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1]])


In [9]:
# Look the encoding up by its key so this cell does not depend on a variable
# left behind by a loop in another cell.
tokens[text2]

{'input_ids': tensor([[   0, 1178, 5457,   10,  111,  741,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [11]:
embeddings = {}
for text, tokens_pt in tokens.items():

  #%time output = fill_mask.model.roberta(**tokens_pt)
  %time output = fill_mask.model.roberta(input_ids=tokens_pt['input_ids'].to(device), attention_mask=tokens_pt['attention_mask'].to(device))

  last_hidden_state = output.last_hidden_state
  print(last_hidden_state.shape)

  embeddings[text] = last_hidden_state

CPU times: user 15.9 ms, sys: 0 ns, total: 15.9 ms
Wall time: 15.9 ms
torch.Size([1, 7, 768])
CPU times: user 16.2 ms, sys: 2.16 ms, total: 18.4 ms
Wall time: 16.9 ms
torch.Size([1, 7, 768])


## 3. Invert embeddings

In [12]:
def print_top_tokens(hidden_states, top_k=10):
  """Map hidden states back to tokens with the masked-LM head and print the best candidates.

  For every sequence position of the first batch element, prints one list of
  ``(decoded token, probability)`` pairs, most probable first, with the
  probabilities rounded to three decimals.

  Args:
    hidden_states: tensor fed to ``fill_mask.model.lm_head``; assumed to be
      (batch, sequence, hidden) like the encoder outputs stored in `embeddings`.
    top_k: number of candidates printed per position.
  """
  lm_head_output = fill_mask.model.lm_head(hidden_states)
  # One device-to-host copy and one softmax over the vocabulary axis for all
  # positions, instead of repeating both for every token.
  probs_per_position = softmax(lm_head_output[0].detach().cpu().numpy(), axis=-1)

  for probs in probs_per_position:
    # Sort descending first, then truncate, so top_k=0 yields an empty list.
    indices = probs.argsort()[::-1][:top_k]
    print([(tokenizer.decode(int(idx)), round(probs[idx], 3)) for idx in indices])


for text, embedding in embeddings.items():
  print(text)
  print_top_tokens(embedding)

x = a + b
[(' )', 0.193), (' a', 0.144), (' +', 0.111), (' b', 0.069), ('.', 0.052), (' :', 0.034), (' c', 0.032), (' >', 0.028), (' add', 0.024), (' x', 0.023)]
[('x', 1.0), (' x', 0.0), ('xy', 0.0), ('y', 0.0), ('xc', 0.0), ('p', 0.0), ('m', 0.0), ('b', 0.0), ('xa', 0.0), ('w', 0.0)]
[(' =', 1.0), (' +=', 0.0), (' +', 0.0), (' ==', 0.0), (' :', 0.0), (' [', 0.0), ('=', 0.0), (' as', 0.0), (' *', 0.0), ('.', 0.0)]
[(' a', 1.0), (' b', 0.0), (' ax', 0.0), (' x', 0.0), (' A', 0.0), (' ab', 0.0), (' an', 0.0), ('a', 0.0), (' e', 0.0), (' à', 0.0)]
[(' +', 1.0), (' plus', 0.0), (' *', 0.0), (' =', 0.0), (' +=', 0.0), ('Plus', 0.0), (' :', 0.0), ('+', 0.0), (' Plus', 0.0), ('.', 0.0)]
[(' b', 1.0), (' B', 0.0), (' a', 0.0), ('b', 0.0), (' y', 0.0), (' ab', 0.0), (' c', 0.0), (' 1', 0.0), (' bar', 0.0), (' 2', 0.0)]
[(' )', 0.194), (' a', 0.144), (' +', 0.111), (' b', 0.069), ('.', 0.052), (' :', 0.034), (' c', 0.032), (' >', 0.028), (' add', 0.024), (' x', 0.023)]
x = a - b
[(' -', 0.197),

## 4. Edit embeddings

In [13]:
# Element-wise midpoint between the hidden states of the "+" and the "-" snippet.
plus_embeddings, minus_embeddings = embeddings[text1], embeddings[text2]
new_embeddings = (plus_embeddings + minus_embeddings) / 2

## 5. Invert edited embeddings

In [14]:
# Project the edited embeddings through the LM head to obtain vocabulary logits.
lm_head_output = fill_mask.model.lm_head(new_embeddings)

# For each position, print the ten most probable tokens with their probabilities.
for position_logits in lm_head_output[0]:
  probs = softmax(position_logits.detach().cpu().numpy())
  top10 = probs.argsort()[-10:][::-1]
  ranked = [(tokenizer.decode(int(idx)), round(probs[idx], 3)) for idx in top10]
  print(ranked)

[(' )', 0.243), (' >', 0.072), (' a', 0.069), (' :', 0.065), ('.', 0.053), (' +', 0.052), (' b', 0.048), (' ]', 0.031), (' c', 0.029), (' }', 0.026)]
[('x', 1.0), (' x', 0.0), ('y', 0.0), ('xc', 0.0), ('xy', 0.0), ('m', 0.0), ('b', 0.0), ('p', 0.0), ('w', 0.0), ('xa', 0.0)]
[(' =', 1.0), (' :', 0.0), (' +', 0.0), (' +=', 0.0), (' [', 0.0), (' ==', 0.0), (' <', 0.0), (' -', 0.0), ('=', 0.0), (' *', 0.0)]
[(' a', 1.0), (' b', 0.0), (' ax', 0.0), (' x', 0.0), (' A', 0.0), (' ab', 0.0), (' an', 0.0), (' à', 0.0), ('a', 0.0), (' y', 0.0)]
[(' +', 0.992), (' -', 0.008), (' *', 0.0), (' =', 0.0), (' ^', 0.0), ('.', 0.0), (' :', 0.0), (' ||', 0.0), (' /', 0.0), (' ;', 0.0)]
[(' b', 1.0), (' a', 0.0), (' B', 0.0), ('b', 0.0), (' y', 0.0), (' 1', 0.0), (' ab', 0.0), (' c', 0.0), (' bar', 0.0), (' 2', 0.0)]
[(' )', 0.243), (' >', 0.072), (' a', 0.069), (' :', 0.065), ('.', 0.053), (' +', 0.052), (' b', 0.048), (' ]', 0.031), (' c', 0.029), (' }', 0.026)]
