# Fine-tune CodeT5 for generating docstrings from Ruby code

In [None]:
# Mount Google Drive; later cells read and write checkpoints under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q transformers datasets

In [None]:
!pip install -q pytorch-lightning wandb

In [None]:
from datasets import load_dataset

# Load the "ruby" configuration of code_x_glue_ct_code_to_text and list its splits.
dataset = load_dataset("code_x_glue_ct_code_to_text", "ruby")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 24927
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 1261
    })
})


In [None]:
# Peek at the first training pair to see what the raw code and docstring look like.
example = dataset['train'][0]

print('Code: ' + example['code'])
print('Docstring: ' + example['docstring'])

Code: def handle_parsed_websocket_message(json_data)
      data =  json_data.is_a?(Hash) ? json_data.stringify_keys : {}
      if CelluloidPubsub::Reactor::AVAILABLE_ACTIONS.include?(data['client_action'].to_s)
        log_debug "#{self.class} finds actions for  #{json_data}"
        delegate_action(data) if data['client_action'].present?
      else
        handle_unknown_action(data['channel'], json_data)
      end
    end
Docstring method that checks if the data is a Hash

 if the data is a hash then will stringify the keys and will call the method {#delegate_action}
 that will handle the message, otherwise will call the method {#handle_unknown_action}

 @see #delegate_action
 @see #handle_unknown_action

 @param [Hash] json_data

 @return [void]

 @api public


In [None]:
from transformers import RobertaTokenizer

# Load the tokenizer published with the Salesforce/codet5-small checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

# Task prefix prepended to every code snippet before tokenization.
prefix = "Summarize Ruby: "
# Sequences are padded/truncated to these token counts in preprocess_examples.
max_input_length = 512
max_target_length = 512

def preprocess_examples(examples):
  """Tokenize a batch of code/docstring pairs for seq2seq fine-tuning.

  Relies on the module-level `tokenizer`, `prefix`, `max_input_length` and
  `max_target_length`.

  Args:
    examples: batch mapping with a 'code' list and a 'docstring' list.

  Returns:
    The tokenizer output for the prefixed code, with an added "labels" entry
    holding the docstring token ids in which padding is replaced by -100.
  """
  # encode the code-docstring pairs
  inputs = [prefix + code for code in examples['code']]
  model_inputs = tokenizer(inputs, max_length=max_input_length, padding='max_length', truncation=True)

  # encode the summaries
  labels = tokenizer(examples['docstring'], max_length=max_target_length, padding='max_length', truncation=True).input_ids

  # important: we need to replace the index of the padding tokens by -100
  # such that they are not taken into account by the CrossEntropyLoss.
  # Ask the tokenizer for its pad id instead of assuming it is 0.
  pad_token_id = tokenizer.pad_token_id
  model_inputs["labels"] = [
      [label if label != pad_token_id else -100 for label in labels_example]
      for labels_example in labels
  ]

  return model_inputs

In [None]:
# Tokenize every split in batches; this adds the input_ids, attention_mask and labels columns.
dataset = dataset.map(preprocess_examples, batched=True)

In [None]:
# Display the splits to confirm the tokenized columns were added.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 24927
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1261
    })
})

In [None]:
from torch.utils.data import DataLoader

# Expose only the three model columns as torch tensors; other columns are left out of batches.
dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=8)
valid_dataloader = DataLoader(dataset['validation'], batch_size=4)
test_dataloader = DataLoader(dataset['test'], batch_size=4)

In [None]:
# Pull a single training batch to sanity-check which fields it contains.
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
# Decode the first encoded input of the batch back to text (special tokens are kept).
tokenizer.decode(batch['input_ids'][0])

"<s>Summarize Ruby: def generate_deliver_file(deliver_path, options)\n      v = options[:app].latest_version\n      metadata_path = options[:metadata_path] || File.join(deliver_path,'metadata')\n      generate_metadata_files(v, metadata_path)\n\n      # Generate the final Deliverfile here\n      return File.read(deliverfile_path)\n    end</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [None]:
# Decode the first target, dropping the -100 positions that replaced padding.
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

"<s>This method takes care of creating a new 'deliver' folder, containing the app metadata\n and screenshots folders</s>"

In [None]:
# AdamW comes from torch.optim: the transformers re-implementation is deprecated
# and no longer shipped in recent releases.
from torch.optim import AdamW
from transformers import T5ForConditionalGeneration, get_linear_schedule_with_warmup
import pytorch_lightning as pl

class CodeT5(pl.LightningModule):
    """LightningModule wrapping Salesforce/codet5-small for code-to-docstring fine-tuning.

    Args:
        lr: learning rate handed to AdamW.
        num_train_epochs: number of epochs the linear LR schedule is spread over.
        warmup_steps: number of warm-up steps of the LR schedule.

    The dataloaders are the notebook-level `train_dataloader`, `valid_dataloader`
    and `test_dataloader` objects, which must exist before training starts.
    """

    def __init__(self, lr=5e-5, num_train_epochs=15, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")
        # stores lr / num_train_epochs / warmup_steps under self.hparams
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output object."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def common_step(self, batch, batch_idx):
        """Forward a batch (input_ids, attention_mask, labels) and return its loss."""
        outputs = self(**batch)
        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self.common_step(batch, batch_idx)
        # logged with Lightning's training_step defaults (no on_epoch flag is passed here)
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss; the epoch-level value feeds early stopping."""
        loss = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch."""
        loss = self.common_step(batch, batch_idx)
        # log it so that trainer.test() actually reports a metric
        self.log("test_loss", loss)

        return loss

    def configure_optimizers(self):
        """Create AdamW plus a linear warm-up/decay schedule stepped every batch."""
        # create optimizer; eps and weight_decay are set explicitly to the defaults of
        # the transformers AdamW used previously, since torch.optim.AdamW defaults differ
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr, eps=1e-6, weight_decay=0.0)
        # create learning rate scheduler
        # NOTE: `train_dataloader` here is the notebook-level DataLoader, not the method below
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        """Return the notebook-level training DataLoader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation DataLoader."""
        return valid_dataloader

    def test_dataloader(self):
        """Return the notebook-level test DataLoader."""
        return test_dataloader

In [None]:
import wandb

# Authenticate with Weights & Biases before the WandbLogger is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mngaytanthe3579[0m ([33mhieund20052003[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Instantiate the LightningModule with its default hyperparameters.
model = CodeT5()

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

wandb_logger = WandbLogger(name='codet5-finetune-code-summarization-ruby-shuffle', project='CodeT5')
# for early stopping, see https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping.html?highlight=early%20stopping
# stop once validation_loss has not improved for 3 consecutive validation runs
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(devices=1,
                  # Cap training at the epoch budget the LR schedule was built for.
                  # Without this, Lightning defaults to 1000 epochs while the linear
                  # schedule has already decayed to zero after num_train_epochs.
                  max_epochs=model.hparams.num_train_epochs,
                  default_root_dir="/content/drive/MyDrive/CodeT5/Notebooks/Checkpoints",
                  logger=wandb_logger,
                  callbacks=[early_stop_callback, lr_monitor])
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable par

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
# Export the wrapped Hugging Face model and the tokenizer to the same Drive folder
# so that both can be reloaded with from_pretrained.
# NOTE(review): this saves the weights currently held in memory, not a Lightning checkpoint.
model.model.save_pretrained("/content/drive/MyDrive/GR1/checkpoint/model")
tokenizer.save_pretrained("/content/drive/MyDrive/GR1/checkpoint/model")

('/content/drive/MyDrive/GR1/checkpoint/model/tokenizer_config.json',
 '/content/drive/MyDrive/GR1/checkpoint/model/special_tokens_map.json',
 '/content/drive/MyDrive/GR1/checkpoint/model/vocab.json',
 '/content/drive/MyDrive/GR1/checkpoint/model/merges.txt',
 '/content/drive/MyDrive/GR1/checkpoint/model/added_tokens.json')

# Inference

In [None]:
from datasets import load_dataset

# Reload the raw dataset; this rebinds `dataset`, replacing the tokenized version used for training.
dataset = load_dataset("code_x_glue_ct_code_to_text", "ruby")
print(dataset['test'])

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
    num_rows: 1261
})


In [None]:
# Pick the second example of the test split for a qualitative check.
test_example = dataset['test'][1]
print("Code:", test_example['code'])

Code: def find_bad_files_from_kubectl_output(line)
      # stderr often contains one or more lines like the following, from which we can extract the file path(s):
      # Error from server (TypeOfError): error when creating "/path/to/service-gqq5oh.yml": Service "web" is invalid:

      line.scan(%r{"(/\S+\.ya?ml\S*)"}).each_with_object([]) do |matches, bad_files|
        matches.each do |path|
          content = File.read(path) if File.file?(path)
          bad_files << { filename: File.basename(path), err: line, content: content }
        end
      end
    end


In [None]:
from transformers import T5ForConditionalGeneration, RobertaTokenizer

# Base (not fine-tuned) tokenizer and model from the Hub.
# NOTE(review): this rebinds `model`, which earlier held the CodeT5 LightningModule.
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")
# Fine-tuned weights and tokenizer, read from the Drive folder the training section saved to.
finetune_model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/GR1/checkpoint/model")
finetune_tokenizer = RobertaTokenizer.from_pretrained("/content/drive/MyDrive/GR1/checkpoint/model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# prepare for the model: prepend the same task prefix that was used for fine-tuning
input_ids = finetune_tokenizer('Summarize Ruby: ' +  test_example['code'], return_tensors='pt').input_ids
# generate (max_length=20 caps the length of the generated sequence)
outputs = finetune_model.generate(input_ids, max_length=20)
# decode and show next to the reference docstring
print("Ground truth:", test_example['docstring'])
print("Generated docstring:", finetune_tokenizer.decode(outputs[0], skip_special_tokens=True))

Ground truth: Inspect the file referenced in the kubectl stderr
 to make it easier for developer to understand what's going on
Generated docstring: Find bad files from the kubeadmectl output


# Eval

In [None]:
# Work from the evaluation folder: the files written below and evaluator.py are addressed relative to it.
%cd /content/drive/MyDrive/GR1/CodeT5-main/code-text-eval

/content/drive/MyDrive/GR1/CodeT5-main/code-text-eval


In [None]:
def postprocessing(text):
    """Flatten a docstring onto a single line.

    Lines containing '@param' or '@return' are dropped, the remaining lines are
    joined with single spaces, and every tab is turned into a space.
    """
    kept = []
    for row in text.splitlines():
        if '@param' in row or '@return' in row:
            continue
        kept.append(row)
    # splitlines() already removed the line breaks, so only tabs remain to be replaced
    return ' '.join(kept).replace('\t', ' ')

In [None]:
# Reload the raw dataset for evaluation.
dataset = load_dataset("code_x_glue_ct_code_to_text", "ruby")
print(dataset['test'])
# NOTE: `test_example` now holds the entire test split, not a single example as in the inference section.
test_example = dataset['test']

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
    num_rows: 1261
})


In [None]:
# Write one "<index>\t<text>" line per test example to predictions.txt and
# reference.txt, which are then passed to evaluator.py.
with open('predictions.txt', 'w') as pre, open('reference.txt', 'w') as ref:
    length = len(test_example)
    # iterate the split directly; each row is a dict of the raw string fields
    for i, row in enumerate(test_example, start=1):
        # Prediction: use the same task prefix as in fine-tuning and keep up to 512
        # input tokens (the training-time limit) so the model sees the method itself
        # instead of only its first 20 tokens.
        input_ids = finetune_tokenizer('Summarize Ruby: ' + row['code'], return_tensors="pt",
                                       max_length=512, truncation=True).input_ids
        generated_ids = finetune_model.generate(input_ids, max_length=20)
        result = finetune_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        # collapse whitespace so a prediction containing newlines or tabs cannot
        # break the one-record-per-line, tab-separated layout
        result = ' '.join(result.split())
        pre.write(str(i - 1) + '\t' + result + "\n")

        # Reference
        text = row['docstring'].rstrip('\n')
        text = postprocessing(text)
        ref.write(str(i - 1) + '\t' + text + "\n")

        if (i % 100) == 0:
            print(f'{i}/{length}')

100/1261
200/1261
300/1261
400/1261
500/1261
600/1261
700/1261
800/1261
900/1261
1000/1261
1100/1261
1200/1261


In [None]:
# Run the evaluation script with reference.txt as argument and predictions.txt on stdin.
!python evaluator.py reference.txt < predictions.txt

Total: 1261
9.600254059282024
