In [1]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

In [2]:
class LoadData(Dataset):
    def __init__(self, file_path, tokenizer, max_lenth = 512):
        self.data = self.load_jsonl(file_path)
        self.tokenizer = tokenizer
        self.max_lenth = max_lenth

    def load_jsonl(self, file_path):
        with open(file_path, 'r', encoding = 'utf-8') as f:
            return [json.loads(line) for line in f]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        code = row['method_code']
        summary = row['method_summary']

        inputs = self.tokenizer(code, max_length = self.max_lenth, padding = 'max_length', truncation = True, return_tensors = 'pt')
        labels = self.tokenizer(summary, padding = 'max_length', truncation = True, return_tensors = 'pt')

        return {
            "input_ids": inputs['input_ids'].squeeze(0),
            "attention_mask": inputs['attention_mask'].squeeze(0),
            "labels": labels['input_ids'].squeeze(0)
        }

In [3]:
from transformers import AutoModelForSeq2SeqLM

In [4]:
model_name = "Salesforce/codet5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
from torch.utils.data import DataLoader

In [9]:
file_path = "/content/method-level-mcsn.jsonl"
dataset = LoadData(file_path, tokenizer)
dataloader = DataLoader(dataset, batch_size = 64, shuffle = True)

In [10]:
def generate_summary(batch, model, tokenizer, max_length=512):

    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids=input_ids, attention_mask=attention_mask,
            max_length=max_length, num_beams=5, early_stopping=True
        )

    summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    return summaries

# Run inference on a small batch
batch = next(iter(dataloader))
generated_summaries = generate_summary(batch, model, tokenizer)

In [13]:
import pandas as pd

results_df = pd.DataFrame({
    "Method Code": [dataset.data[i]["method_code"] for i in range(len(generated_summaries))],
    "Original Summary": [dataset.data[i]["method_summary"] for i in range(len(generated_summaries))],
    "Generated Summary": generated_summaries,
})

results_df.head()

Unnamed: 0,Method Code,Original Summary,Generated Summary
0,"def _inv_z(self, z):\n """"""""""""\n with tf....",Reconstruct input `x` from a its normalized ve...,return True\n
1,def semilocal_linear_trend_transition_matrix(a...,Build the transition matrix for a semi-local l...,list_json
2,def semilocal_linear_trend_transition_noise(le...,Build the transition noise model for a semi-lo...,"r""[a-zA-Z_][0-9a-zA-Z_]*|[a-zA-Z][0-9a-zA-Z_]*..."
3,"def _randomize(coeffs, radixes, seed=None):\n ...",Applies the Owen (2017) randomization to the c...,'list_json'
4,"def _base_expansion_size(num, bases):\n """"""""""...",Computes the number of terms in the place valu...,__future__
