# Pretrain

In [None]:
DATAPATH = "/kaggle/input/codesearchnet/ruby/ruby/final/jsonl"

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
!pip install transformers 
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch.nn as nn 
import torch.nn.functional as F
from torch.optim import AdamW
!pip install contractions
import contractions
import os
import pyarrow.parquet as pq
import re
import time
import gc
from tqdm.notebook import tqdm
from itertools import filterfalse
from tqdm import trange
from transformers import AutoTokenizer, BertForPreTraining,BertForMaskedLM
from transformers import T5Tokenizer, T5ForConditionalGeneration, RobertaTokenizer
import torch
import random



  pid, fd = os.forkpty()




In [None]:
def sliding_window(row, col_name, chunk_size=509, overlap=50):
    words = row[col_name]
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        start = i
        end = min(i + chunk_size, len(words))
        chunk = ' '.join(words[start:end])
        chunks.append(chunk)
    return pd.DataFrame({'docstring': chunks})
def expand_contractions(sentence):
    contractions_expanded = [contractions.fix(word) for word in sentence.split()]
    return ' '.join(contractions_expanded)
def lower_case(sentence):
    return ' '.join([word.lower() for word in sentence.split()])
def remove_punctuation(sentence):
    return ' '.join([re.sub(r'[^\w\s]', '', word) for word in sentence.split()])
def preprocess(lst, process=True, min_words=20):
    lst[:] = filterfalse(lambda x: len(x.split()) <= min_words, lst)
    if process == True:
        for i, sent in enumerate(lst):
            # if len(sent.split()) <= min_words:
            #   continue
            lst[i] = lower_case(remove_punctuation(expand_contractions(sent)))
    return lst
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
train_path = DATAPATH + '/train'
json_files = [f for f in os.listdir(train_path) if f.endswith('.jsonl')]
dataframes = []

for file in json_files:
    file_path = os.path.join(train_path, file)
    df = pd.read_json(file_path, lines=True)
    dataframes.append(df)

df_self_sup = pd.concat(dataframes, ignore_index=True)
df_self_sup = pd.concat([sliding_window(row, 'docstring_tokens') for _, row in df_self_sup.iterrows()], ignore_index=True)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
set_seed(42)
start_time = time.time()
train_sentences = preprocess(list(df_self_sup['docstring']), min_words = 50)
train_df = pd.DataFrame([])
train_df['docstring'] = train_sentences
del train_sentences, df_self_sup
gc.collect()
# Load the pre-trained T5 model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small").to(device)

In [None]:
class MaskedLanguageModelingDataset(Dataset):
    def __init__(self, dataframe, tokenizer, mask_probability=0.15, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.mask_probability = mask_probability
        self.max_length = max_length
        self.special_tokens = self.tokenizer.additional_special_tokens 
    def mask_tokens(self, text):
        tokens = text.split()
        masked_tokens = []
        idx = 0
        i = 0
        while i < len(tokens):
            if random.random() < self.mask_probability:
                # Replace all consecutive masked words with a single special token
                current_special_token = self.special_tokens[idx % len(self.special_tokens)]
                masked_tokens.append(current_special_token)
                while i + 1 < len(tokens) and random.random() < self.mask_probability:
                    i += 1
            else:
                masked_tokens.append(tokens[i])
            i += 1
            idx += 1

        masked_text = " ".join(masked_tokens)
        return masked_text

    def complement_tokens(self, text, masked_indices):
        tokens = text.split()
        complement_tokens = []
        idx = 0
        i = 0
        while i < len(tokens):
            if i in masked_indices:
                # Replace all consecutive masked words with a single special token
                current_special_token = self.special_tokens[idx % len(self.special_tokens)]
                complement_tokens.append(current_special_token)
                while i + 1 < len(tokens) and i + 1 in masked_indices:
                    i += 1
            else:
                complement_tokens.append(tokens[i])
            i += 1
            idx += 1

        complement_text = " ".join(complement_tokens)
        return complement_text
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['docstring']

        # Mask tokens with the given probability
        masked_text = self.mask_tokens(text)

        # Get the indices of masked tokens
        masked_indices = [i for i, token in enumerate(masked_text.split()) if not token.startswith("<extra_id_")]

        # Create complement sentence
        complement_text = self.complement_tokens(text, masked_indices)
  
        # Tokenize the masked text
        input_ids = self.tokenizer(
            masked_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True
        ).input_ids

        # Tokenize the complement text
        labels = self.tokenizer(
            complement_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True
        ).input_ids

        return {"input_ids": input_ids, "labels": labels}

In [None]:
dataset = MaskedLanguageModelingDataset(train_df, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
epochs = 5
learning_rate = 5e-5

# Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_accumulation_steps = 32
# Training loop
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    i = 0
    for batch in loop:
        inputs = batch["input_ids"].squeeze(dim=1).to(device)
        labels = batch["labels"].squeeze(dim=1).to(device)
        outputs = model(input_ids=inputs, labels=labels)
        loss = outputs.loss/num_accumulation_steps
        loss.backward()
        if (i+1) % num_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())
        i+=1
# Save the trained model
model.save_pretrained("codet5_model_base")
tokenizer.save_pretrained("codet5_model_tokenizer_base")

# Eval

In [None]:
tokenizer = T5Tokenizer.from_pretrained('/kaggle/working/unsupervised_t5_model_tokenizer_base')
model = T5ForConditionalGeneration.from_pretrained('/kaggle/working/unsupervised_t5_model_base').to(device)

In [None]:
test_path = DATAPATH + '/test'
test_files = [f for f in os.listdir(test_path) if f.endswith('.jsonl')]
dataframes = []

for file in test_files:
    file_path = os.path.join(test_path, file)
    df = pd.read_json(file_path, lines=True)
    dataframes.append(df)

df_test = pd.concat(dataframes, ignore_index=True)


In [None]:
import json
class JsonlDataset(Dataset):
    def __init__(self, file_path):
        self.data = []
        with open(file_path, 'r') as f:
            for line in f:
                self.data.append(json.loads(line))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [6]:
def postprocessing(text):
    lines = text.splitlines()
    filtered_lines = [line for line in lines if not ('@param' in line or '@return' in line)]
    filtered_text = ' '.join(filtered_lines).replace('\n', ' ').replace('\t', ' ').replace('\n\n', ' ').replace('\t\t', ' ')
    return filtered_text

In [17]:
# test_example = JsonlDataset('/kaggle/input/codesearchnet/ruby/ruby/final/jsonl/test/ruby_test_0.jsonl')
test_example = dataset['test']
print(test_example)

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1261
})


In [24]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

dataset = load_dataset("code_x_glue_ct_code_to_text", "ruby")
test_example = dataset['test']
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with open('/kaggle/working/eval/pretrain_t5/predictions.txt', 'w', encoding="utf8") as pre, open('/kaggle/working/eval/pretrain_t5/reference.txt', 'w', encoding="utf8") as re:
    testloader = DataLoader(test_example, batch_size=1, shuffle=False)
    length = len(dataloader)
    i = 0
    print(test_example)
    for batch in testloader:
        # Prediction
        text = batch['code'][0]
        input_ids = tokenizer(text, return_tensors="pt", max_length=17500, truncation=True, padding=True).input_ids
        input_ids = input_ids.squeeze(dim=1).to(device)
        generated_ids = model.generate(input_ids, max_length=20)
        result = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        result = postprocessing(result)
        pre.write(str(i) + '\t' + result + "\n")

        # Reference
        text = batch['docstring']
        text= postprocessing(text)
        re.write(str(i) + '\t' + text + "\n")

        i += 1
        if ((i % 100) == 0):
          print(f'{i}/{length}')

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
    num_rows: 1261
})


AttributeError: 'CodeT5' object has no attribute 'generate'

# Finetune

In [None]:
!pip install -q transformers datasets

In [1]:
!pip install -q pytorch-lightning wandb

In [2]:
from datasets import load_dataset

dataset = load_dataset("code_x_glue_ct_code_to_text", "ruby")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 24927
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 1400
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 1261
    })
})


In [3]:
example = dataset['train'][0]

print('Code: ' + example['code'])
print('Docstring ' + example['docstring'])

Code: def handle_parsed_websocket_message(json_data)
      data =  json_data.is_a?(Hash) ? json_data.stringify_keys : {}
      if CelluloidPubsub::Reactor::AVAILABLE_ACTIONS.include?(data['client_action'].to_s)
        log_debug "#{self.class} finds actions for  #{json_data}"
        delegate_action(data) if data['client_action'].present?
      else
        handle_unknown_action(data['channel'], json_data)
      end
    end
Docstring method that checks if the data is a Hash

 if the data is a hash then will stringify the keys and will call the method {#delegate_action}
 that will handle the message, otherwise will call the method {#handle_unknown_action}

 @see #delegate_action
 @see #handle_unknown_action

 @param [Hash] json_data

 @return [void]

 @api public


In [4]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

prefix = "Summarize Ruby: "
max_input_length = 1024
max_target_length = 1024

def preprocess_examples(examples):
  # encode the code-docstring pairs
  codes = examples['code']
  docstrings = examples['docstring']

  inputs = [prefix + code for code in codes]
  model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

  # encode the summaries
  labels = tokenizer(docstrings, max_length=max_target_length, padding="max_length", truncation=True).input_ids

  # important: we need to replace the index of the padding tokens by -100
  # such that they are not taken into account by the CrossEntropyLoss
  labels_with_ignore_index = []
  for labels_example in labels:
    labels_example = [label if label != 0 else -100 for label in labels_example]
    labels_with_ignore_index.append(labels_example)

  model_inputs["labels"] = labels_with_ignore_index

  return model_inputs

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [5]:
dataset = dataset.map(preprocess_examples, batched=True)

Map:   0%|          | 0/24927 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1261 [00:00<?, ? examples/s]

In [6]:
from torch.utils.data import DataLoader

dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=8)
valid_dataloader = DataLoader(dataset['validation'], batch_size=4)
test_dataloader = DataLoader(dataset['test'], batch_size=4)

In [7]:
from transformers import T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl

class CodeT5(pl.LightningModule):
    def __init__(self, lr=5e-5, num_train_epochs=2, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("/kaggle/input/pretraincodet5/pytorch/model/1")
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)

        return loss

    def configure_optimizers(self):
        # create optimizer
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        # create learning rate scheduler
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return valid_dataloader

    def test_dataloader(self):
        return test_dataloader

In [8]:
import wandb

wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
model = CodeT5()

In [10]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

wandb.finish()
wandb_logger = WandbLogger(name='codet5-finetune-code-summarization', project='CodeT5')
# for early stopping, see https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping.html?highlight=early%20stopping
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(devices=1,
                  default_root_dir="/kaggle/working/",
                  logger=wandb_logger,
                  callbacks=[early_stop_callback, lr_monitor],
                 max_epochs=3)
trainer.fit(model)

[34m[1mwandb[0m: Currently logged in as: [33mngaytanthe3579[0m ([33mhieund20052003[0m). Use [1m`wandb login --relogin`[0m to force relogin




Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [4]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

In [5]:
dataset = load_dataset("code_x_glue_ct_code_to_text", "ruby")
test_example = dataset['test']

In [6]:
finetune_model = T5ForConditionalGeneration.from_pretrained("codet5_finetune")
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

In [None]:
# prepare for the model
input_ids = tokenizer(test_example['code'], return_tensors='pt', max_length=512, truncation=True, padding=True).input_ids
# generate
outputs = finetune_model.generate(input_ids)
print("Generated docstring:", tokenizer.decode(outputs[0], skip_special_tokens=True))

In [10]:
def postprocessing(text):
    lines = text
    filtered_lines = [line for line in lines if not ('@param' in line or '@return' in line)]
    filtered_text = ' '.join(filtered_lines).replace('\n', ' ').replace('\t', ' ').replace('\n\n', ' ').replace('\t\t', ' ')
    return filtered_text

In [11]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with open('/kaggle/working/eval/pretrain_t5/predictions.txt', 'w', encoding="utf8") as pre, open('/kaggle/working/eval/pretrain_t5/reference.txt', 'w', encoding="utf8") as re:
    testloader = DataLoader(test_example, batch_size=1, shuffle=False)
    length = len(testloader)
    i = 0
    print(test_example)
    for batch in testloader:
        # Prediction
        text = batch['code']
        input_ids = tokenizer(text, return_tensors="pt", max_length=17500, truncation=True, padding=True).input_ids
        input_ids = input_ids.squeeze(dim=1).to(device)
        generated_ids = finetune_model.generate(input_ids, max_length=20)
        result = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        result = postprocessing(result)
        pre.write(str(i) + '\t' + result + "\n")

        # Reference
        text = batch['docstring']
        text= postprocessing(text)
        re.write(str(i) + '\t' + text + "\n")

        i += 1
        if ((i % 100) == 0):
          print(f'{i}/{length}')

Dataset({
    features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
    num_rows: 1261
})
100/1261
200/1261
300/1261
400/1261
500/1261
600/1261
700/1261
800/1261
900/1261
1000/1261
1100/1261
1200/1261
