## Environment setup: load model and dataset

In [None]:
%%capture
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install accelerate -U
!pip install codebleu

import datasets
from datasets import Dataset
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModel, AdamW,
    AutoModelForCausalLM, Trainer, TrainingArguments, TrainerCallback, default_data_collator)
import evaluate
import numpy as np
from google.colab import drive, files
from transformers import RobertaTokenizer, T5ForConditionalGeneration
import os
import dataclasses
from dataclasses import dataclass, field
from tqdm import tqdm

from typing import Dict, List, Optional
import torch
from transformers import AutoTokenizer
from datasets import load_metric
from codebleu import calc_codebleu
import pandas as pd
from datasets import Dataset, concatenate_datasets
from transformers import EarlyStoppingCallback

In [None]:
# Mount Google Drive so the project folders under "My Drive" are reachable from Colab.
drive.mount('/content/drive')

# Make the project's Model folder on Drive the current working directory.
path = '/content/drive/My Drive/dsml/1_project/Model'
os.chdir(path)

Mounted at /content/drive


## Train codeT5

In [None]:
# Tokenizer of the 'Salesforce/codet5-base' checkpoint; every experiment cell below reuses it.
tokenizer_t5b = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

@dataclass
class T2TDataCollator():
  """Collate tokenized text-to-code examples into one batch of tensors.

  Every example must provide equally sized tensors under 'input_ids',
  'attention_mask', 'decoder_input_ids' and 'decoder_attention_mask'.
  """

  # Token id that marks padding in the target sequence. The default (0) is the
  # value that used to be hard-coded in __call__, so T2TDataCollator() behaves
  # exactly as before; pass another id when using a different tokenizer.
  pad_token_id: int = 0
  # Value written over padded label positions. -100 is the default
  # `ignore_index` of PyTorch's cross-entropy loss.
  label_ignore_index: int = -100

  def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
    """
    Take a list of samples from a Dataset and collate them into a batch.

    Args:
      batch: list of examples, each a mapping holding the four tensors
        named in the class docstring.

    Returns:
      A dictionary of tensors with keys 'input_ids', 'attention_mask',
      'labels' and 'decoder_attention_mask'. 'labels' is the stacked
      'decoder_input_ids' with padding positions replaced by
      `label_ignore_index`.
    """

    input_ids = torch.stack([example['input_ids'] for example in batch])
    # torch.stack returns a new tensor, so the masking below never touches
    # the tensors stored in the dataset examples.
    lm_labels = torch.stack([example['decoder_input_ids'] for example in batch])
    lm_labels[lm_labels == self.pad_token_id] = self.label_ignore_index
    attention_mask = torch.stack([example['attention_mask'] for example in batch])
    decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': lm_labels,
        'decoder_attention_mask': decoder_attention_mask
    }

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

### char replacement

In [None]:
# Fresh pretrained CodeT5 for the character-replacement augmentation run.
model_t5b_char_3= T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# XLCoST Python program-level splits (columns 'text' and 'code' are used below).
train_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='train')
vali_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='validation')

import json

char_3_1 = []
char_3_2 = []
# Folder on Drive holding the augmented examples; the files are only read here, nothing is saved.
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/third'
os.chdir(path_dataset)

# Both files are parsed as JSON Lines: one JSON object per line.
with open('char_0.3_2_f.json', 'r') as infile:
    for line in infile:
        char_3_1.append(json.loads(line))

with open('char_0.3_2.json', 'r') as infile:
    for line in infile:
        char_3_2.append(json.loads(line))


df = pd.DataFrame(char_3_1+char_3_2)
char_3dataset = Dataset.from_pandas(df)
# Append the augmented rows to the original training split.
# assumes the augmented records carry the same columns as the XLCoST split — TODO confirm
combined_dataset_char_3 = concatenate_datasets([train_dataset, char_3dataset])

# Token budgets read by convert_to_features for the input text and the target code.
max_input_length =  400
max_target_length = 400

# From here on relative paths (e.g. the Trainer's output_dir) resolve inside this folder.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_3'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length encoder/decoder features.

    Args:
        example_batch: Mapping with a 'text' and a 'code' entry, each a list
            of strings (this function is mapped with `batched=True`).

    Returns:
        A dict with 'input_ids' and 'attention_mask' built from 'text', and
        'decoder_input_ids' and 'decoder_attention_mask' built from 'code'.
        Inputs are truncated/padded to the module-level `max_input_length`,
        targets to `max_target_length`.
    """
    # Calling the tokenizer with padding='max_length' is the supported
    # replacement for the deprecated
    # batch_encode_plus(..., pad_to_max_length=True); with max_length set it
    # pads every sequence to that length, as before.
    # NOTE(review): rows mapped through add_eos_examples already end with the
    # separator token, and add_special_tokens=True may append another one —
    # confirm the doubled end marker is intended.
    input_encodings = tokenizer_t5b(example_batch['text'],
                                    max_length=max_input_length,
                                    add_special_tokens=True,
                                    truncation=True,
                                    padding='max_length')

    target_encodings = tokenizer_t5b(example_batch['code'],
                                     max_length=max_target_length,
                                     add_special_tokens=True,
                                     truncation=True,
                                     padding='max_length')

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' strings.

  The two fields are overwritten on the mapping that was passed in, and
  that same mapping is returned.
  """
  eos = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + eos

  return example

# Append the separator token to every text/code string, then tokenize the combined training set.
t_train_dataset_4  = combined_dataset_char_3.map(add_eos_examples)
t_train_dataset_4  = t_train_dataset_4.map(convert_to_features,  batched=True)

# Same two steps for the validation split.
t_test_dataset  = vali_dataset.map(add_eos_examples)
t_test_dataset  = t_test_dataset.map(convert_to_features,  batched=True)

# The raw string columns are no longer needed once the token ids exist.
tokenized_dataset_t_char_3 = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])

train_dataset_char_3 = tokenized_dataset_t_char_3
valid_dataset = tokenized_dataset_t_2

columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']

# Return the four feature columns as torch tensors; T2TDataCollator stacks them with torch.stack.
train_dataset_char_3.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.61k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/570k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/27789 [00:00<?, ? examples/s]

Map:   0%|          | 0/27789 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

In [None]:
# Fine-tune CodeT5 on XLCoST plus the character-replacement augmentation.
# output_dir/logging_dir are relative, so they resolve against the working
# directory set by the preceding preprocessing cell (os.chdir(t5_path)).
training_args = TrainingArguments(
    output_dir="codet5_char_3_0.3",
    logging_dir="codet5_char_3_0.3",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=5e-5,
    num_train_epochs=15,
    logging_steps=500,
    evaluation_strategy="steps",
    logging_strategy="steps",
    optim="adamw_torch",
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    fp16=True,
    # eval_steps and save_steps are kept equal so every evaluation has a
    # checkpoint that load_best_model_at_end can restore.
    eval_steps=1000,
    save_steps=1000,
    seed=42,
    # save_total_limit=3,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, # This means a lower eval_loss is better.
)

# Define early stopping callback: patience is counted in evaluations
# (one every eval_steps), with no minimum improvement threshold.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=4,
    early_stopping_threshold=0.0
)

trainer = Trainer(
    model=model_t5b_char_3,
    args=training_args,
    train_dataset=train_dataset_char_3,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    callbacks=[early_stopping_callback]
)

trainer.train()

Step,Training Loss,Validation Loss
1000,0.7321,0.60938
2000,0.5996,0.554284


### GPT3

In [None]:
# Fresh pretrained CodeT5 for the GPT-3 augmentation run.
model_t5b_gpt3= T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# XLCoST Python program-level splits (columns 'text' and 'code' are used below).
train_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='train')
vali_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='validation')

import json

gpt3_1 = []
gpt3_2 = []
gpt3_3 = []
# Folder on Drive holding the augmented examples; the files are only read here, nothing is saved.
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/third'
os.chdir(path_dataset)

# All three files are parsed as JSON Lines: one JSON object per line.
with open('gpt3_zero_1.json', 'r') as infile:
    for line in infile:
        gpt3_1.append(json.loads(line))

with open('gpt3_zero_2.json', 'r') as infile:
    for line in infile:
        gpt3_2.append(json.loads(line))

with open('gpt3_zero_3.json', 'r') as infile:
    for line in infile:
        gpt3_3.append(json.loads(line))

df = pd.DataFrame(gpt3_1+gpt3_2+gpt3_3)
gpt3dataset = Dataset.from_pandas(df)
# Append the augmented rows to the original training split.
# assumes the augmented records carry the same columns as the XLCoST split — TODO confirm
combined_dataset_gpt3 = concatenate_datasets([train_dataset, gpt3dataset])

# Token budgets read by convert_to_features for the input text and the target code.
max_input_length =  400
max_target_length = 400

# From here on relative paths (e.g. the Trainer's output_dir) resolve inside this folder.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_3'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length encoder/decoder features.

    Args:
        example_batch: Mapping with a 'text' and a 'code' entry, each a list
            of strings (this function is mapped with `batched=True`).

    Returns:
        A dict with 'input_ids' and 'attention_mask' built from 'text', and
        'decoder_input_ids' and 'decoder_attention_mask' built from 'code'.
        Inputs are truncated/padded to the module-level `max_input_length`,
        targets to `max_target_length`.
    """
    # Calling the tokenizer with padding='max_length' is the supported
    # replacement for the deprecated
    # batch_encode_plus(..., pad_to_max_length=True); with max_length set it
    # pads every sequence to that length, as before.
    # NOTE(review): rows mapped through add_eos_examples already end with the
    # separator token, and add_special_tokens=True may append another one —
    # confirm the doubled end marker is intended.
    input_encodings = tokenizer_t5b(example_batch['text'],
                                    max_length=max_input_length,
                                    add_special_tokens=True,
                                    truncation=True,
                                    padding='max_length')

    target_encodings = tokenizer_t5b(example_batch['code'],
                                     max_length=max_target_length,
                                     add_special_tokens=True,
                                     truncation=True,
                                     padding='max_length')

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' strings.

  The two fields are overwritten on the mapping that was passed in, and
  that same mapping is returned.
  """
  eos = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + eos

  return example

# Append the separator token to every text/code string, then tokenize the combined training set.
t_train_dataset_4  = combined_dataset_gpt3.map(add_eos_examples)
t_train_dataset_4  = t_train_dataset_4.map(convert_to_features,  batched=True)

# Same two steps for the validation split.
t_test_dataset  = vali_dataset.map(add_eos_examples)
t_test_dataset  = t_test_dataset.map(convert_to_features,  batched=True)

# The raw string columns are no longer needed once the token ids exist.
tokenized_dataset_t_gpt3 = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])

train_dataset_gpt3 = tokenized_dataset_t_gpt3
valid_dataset = tokenized_dataset_t_2

columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']

# Return the four feature columns as torch tensors; T2TDataCollator stacks them with torch.stack.
train_dataset_gpt3.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Map:   0%|          | 0/18520 [00:00<?, ? examples/s]

Map:   0%|          | 0/18520 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

In [None]:
# Fine-tune CodeT5 on XLCoST plus the GPT-3 generated augmentation.
# output_dir/logging_dir are relative, so they resolve against the working
# directory set by the preceding preprocessing cell (os.chdir(t5_path)).
training_args = TrainingArguments(
    output_dir="codet5_gpt3",
    logging_dir="codet5_gpt3",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=5e-5,
    num_train_epochs=15,
    logging_steps=500,
    evaluation_strategy="steps",
    logging_strategy="steps",
    optim="adamw_torch",
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    fp16=True,
    # eval_steps and save_steps are kept equal so every evaluation has a
    # checkpoint that load_best_model_at_end can restore.
    eval_steps=500,
    save_steps=500,
    seed=42,
    # save_total_limit=3,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, # This means a lower eval_loss is better.
)

# Define early stopping callback: patience is counted in evaluations
# (one every eval_steps), with no minimum improvement threshold.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=4,
    early_stopping_threshold=0.0
)

trainer = Trainer(
    model=model_t5b_gpt3,
    args=training_args,
    train_dataset=train_dataset_gpt3,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    callbacks=[early_stopping_callback]
)

trainer.train()

Step,Training Loss,Validation Loss
500,1.0764,0.678639
1000,0.7435,0.614292
1500,0.659,0.578875
2000,0.6135,0.557478
2500,0.5777,0.54502


Step,Training Loss,Validation Loss
500,1.0764,0.678639
1000,0.7435,0.614292
1500,0.659,0.578875
2000,0.6135,0.557478
2500,0.5777,0.54502
3000,0.5438,0.532948
3500,0.5207,0.529131
4000,0.4869,0.525273
4500,0.4776,0.517351
5000,0.4467,0.518535


TrainOutput(global_step=7500, training_loss=0.5452404907226562, metrics={'train_runtime': 4955.889, 'train_samples_per_second': 56.055, 'train_steps_per_second': 3.505, 'total_flos': 5.7066966319104e+16, 'train_loss': 0.5452404907226562, 'epoch': 6.48})

### textattack

In [None]:
# NOTE(review): this cell sits under the "textattack" heading but is a verbatim
# copy of the GPT3 cell: it rebinds model_t5b_gpt3 and loads the same
# gpt3_zero_*.json files. Presumably a placeholder — confirm the intended
# augmentation files and variable names before training from it.
model_t5b_gpt3= T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# XLCoST Python program-level splits (columns 'text' and 'code' are used below).
train_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='train')
vali_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='validation')

import json

gpt3_1 = []
gpt3_2 = []
gpt3_3 = []
# Folder on Drive holding the augmented examples; the files are only read here, nothing is saved.
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/third'
os.chdir(path_dataset)

# All three files are parsed as JSON Lines: one JSON object per line.
with open('gpt3_zero_1.json', 'r') as infile:
    for line in infile:
        gpt3_1.append(json.loads(line))

with open('gpt3_zero_2.json', 'r') as infile:
    for line in infile:
        gpt3_2.append(json.loads(line))

with open('gpt3_zero_3.json', 'r') as infile:
    for line in infile:
        gpt3_3.append(json.loads(line))

df = pd.DataFrame(gpt3_1+gpt3_2+gpt3_3)
gpt3dataset = Dataset.from_pandas(df)
# Append the augmented rows to the original training split.
combined_dataset_gpt3 = concatenate_datasets([train_dataset, gpt3dataset])

# Token budgets read by convert_to_features for the input text and the target code.
max_input_length =  400
max_target_length = 400

# From here on relative paths resolve inside this folder.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_3'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length encoder/decoder features.

    Args:
        example_batch: Mapping with a 'text' and a 'code' entry, each a list
            of strings (this function is mapped with `batched=True`).

    Returns:
        A dict with 'input_ids' and 'attention_mask' built from 'text', and
        'decoder_input_ids' and 'decoder_attention_mask' built from 'code'.
        Inputs are truncated/padded to the module-level `max_input_length`,
        targets to `max_target_length`.
    """
    # Calling the tokenizer with padding='max_length' is the supported
    # replacement for the deprecated
    # batch_encode_plus(..., pad_to_max_length=True); with max_length set it
    # pads every sequence to that length, as before.
    # NOTE(review): rows mapped through add_eos_examples already end with the
    # separator token, and add_special_tokens=True may append another one —
    # confirm the doubled end marker is intended.
    input_encodings = tokenizer_t5b(example_batch['text'],
                                    max_length=max_input_length,
                                    add_special_tokens=True,
                                    truncation=True,
                                    padding='max_length')

    target_encodings = tokenizer_t5b(example_batch['code'],
                                     max_length=max_target_length,
                                     add_special_tokens=True,
                                     truncation=True,
                                     padding='max_length')

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' strings.

  The two fields are overwritten on the mapping that was passed in, and
  that same mapping is returned.
  """
  eos = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + eos

  return example

# Append the separator token to every text/code string, then tokenize the combined training set.
t_train_dataset_4  = combined_dataset_gpt3.map(add_eos_examples)
t_train_dataset_4  = t_train_dataset_4.map(convert_to_features,  batched=True)

# Same two steps for the validation split.
t_test_dataset  = vali_dataset.map(add_eos_examples)
t_test_dataset  = t_test_dataset.map(convert_to_features,  batched=True)

# The raw string columns are no longer needed once the token ids exist.
tokenized_dataset_t_gpt3 = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])

train_dataset_gpt3 = tokenized_dataset_t_gpt3
valid_dataset = tokenized_dataset_t_2

columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']

# Return the four feature columns as torch tensors; T2TDataCollator stacks them with torch.stack.
train_dataset_gpt3.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

### Baseline

In [None]:
# Pretrained CodeT5 used by the baseline training cells below (both of them train this same object).
model_t5b = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

In [None]:
# XLCoST Python program-level splits (columns 'text' and 'code' are used below).
train_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='train')
vali_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='validation')

# Token budgets read by convert_to_features for the input text and the target code.
max_input_length =  400
max_target_length = 400

# Keep only the first half of the training split (rows 0 .. len//2 - 1, original order).
half_length_train = len(train_dataset) // 2
first_half_train_dataset = train_dataset.select(list(range(half_length_train)))

# From here on relative paths (e.g. the Trainer's output_dir) resolve inside this folder.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_3'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length encoder/decoder features.

    Args:
        example_batch: Mapping with a 'text' and a 'code' entry, each a list
            of strings (this function is mapped with `batched=True`).

    Returns:
        A dict with 'input_ids' and 'attention_mask' built from 'text', and
        'decoder_input_ids' and 'decoder_attention_mask' built from 'code'.
        Inputs are truncated/padded to the module-level `max_input_length`,
        targets to `max_target_length`.
    """
    # Calling the tokenizer with padding='max_length' is the supported
    # replacement for the deprecated
    # batch_encode_plus(..., pad_to_max_length=True); with max_length set it
    # pads every sequence to that length, as before.
    # NOTE(review): rows mapped through add_eos_examples already end with the
    # separator token, and add_special_tokens=True may append another one —
    # confirm the doubled end marker is intended.
    input_encodings = tokenizer_t5b(example_batch['text'],
                                    max_length=max_input_length,
                                    add_special_tokens=True,
                                    truncation=True,
                                    padding='max_length')

    target_encodings = tokenizer_t5b(example_batch['code'],
                                     max_length=max_target_length,
                                     add_special_tokens=True,
                                     truncation=True,
                                     padding='max_length')

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' strings.

  The two fields are overwritten on the mapping that was passed in, and
  that same mapping is returned.
  """
  eos = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + eos

  return example
# training half dataset: only first_half_train_dataset is preprocessed; the
# trailing comment on the next line keeps the full-split alternative.
t_train_dataset  = first_half_train_dataset.map(add_eos_examples) # train_dataset.map(add_eos_examples)
t_train_dataset  = t_train_dataset.map(convert_to_features,  batched=True)

# Same two steps (separator token, then tokenization) for the validation split.
t_test_dataset  = vali_dataset.map(add_eos_examples)
t_test_dataset  = t_test_dataset.map(convert_to_features,  batched=True)

# The raw string columns are no longer needed once the token ids exist.
tokenized_dataset_t = t_train_dataset.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])

# From this point `train_dataset` names the tokenized HALF split, no longer
# the raw split loaded at the top of the cell.
train_dataset = tokenized_dataset_t
valid_dataset = tokenized_dataset_t_2

columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']

# Return the four feature columns as torch tensors; T2TDataCollator stacks them with torch.stack.
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Map:   0%|          | 0/4631 [00:00<?, ? examples/s]

Map:   0%|          | 0/4631 [00:00<?, ? examples/s]



In [None]:
# training_args = TrainingArguments(
#     output_dir="codet5_baseline",
#     logging_dir="codet5_baseline-logging",
#     do_train=True,
#     do_eval=True,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     # gradient_accumulation_steps=4,
#     weight_decay=0.01,
#     learning_rate=5e-5,
#     num_train_epochs=10,  # Change to match bash script
#     logging_steps=500,
#     evaluation_strategy="steps",
#     logging_strategy="steps",
#     optim="adamw_torch",
#     gradient_checkpointing=True,
#     eval_accumulation_steps=10,
#     fp16=True,
#     eval_steps=500,
#     seed=42
# )

# # logger = logging.getLogger(__name__)

# trainer = Trainer(
#     model=model_t5b,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     data_collator=T2TDataCollator(),
#     tokenizer = tokenizer_t5b,
# )

# trainer.train()

# trainer.save_model()
# tokenizer_t5b.save_pretrained("codet5_baseline")

# Baseline fine-tuning run (no augmentation).
# NOTE(review): `train_dataset` is whatever the preprocessing cell produced —
# in this notebook that is the tokenized first half of the train split, not
# the full split. Confirm that is intended for a run named "codet5_baseline".
training_args = TrainingArguments(
    output_dir="codet5_baseline",
    logging_dir="codet5_baseline-logging",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=5e-5,
    num_train_epochs=15,
    logging_steps=500,
    evaluation_strategy="steps",
    logging_strategy="steps",
    optim="adamw_torch",
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    fp16=True,
    # eval_steps and save_steps are kept equal so every evaluation has a
    # checkpoint that load_best_model_at_end can restore.
    eval_steps=1000,
    save_steps=1000,
    seed=42,
    # save_total_limit=3,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, # This means a lower eval_loss is better.
)

# Define early stopping callback: patience is counted in evaluations
# (one every eval_steps), with no minimum improvement threshold.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

trainer = Trainer(
    model=model_t5b,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    callbacks=[early_stopping_callback]
)

trainer.train()

KeyboardInterrupt: ignored

#### Half dataset

In [None]:
# Baseline run on the half training split prepared by the preprocessing cell.
# NOTE(review): this trains the same `model_t5b` object as the previous
# training cell. If that cell ran (even partially) in the same kernel, this
# run starts from the already-updated weights rather than from the pretrained
# checkpoint — reload the model first if a clean baseline is wanted.
training_args = TrainingArguments(
    output_dir="codet5_baseline_half",
    logging_dir="codet5_baseline_half",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=5e-5,
    num_train_epochs=15,
    logging_steps=500,
    evaluation_strategy="steps",
    logging_strategy="steps",
    optim="adamw_torch",
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    fp16=True,
    # eval_steps and save_steps are kept equal so every evaluation has a
    # checkpoint that load_best_model_at_end can restore.
    eval_steps=500,
    save_steps=500,
    seed=42,
    # save_total_limit=3,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, # This means a lower eval_loss is better.
)

# Define early stopping callback: patience is counted in evaluations
# (one every eval_steps), with no minimum improvement threshold.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

trainer = Trainer(
    model=model_t5b,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    callbacks=[early_stopping_callback]
)

trainer.train()

Step,Training Loss,Validation Loss
500,1.0481,0.673195
1000,0.681,0.622268
1500,0.5958,0.59884
2000,0.5338,0.588855
2500,0.4891,0.586864
3000,0.4598,0.585923
3500,0.4347,0.583206
4000,0.418,0.584544


TrainOutput(global_step=4350, training_loss=0.5686957839439655, metrics={'train_runtime': 3008.5209, 'train_samples_per_second': 23.089, 'train_steps_per_second': 1.446, 'total_flos': 3.304785927168e+16, 'train_loss': 0.5686957839439655, 'epoch': 15.0})

### bert_synonym replacement

In [None]:
# Fresh pretrained CodeT5 for the BERT synonym-replacement augmentation run.
model_t5b_bertsr= T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# XLCoST Python program-level splits (columns 'text' and 'code' are used by the next cell).
train_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='train')
vali_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='validation')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.61k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/570k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
import json

bertsr_train_data_1 = []
bertsr_train_data_2 = []
# Folder on Drive holding the augmented examples; the files are only read here, nothing is saved.
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/second'
os.chdir(path_dataset)

# Both files are parsed as JSON Lines: one JSON object per line.
with open('1_bert_sr_n1.json', 'r') as infile:
    for line in infile:
        bertsr_train_data_1.append(json.loads(line))

with open('1_bert_sr_n2.json', 'r') as infile:
    for line in infile:
        bertsr_train_data_2.append(json.loads(line))

df = pd.DataFrame(bertsr_train_data_1+bertsr_train_data_2)
bertsr_dataset = Dataset.from_pandas(df)
# Append the augmented rows to the original training split.
# assumes the augmented records carry the same columns as the XLCoST split — TODO confirm
combined_dataset_bertsr = concatenate_datasets([train_dataset, bertsr_dataset])

# Token budgets read by convert_to_features for the input text and the target code.
max_input_length =  400
max_target_length = 400

# From here on relative paths (e.g. the Trainer's output_dir) resolve inside this folder.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_2'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length encoder/decoder features.

    Args:
        example_batch: Mapping with a 'text' and a 'code' entry, each a list
            of strings (this function is mapped with `batched=True`).

    Returns:
        A dict with 'input_ids' and 'attention_mask' built from 'text', and
        'decoder_input_ids' and 'decoder_attention_mask' built from 'code'.
        Inputs are truncated/padded to the module-level `max_input_length`,
        targets to `max_target_length`.
    """
    # Calling the tokenizer with padding='max_length' is the supported
    # replacement for the deprecated
    # batch_encode_plus(..., pad_to_max_length=True); with max_length set it
    # pads every sequence to that length, as before.
    # NOTE(review): rows mapped through add_eos_examples already end with the
    # separator token, and add_special_tokens=True may append another one —
    # confirm the doubled end marker is intended.
    input_encodings = tokenizer_t5b(example_batch['text'],
                                    max_length=max_input_length,
                                    add_special_tokens=True,
                                    truncation=True,
                                    padding='max_length')

    target_encodings = tokenizer_t5b(example_batch['code'],
                                     max_length=max_target_length,
                                     add_special_tokens=True,
                                     truncation=True,
                                     padding='max_length')

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' strings.

  The two fields are overwritten on the mapping that was passed in, and
  that same mapping is returned.
  """
  eos = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + eos

  return example

# Append the separator token to every text/code string, then tokenize the combined training set.
t_train_dataset_4  = combined_dataset_bertsr.map(add_eos_examples)
t_train_dataset_4  = t_train_dataset_4.map(convert_to_features,  batched=True)

# Same two steps for the validation split.
t_test_dataset  = vali_dataset.map(add_eos_examples)
t_test_dataset  = t_test_dataset.map(convert_to_features,  batched=True)

# The raw string columns are no longer needed once the token ids exist.
tokenized_dataset_t_bertsr = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])

train_dataset_bertsr = tokenized_dataset_t_bertsr
valid_dataset = tokenized_dataset_t_2

columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']

# Return the four feature columns as torch tensors; T2TDataCollator stacks them with torch.stack.
# (A stray "xi" after the last call made this cell a SyntaxError; it is removed here.)
train_dataset_bertsr.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Map:   0%|          | 0/37052 [00:00<?, ? examples/s]

Map:   0%|          | 0/37052 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

In [None]:
# Fine-tune CodeT5 on XLCoST plus the BERT synonym-replacement augmentation.
# output_dir/logging_dir are relative, so they resolve against the working
# directory set by the preceding preprocessing cell (os.chdir(t5_path)).
training_args = TrainingArguments(
    output_dir="codet5_bertsr_n3",
    logging_dir="codet5_bertsr-logging_n3",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=5e-5,
    num_train_epochs=10,
    logging_steps=500,
    evaluation_strategy="steps",
    logging_strategy="steps",
    optim="adamw_torch",
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    fp16=True,
    # eval_steps and save_steps are kept equal so every evaluation has a
    # checkpoint that load_best_model_at_end can restore.
    eval_steps=1000,
    save_steps=1000,
    seed=42,
    # save_total_limit=4,
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, # This means a lower eval_loss is better.
)

# Define early stopping callback: patience is counted in evaluations
# (one every eval_steps), with no minimum improvement threshold.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)

trainer = Trainer(
    model=model_t5b_bertsr,
    args=training_args,
    train_dataset=train_dataset_bertsr,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    callbacks=[early_stopping_callback]
)

trainer.train()

Step,Training Loss,Validation Loss
1000,0.7562,0.620769
2000,0.6413,0.564704
3000,0.5563,0.541508
4000,0.5209,0.532378
5000,0.4718,0.529094
6000,0.4474,0.520201
7000,0.4189,0.523919
8000,0.3863,0.525539


KeyboardInterrupt: ignored

### word2vec_synonym replacement

In [None]:
# Fresh pretrained CodeT5 for the word2vec synonym-replacement augmentation run.
model_t5b_w2vsr= T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# XLCoST Python program-level splits (columns 'text' and 'code' are used below).
train_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='train')
vali_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='validation')

import json

w2vsrtrain_data_1 = []
w2vsrtrain_data_2 = []
# Folder on Drive holding the augmented examples; the files are only read here, nothing is saved.
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/second'
os.chdir(path_dataset)

# Both files are parsed as JSON Lines: one JSON object per line.
with open('2_word2vec_sr_0.3_1.json', 'r') as infile:
    for line in infile:
        w2vsrtrain_data_1.append(json.loads(line))

with open('2_word2vec_sr_0.3_2.json', 'r') as infile:
    for line in infile:
        w2vsrtrain_data_2.append(json.loads(line))

df = pd.DataFrame(w2vsrtrain_data_1+w2vsrtrain_data_2)
w2vsrdataset = Dataset.from_pandas(df)
# Append the augmented rows to the original training split.
# assumes the augmented records carry the same columns as the XLCoST split — TODO confirm
combined_dataset_w2vsr = concatenate_datasets([train_dataset, w2vsrdataset])

# Token budgets read by convert_to_features for the input text and the target code.
max_input_length =  400
max_target_length = 400

# From here on relative paths resolve inside this folder.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_2'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length encoder/decoder features.

    Args:
        example_batch: Mapping with a 'text' and a 'code' entry, each a list
            of strings (this function is mapped with `batched=True`).

    Returns:
        A dict with 'input_ids' and 'attention_mask' built from 'text', and
        'decoder_input_ids' and 'decoder_attention_mask' built from 'code'.
        Inputs are truncated/padded to the module-level `max_input_length`,
        targets to `max_target_length`.
    """
    # Calling the tokenizer with padding='max_length' is the supported
    # replacement for the deprecated
    # batch_encode_plus(..., pad_to_max_length=True); with max_length set it
    # pads every sequence to that length, as before.
    # NOTE(review): rows mapped through add_eos_examples already end with the
    # separator token, and add_special_tokens=True may append another one —
    # confirm the doubled end marker is intended.
    input_encodings = tokenizer_t5b(example_batch['text'],
                                    max_length=max_input_length,
                                    add_special_tokens=True,
                                    truncation=True,
                                    padding='max_length')

    target_encodings = tokenizer_t5b(example_batch['code'],
                                     max_length=max_target_length,
                                     add_special_tokens=True,
                                     truncation=True,
                                     padding='max_length')

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' fields.

  The example is modified in place and also returned, as `Dataset.map` expects.
  """
  # NOTE(review): convert_to_features tokenizes with add_special_tokens=True,
  # so the tokenizer may add its own end token as well — confirm that a
  # doubled separator is intended.
  separator = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + separator
  return example

# Append the separator token per example, then tokenize in batches.
t_train_dataset_4 = combined_dataset_w2vsr.map(add_eos_examples).map(convert_to_features, batched=True)
t_test_dataset = vali_dataset.map(add_eos_examples).map(convert_to_features, batched=True)

# Drop the raw string columns so only the tokenized features remain.
tokenized_dataset_t_w2vsr = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])
train_dataset_w2vsr, valid_dataset = tokenized_dataset_t_w2vsr, tokenized_dataset_t_2

# Expose the feature columns as torch tensors (the collator stacks them).
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
for tokenized_split in (train_dataset_w2vsr, valid_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.61k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Downloading and preparing dataset xlcost-text-to-code/Python-program-level to /root/.cache/huggingface/datasets/codeparrot___xlcost-text-to-code/Python-program-level/2.1.0/ffae7d034dfaa9e215012bcf52b8690f3ae22d9c52f45fe2ffd3dcf4093d9f2c...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/570k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset xlcost-text-to-code downloaded and prepared to /root/.cache/huggingface/datasets/codeparrot___xlcost-text-to-code/Python-program-level/2.1.0/ffae7d034dfaa9e215012bcf52b8690f3ae22d9c52f45fe2ffd3dcf4093d9f2c. Subsequent calls will reuse this data.




Map:   0%|          | 0/37052 [00:00<?, ? examples/s]

Map:   0%|          | 0/37052 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing share the same
# 1000-step interval, and the checkpoint with the lowest eval_loss is
# reloaded when training ends.
training_args = TrainingArguments(
    # Output locations (relative paths).
    output_dir="codet5_w2vsr_n3",
    logging_dir="codet5_w2vsr-logging_n3",
    # Run modes and reproducibility.
    do_train=True,
    do_eval=True,
    seed=42,
    # Optimizer and schedule length.
    optim="adamw_torch",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Precision, gradient checkpointing and eval accumulation.
    fp16=True,
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    # Step-based logging, evaluation and saving.
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    # save_total_limit=4,
    # Best-checkpoint selection.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss is better
)

# Early stopping with a patience of 3 evaluations and no minimum improvement.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

trainer = Trainer(
    model=model_t5b_w2vsr,
    args=training_args,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    train_dataset=train_dataset_w2vsr,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback],
)

trainer.train()

Step,Training Loss,Validation Loss
1000,0.7321,0.614037
2000,0.6114,0.556994
3000,0.5235,0.537819
4000,0.485,0.529336
5000,0.4342,0.527679
6000,0.4078,0.520751
7000,0.3773,0.528945
8000,0.3441,0.53269
9000,0.3294,0.535445


TrainOutput(global_step=9000, training_loss=0.5004361317952474, metrics={'train_runtime': 5853.8464, 'train_samples_per_second': 63.295, 'train_steps_per_second': 3.956, 'total_flos': 6.8502053707776e+16, 'train_loss': 0.5004361317952474, 'epoch': 3.89})

### t5_back translation

In [None]:
# Fresh CodeT5-base weights for the back-translation experiment.
model_t5b_t5bt = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# Original XLCoST Python program-level splits used for training and validation.
train_dataset, vali_dataset = (
    load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split=split_name)
    for split_name in ('train', 'validation')
)

import json

# Directory that holds the augmented JSON-lines files (read, not written, here).
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/second'
os.chdir(path_dataset)

# Every line of each file is one JSON record; the language labels follow the
# original annotations on these lists.
with open('3_hels_btg.json', 'r') as infile:
    t5bttrain_data_g = [json.loads(line) for line in infile]  # german

with open('3_hels_btf.json', 'r') as infile:
    t5bttrain_data_f = [json.loads(line) for line in infile]  # french

with open('3_hels_bts.json', 'r') as infile:
    t5bttrain_data_s = [json.loads(line) for line in infile]  # spanish

# Only the French and German records are appended to the training split;
# the Spanish records are loaded above but not used here.
df = pd.DataFrame(t5bttrain_data_f + t5bttrain_data_g)
t5btdataset = Dataset.from_pandas(df)
combined_dataset_t5bt = concatenate_datasets([train_dataset, t5btdataset])

# Fixed token lengths for the encoder input and the target sequence.
max_input_length = max_target_length = 400

# Work from the model directory so the relative output paths used later
# resolve underneath it.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_2'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length features.

    Args:
        example_batch: Batched mapping with a 'text' entry (encoder inputs)
            and a 'code' entry (targets); lists of strings when called
            through ``Dataset.map(..., batched=True)``.

    Returns:
        Dict with 'input_ids' / 'attention_mask' for the text and
        'decoder_input_ids' / 'decoder_attention_mask' for the code.
    """
    # `padding='max_length'` is the supported spelling of the deprecated
    # `pad_to_max_length=True`: with `max_length` set, every sequence is
    # padded (or truncated) to exactly that length.
    input_encodings = tokenizer_t5b.batch_encode_plus(
        example_batch['text'],
        max_length=max_input_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    target_encodings = tokenizer_t5b.batch_encode_plus(
        example_batch['code'],
        max_length=max_target_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' fields.

  The example is modified in place and also returned, as `Dataset.map` expects.
  """
  # NOTE(review): convert_to_features tokenizes with add_special_tokens=True,
  # so the tokenizer may add its own end token as well — confirm that a
  # doubled separator is intended.
  separator = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + separator
  return example

# Append the separator token per example, then tokenize in batches.
t_train_dataset_4 = combined_dataset_t5bt.map(add_eos_examples).map(convert_to_features, batched=True)
t_test_dataset = vali_dataset.map(add_eos_examples).map(convert_to_features, batched=True)

# Drop the raw string columns so only the tokenized features remain.
tokenized_dataset_t_t5bt = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])
train_dataset_t5bt, valid_dataset = tokenized_dataset_t_t5bt, tokenized_dataset_t_2

# Expose the feature columns as torch tensors (the collator stacks them).
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
for tokenized_split in (train_dataset_t5bt, valid_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

Map:   0%|          | 0/27789 [00:00<?, ? examples/s]

Map:   0%|          | 0/27789 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

#### Half dataset

In [None]:
# Fresh CodeT5-base weights for the half-size back-translation experiment.
model_t5b_t5bt = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# Original XLCoST Python program-level splits used for training and validation.
train_dataset, vali_dataset = (
    load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split=split_name)
    for split_name in ('train', 'validation')
)

import json

# Directory that holds the augmented JSON-lines files (read, not written, here).
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/second'
os.chdir(path_dataset)

# Every line of the file is one JSON record.
with open('3_hels_btg.json', 'r') as infile:
    t5bttrain_data_g = [json.loads(line) for line in infile]  # german

df = pd.DataFrame(t5bttrain_data_g)
t5btdataset = Dataset.from_pandas(df)

# Keep only the first half of the augmented records (integer division, so an
# odd count rounds down) and append them to the original training split.
half_length_t5bt = len(t5btdataset) // 2
first_half_t5btdataset = t5btdataset.select(list(range(half_length_t5bt)))
combined_dataset_t5bt = concatenate_datasets([train_dataset, first_half_t5btdataset])

# Fixed token lengths for the encoder input and the target sequence.
max_input_length = max_target_length = 400

# Work from the model directory so the relative output paths used later
# resolve underneath it.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_3'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length features.

    Args:
        example_batch: Batched mapping with a 'text' entry (encoder inputs)
            and a 'code' entry (targets); lists of strings when called
            through ``Dataset.map(..., batched=True)``.

    Returns:
        Dict with 'input_ids' / 'attention_mask' for the text and
        'decoder_input_ids' / 'decoder_attention_mask' for the code.
    """
    # `padding='max_length'` is the supported spelling of the deprecated
    # `pad_to_max_length=True`: with `max_length` set, every sequence is
    # padded (or truncated) to exactly that length.
    input_encodings = tokenizer_t5b.batch_encode_plus(
        example_batch['text'],
        max_length=max_input_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    target_encodings = tokenizer_t5b.batch_encode_plus(
        example_batch['code'],
        max_length=max_target_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' fields.

  The example is modified in place and also returned, as `Dataset.map` expects.
  """
  # NOTE(review): convert_to_features tokenizes with add_special_tokens=True,
  # so the tokenizer may add its own end token as well — confirm that a
  # doubled separator is intended.
  separator = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + separator
  return example

# Append the separator token per example, then tokenize in batches.
t_train_dataset_4 = combined_dataset_t5bt.map(add_eos_examples).map(convert_to_features, batched=True)
t_test_dataset = vali_dataset.map(add_eos_examples).map(convert_to_features, batched=True)

# Drop the raw string columns so only the tokenized features remain.
tokenized_dataset_t_t5bt = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])
train_dataset_t5bt, valid_dataset = tokenized_dataset_t_t5bt, tokenized_dataset_t_2

# Expose the feature columns as torch tensors (the collator stacks them).
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
for tokenized_split in (train_dataset_t5bt, valid_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.61k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing share the same
# 1000-step interval, and the checkpoint with the lowest eval_loss is
# reloaded when training ends.
training_args = TrainingArguments(
    # Output locations (relative paths; checkpoints and logs share a directory).
    output_dir="codet5_backt_gf",
    logging_dir="codet5_backt_gf",
    # Run modes and reproducibility.
    do_train=True,
    do_eval=True,
    seed=42,
    # Optimizer and schedule length.
    optim="adamw_torch",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Precision, gradient checkpointing and eval accumulation.
    fp16=True,
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    # Step-based logging, evaluation and saving.
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    # save_total_limit=3,
    # Best-checkpoint selection.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss is better
)

# Early stopping with a patience of 3 evaluations and no minimum improvement.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

trainer = Trainer(
    model=model_t5b_t5bt,
    args=training_args,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    train_dataset=train_dataset_t5bt,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback],
)

trainer.train()

Step,Training Loss,Validation Loss
1000,0.7369,0.611998
2000,0.6033,0.555915
3000,0.5367,0.532772
4000,0.4729,0.523716
5000,0.4458,0.515935
6000,0.3989,0.521901
7000,0.3791,0.53027
8000,0.3421,0.533037


TrainOutput(global_step=8000, training_loss=0.521984519958496, metrics={'train_runtime': 5362.0007, 'train_samples_per_second': 77.739, 'train_steps_per_second': 4.859, 'total_flos': 6.0890080075776e+16, 'train_loss': 0.521984519958496, 'epoch': 4.61})

### Randomaug

In [None]:
# Fresh CodeT5-base weights for the random-augmentation experiment.
model_t5b_Randomaug = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

# Original XLCoST Python program-level splits used for training and validation.
train_dataset, vali_dataset = (
    load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split=split_name)
    for split_name in ('train', 'validation')
)

In [None]:
import json

# Directory that holds the augmented JSON-lines files (read, not written, here).
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/second'
os.chdir(path_dataset)

# Every line of the file is one JSON record; all records are kept.
with open('4_ranaug_0.5_3.json', 'r') as infile:
    ranaug_train_data = [json.loads(line) for line in infile]

# Turn the augmented records into a Dataset and append them to the original
# training split, then report the combined size.
df = pd.DataFrame(ranaug_train_data)
ranaug_dataset = Dataset.from_pandas(df)
combined_dataset_ranaug = concatenate_datasets([train_dataset, ranaug_dataset])
print(len(combined_dataset_ranaug))

37052


In [None]:
# Fixed token lengths for the encoder input and the target sequence.
max_input_length = max_target_length = 400

# Work from the model directory so the relative output paths used later
# resolve underneath it.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_2'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length features.

    Args:
        example_batch: Batched mapping with a 'text' entry (encoder inputs)
            and a 'code' entry (targets); lists of strings when called
            through ``Dataset.map(..., batched=True)``.

    Returns:
        Dict with 'input_ids' / 'attention_mask' for the text and
        'decoder_input_ids' / 'decoder_attention_mask' for the code.
    """
    # `padding='max_length'` is the supported spelling of the deprecated
    # `pad_to_max_length=True`: with `max_length` set, every sequence is
    # padded (or truncated) to exactly that length.
    input_encodings = tokenizer_t5b.batch_encode_plus(
        example_batch['text'],
        max_length=max_input_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    target_encodings = tokenizer_t5b.batch_encode_plus(
        example_batch['code'],
        max_length=max_target_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' fields.

  The example is modified in place and also returned, as `Dataset.map` expects.
  """
  # NOTE(review): convert_to_features tokenizes with add_special_tokens=True,
  # so the tokenizer may add its own end token as well — confirm that a
  # doubled separator is intended.
  separator = tokenizer_t5b.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + separator
  return example

# Append the separator token per example, then tokenize in batches.
t_train_dataset_4 = combined_dataset_ranaug.map(add_eos_examples).map(convert_to_features, batched=True)
t_test_dataset = vali_dataset.map(add_eos_examples).map(convert_to_features, batched=True)

Map:   0%|          | 0/37052 [00:00<?, ? examples/s]

Map:   0%|          | 0/37052 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

In [None]:
# Drop the raw string columns so only the tokenized features remain.
tokenized_dataset_t_randug = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])
train_dataset_randug, valid_dataset = tokenized_dataset_t_randug, tokenized_dataset_t_2

# Expose the feature columns as torch tensors (the collator stacks them).
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
for tokenized_split in (train_dataset_randug, valid_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing share the same
# 1000-step interval, and the checkpoint with the lowest eval_loss is
# reloaded when training ends.
training_args = TrainingArguments(
    # Output locations (relative paths).
    output_dir="codet5_ranaug_n3_0.5",
    logging_dir="codet5_ranaug-logging_n3_0.5",
    # Run modes and reproducibility.
    do_train=True,
    do_eval=True,
    seed=42,
    # Optimizer and schedule length.
    optim="adamw_torch",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Precision, gradient checkpointing and eval accumulation.
    fp16=True,
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    # Step-based logging, evaluation and saving.
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,
    # Best-checkpoint selection.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss is better
)

# Early stopping with a patience of 3 evaluations and no minimum improvement.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

trainer = Trainer(
    model=model_t5b_Randomaug,
    args=training_args,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer_t5b,
    train_dataset=train_dataset_randug,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback],
)

trainer.train()

Step,Training Loss,Validation Loss
1000,0.7271,0.61325
2000,0.6076,0.556563
3000,0.5246,0.533651
4000,0.488,0.525643
5000,0.4388,0.528408
6000,0.4108,0.520968
7000,0.3856,0.525325
8000,0.3554,0.533485


KeyboardInterrupt: ignored

# Train codet5 large

### Baseline

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Load the tokenizer and the seq2seq model from the same
# "Salesforce/codet5-large" checkpoint id; the cells below use these two
# names for the large-model experiments.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-large")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-large")

@dataclass
class T2TDataCollator():
  """Collate tokenized text-to-code examples into a single training batch.

  Each example must provide 'input_ids', 'attention_mask',
  'decoder_input_ids' and 'decoder_attention_mask' as equally shaped tensors.
  """

  # Token id that marks padding inside the target sequence (default matches
  # the previously hard-coded 0).
  pad_token_id: int = 0
  # Value written over padded label positions (default matches the previously
  # hard-coded -100, the ignore index of PyTorch's cross-entropy loss).
  label_ignore_index: int = -100

  def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
    """
    Take a list of samples from a Dataset and collate them into a batch.

    Args:
      batch: List of per-example dicts of tensors.

    Returns:
      A dictionary of tensors with keys 'input_ids', 'attention_mask',
      'labels' and 'decoder_attention_mask'. 'labels' is the stacked
      'decoder_input_ids' with padding positions replaced by
      `label_ignore_index`.
    """

    def stack(key: str) -> torch.Tensor:
      # Stack one field across the batch along a new leading dimension.
      return torch.stack([example[key] for example in batch])

    lm_labels = stack('decoder_input_ids')
    # Non-in-place masking: the per-example tensors are never modified.
    lm_labels = lm_labels.masked_fill(lm_labels == self.pad_token_id, self.label_ignore_index)

    return {
        'input_ids': stack('input_ids'),
        'attention_mask': stack('attention_mask'),
        'labels': lm_labels,
        'decoder_attention_mask': stack('decoder_attention_mask')
    }

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

In [None]:
# Original XLCoST Python program-level splits used for training and validation.
train_dataset, vali_dataset = (
    load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split=split_name)
    for split_name in ('train', 'validation')
)

# Fixed token lengths for the encoder input and the target sequence.
max_input_length = max_target_length = 400

# (A disabled variant selected only the first half of the training split:
#  half_length_train = len(train_dataset) // 2
#  first_half_train_dataset = train_dataset.select(list(range(half_length_train))))

# Work from the model directory so the relative output paths used later
# resolve underneath it.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_3'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length features.

    Args:
        example_batch: Batched mapping with a 'text' entry (encoder inputs)
            and a 'code' entry (targets); lists of strings when called
            through ``Dataset.map(..., batched=True)``.

    Returns:
        Dict with 'input_ids' / 'attention_mask' for the text and
        'decoder_input_ids' / 'decoder_attention_mask' for the code.
    """
    # `padding='max_length'` is the supported spelling of the deprecated
    # `pad_to_max_length=True`: with `max_length` set, every sequence is
    # padded (or truncated) to exactly that length.
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['text'],
        max_length=max_input_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    target_encodings = tokenizer.batch_encode_plus(
        example_batch['code'],
        max_length=max_target_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' fields.

  The example is modified in place and also returned, as `Dataset.map` expects.
  """
  # NOTE(review): convert_to_features tokenizes with add_special_tokens=True,
  # so the tokenizer may add its own end token as well — confirm that a
  # doubled separator is intended.
  separator = tokenizer.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + separator
  return example
# The full training split is processed here (no half-split selection is active).
# Append the separator token per example, then tokenize in batches.
t_train_dataset = train_dataset.map(add_eos_examples).map(convert_to_features, batched=True)
t_test_dataset = vali_dataset.map(add_eos_examples).map(convert_to_features, batched=True)

# Drop the raw string columns so only the tokenized features remain.
tokenized_dataset_t = t_train_dataset.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])

# `train_dataset` is rebound from the raw split to its tokenized version.
train_dataset, valid_dataset = tokenized_dataset_t, tokenized_dataset_t_2

# Expose the feature columns as torch tensors (the collator stacks them).
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
for tokenized_split in (train_dataset, valid_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

Map:   0%|          | 0/9263 [00:00<?, ? examples/s]

Map:   0%|          | 0/9263 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing share the same
# 1000-step interval, and the checkpoint with the lowest eval_loss is
# reloaded when training ends.
# NOTE(review): the saved output of this cell ends in OutOfMemoryError with
# these batch sizes; consider a smaller per-device batch combined with
# gradient accumulation.
training_args = TrainingArguments(
    # Output locations (relative paths; checkpoints and logs share a directory).
    output_dir="codet5_baseline_large",
    logging_dir="codet5_baseline_large",
    # Run modes and reproducibility.
    do_train=True,
    do_eval=True,
    seed=42,
    # Optimizer and schedule length.
    optim="adamw_torch",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Precision, gradient checkpointing and eval accumulation.
    fp16=True,
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    # Step-based logging, evaluation and saving.
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    # save_total_limit=3,
    # Best-checkpoint selection.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss is better
)

# Early stopping with a patience of 3 evaluations and no minimum improvement.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback],
)

trainer.train()

Step,Training Loss,Validation Loss


OutOfMemoryError: ignored

### Backtranslation

In [None]:
# Load the CodeT5-large seq2seq model. This name is passed to `Trainer` as
# `model`, so it must be a model instance — it was previously created with
# `AutoTokenizer.from_pretrained`, which returns a tokenizer instead.
model_t5l_t5bt = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-large")

# Original XLCoST Python program-level splits used for training and validation.
train_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='train')
vali_dataset = load_dataset('codeparrot/xlcost-text-to-code', "Python-program-level", split='validation')

import json

# Directory that holds the augmented JSON-lines files (read, not written, here).
path_dataset = '/content/drive/My Drive/dsml/1_project/Dataset/second'
os.chdir(path_dataset)

# Every line of the file is one JSON record.
with open('3_hels_btg.json', 'r') as infile:
    t5bttrain_data_g = [json.loads(line) for line in infile]  # german

# Turn the augmented records into a Dataset and append them to the original
# training split.
df = pd.DataFrame(t5bttrain_data_g)
t5btdataset = Dataset.from_pandas(df)
combined_dataset_t5bt = concatenate_datasets([train_dataset, t5btdataset])

# Fixed token lengths for the encoder input and the target sequence.
max_input_length = 400
max_target_length = 400

# Work from the model directory so the relative output paths used later
# resolve underneath it.
t5_path = '/content/drive/My Drive/dsml/1_project/Model/codet5_3'
os.chdir(t5_path)
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of text/code pairs into fixed-length features.

    Args:
        example_batch: Batched mapping with a 'text' entry (encoder inputs)
            and a 'code' entry (targets); lists of strings when called
            through ``Dataset.map(..., batched=True)``.

    Returns:
        Dict with 'input_ids' / 'attention_mask' for the text and
        'decoder_input_ids' / 'decoder_attention_mask' for the code.
    """
    # `padding='max_length'` is the supported spelling of the deprecated
    # `pad_to_max_length=True`: with `max_length` set, every sequence is
    # padded (or truncated) to exactly that length.
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['text'],
        max_length=max_input_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    target_encodings = tokenizer.batch_encode_plus(
        example_batch['code'],
        max_length=max_target_length,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
    )

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

def add_eos_examples(example):
  """Append the tokenizer's separator token to the 'code' and 'text' fields.

  The example is modified in place and also returned, as `Dataset.map` expects.
  """
  # NOTE(review): convert_to_features tokenizes with add_special_tokens=True,
  # so the tokenizer may add its own end token as well — confirm that a
  # doubled separator is intended.
  separator = tokenizer.sep_token
  for field_name in ('code', 'text'):
    example[field_name] = example[field_name] + separator
  return example

# Append the separator token per example, then tokenize in batches.
t_train_dataset_4 = combined_dataset_t5bt.map(add_eos_examples).map(convert_to_features, batched=True)
t_test_dataset = vali_dataset.map(add_eos_examples).map(convert_to_features, batched=True)

# Drop the raw string columns so only the tokenized features remain.
tokenized_dataset_t_t5bt = t_train_dataset_4.remove_columns(["text", "code"])
tokenized_dataset_t_2 = t_test_dataset.remove_columns(["text", "code"])
train_dataset_t5bt, valid_dataset = tokenized_dataset_t_t5bt, tokenized_dataset_t_2

# Expose the feature columns as torch tensors (the collator stacks them).
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
for tokenized_split in (train_dataset_t5bt, valid_dataset):
    tokenized_split.set_format(type='torch', columns=columns)

Map:   0%|          | 0/18526 [00:00<?, ? examples/s]

Map:   0%|          | 0/18526 [00:00<?, ? examples/s]



In [None]:
# Fine-tuning configuration. Evaluation and checkpointing share the same
# 1000-step interval, and the checkpoint with the lowest eval_loss is
# reloaded when training ends.
training_args = TrainingArguments(
    # Output locations (relative paths; checkpoints and logs share a directory).
    output_dir="codet5_btg_large",
    logging_dir="codet5_btg_large",
    # Run modes and reproducibility.
    do_train=True,
    do_eval=True,
    seed=42,
    # Optimizer and schedule length.
    optim="adamw_torch",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Precision, gradient checkpointing and eval accumulation.
    fp16=True,
    gradient_checkpointing=True,
    eval_accumulation_steps=10,
    # Step-based logging, evaluation and saving.
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    # save_total_limit=3,
    # Best-checkpoint selection.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss is better
)

# Early stopping with a patience of 3 evaluations and no minimum improvement.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0)

trainer = Trainer(
    model=model_t5l_t5bt,
    args=training_args,
    data_collator=T2TDataCollator(),
    tokenizer=tokenizer,
    train_dataset=train_dataset_t5bt,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback],
)

trainer.train()

Step,Training Loss,Validation Loss
