In [1]:
# Hugging Face - Fine-Tuning CodeT5 for Code Translation (AI4SE Focus)

# This notebook demonstrates how to fine-tune the CodeT5 model using Hugging Face Transformers
# for a Software Engineering task: translating Python code to Java.

# ------------------------
# 1. Install Required Libraries
# ------------------------
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip install transformers datasets evaluate -q

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch==2.5.1
  Downloading https://download.pytorch.org/whl/cu124/torch-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl (908.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.20.1
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.20.1%2Bcu124-cp311-cp311-linux_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m104.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.5.1
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu

In [2]:
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict, Dataset
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

from datasets import load_dataset
import warnings
import pandas as pd

In [3]:
# ------------------------------------------------------------------------
# 2. Load Dataset (CodeXGLUE - Code Translation Java <=> C#)
# ------------------------------------------------------------------------
import re
warnings.simplefilter(action='ignore', category=FutureWarning)

# CodeXGLUE is a benchmark dataset collection by Microsoft for code-related tasks.
# Here, we use the code-translation-python-java dataset.
#dataset = load_dataset("google/code_x_glue_cc_code_to_code_trans")
train = pd.read_csv('ft_train.csv')
val = pd.read_csv('ft_valid.csv')
test = pd.read_csv('ft_test.csv')
train = train[['cleaned_method', 'target_block']]
val = val[['cleaned_method', 'target_block']]
test = test[['cleaned_method', 'target_block']]

def create_mask(df):
  df['masked_method'] = df['cleaned_method']
  for index, row in df.iterrows():
    df['cleaned_method'][index] = " ".join(row['cleaned_method'].split())
    df['masked_method'][index] = re.compile(re.escape(df['target_block'][index]).replace(r'\ ', r'\s*')).sub("<IF-STMT>", df['cleaned_method'][index], count=1)
  return df

train = create_mask(train)
val = create_mask(val)
test = create_mask(test)
#train = train[:2000]
#val = val[:100]
#test = test[:100]

train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
#Flatten cleaned_method
#Copy cleaned_method into masked_method
#Mask masked_method

✅ This following loads a pre-trained models & tokenizer from Hugging Face using the checkpoint name (e.g., "Salesforce/codet5-small").


*  The tokenizer knows how to convert text into tokens that the model

*   It also handles things like padding, truncation, special tokens, etc.

*	It comes with a fixed vocabulary learned during pretraining, that however we can expand if needed as shown

In [4]:
# ------------------------------------------------------------------------
# 3. Load Pre-trained Model & Tokenizer
# ------------------------------------------------------------------------
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

model_checkpoint = "Salesforce/codet5-small"

model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(["<IF-STMT>"]) #Imagine we need an extra token. This line adds the extra token to the vocabulary

model.resize_token_embeddings(len(tokenizer))




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32101, 512)

⚠️⚠️⚠️ If you add new tokens like this, you must also resize the model’s embedding layer: model.resize_token_embeddings(len(tokenizer))

Otherwise, the model won’t know what to do with the new token IDs!


In [5]:
# ------------------------------------------------------------------------------------------------
# 4. We prepare now the fine-tuning dataset using the tokenizer we preloaded
# ------------------------------------------------------------------------------------------------

def preprocess_function(examples):
    inputs = examples["masked_method"]
    targets = examples["target_block"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)
test_dataset = Dataset.from_pandas(test)
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})
tokenized_datasets = dataset.map(preprocess_function, batched=True)
#tokenized_datasets

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
#tokenized_datasets

In [11]:
# ------------------------------------------------------------------------
# 5. Define Training Arguments and Trainer
# ------------------------------------------------------------------------


training_args = TrainingArguments(
    output_dir="./codet5-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2,
    num_train_epochs=7,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [12]:
# ------------------------
# 6. Train the Model
# ------------------------
trainer.train()

# ------------------------
# 7. Evaluate on Test Set
# ------------------------
metrics = trainer.evaluate(tokenized_datasets["test"])

trainer.save_model("t5_Model")

print("Test Evaluation Metrics:", metrics)

# ------------------------
# 8. Test Code Translation
# ------------------------
#input_code = "def add(a, b):\n    return a + b"
#inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)
#outputs = model.generate(**inputs, max_length=256)
#print("Generated Java Code:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

Epoch,Training Loss,Validation Loss
1,0.0398,0.034048
2,0.0333,0.03345
3,0.0294,0.033104
4,0.0268,0.032919
5,0.0256,0.032972
6,0.0238,0.033294


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Test Evaluation Metrics: {'eval_loss': 0.03500358387827873, 'eval_runtime': 49.9342, 'eval_samples_per_second': 100.132, 'eval_steps_per_second': 50.066, 'epoch': 6.0}


In [14]:
trainer.save_model("t5_Model")

In [15]:
trainer = AutoModelForSeq2SeqLM.from_pretrained("t5_Model")

In [16]:
! pip install transformers
!pip install tree_sitter==0.2.0
! git clone -q https://github.com/microsoft/CodeXGLUE.git

Collecting tree_sitter==0.2.0
  Downloading tree_sitter-0.2.0.tar.gz (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.4/110.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tree_sitter
  Building wheel for tree_sitter (setup.py) ... [?25l[?25hdone
  Created wheel for tree_sitter: filename=tree_sitter-0.2.0-cp311-cp311-linux_x86_64.whl size=389535 sha256=1b12f19ce636f3189d4fc20497c5c0ab2fe6683fad8078cf0dc75eadac145253
  Stored in directory: /root/.cache/pip/wheels/d9/6e/e2/b0126ad4f531cf09749b69518118f0ebf7bf3134ed91c71abb
Successfully built tree_sitter
Installing collected packages: tree_sitter
Successfully installed tree_sitter-0.2.0


In [17]:
output_file = pd.DataFrame(columns=['Masked Method', 'Exact Match', 'Expected if Condition', 'Predicted if Condition', 'CodeBLEU Score', 'Bleu4 Score'])#'Masked Method', 'Exact Match', 'Expected if Statement', 'Predicted if Statement', 'CodeBLEU', 'BLEU-4'])

for i in range(len(test)):
  p = open("predicted.txt", "w")
  a = open("actual.txt", "w")
  input_code = test['masked_method'][i]
  inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)
  inputs = inputs.to(model.device)
  outputs = model.generate(**inputs, max_length=256)

  predicted_label = tokenizer.decode(outputs[0], skip_special_tokens=True) # Decode the generated tokens for this input
  true_label = test['target_block'][i] # Get the true label for this input

  p.write(test['masked_method'][i].replace("<IF-STMT>", predicted_label) )
  a.write(test['masked_method'][i].replace("<IF-STMT>", true_label))

  p.close()
  a.close()

  exact_match = str(true_label == predicted_label)

  start_index = input_code.index("<IF-STMT>")
  extra_length = len(predicted_label) - len(input_code)

  codeBleuRaw = !cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs /content/actual.txt --hyp /content/predicted.txt --lang java --params 0.25,0.25,0.25,0.25
  codeBleu = codeBleuRaw[1].split()[-1]
  bleu4 = codeBleuRaw[0].split()[2][:-1]

  output_file.loc[i] = [input_code, exact_match, test['target_block'][i], predicted_label, codeBleu, bleu4]

In [18]:
output_file

Unnamed: 0,Masked Method,Exact Match,Expected if Condition,Predicted if Condition,CodeBLEU Score,Bleu4 Score
0,"def read(self, count=True, timeout=None, ignor...",True,if ignore_timeouts and is_timeout ( e ) :,if ignore_timeouts and is_timeout ( e ) :,1.0,1.0
1,"def _cache_mem(curr_out, prev_mem, mem_len, re...",True,if prev_mem is None :,if prev_mem is None :,1.0,1.0
2,def filtered(gen): for example in gen: example...,True,if example_len > max_length :,if example_len > max_length :,1.0,1.0
3,"def search(self, query): # ""Search.ashx?query=...",False,"if item . get ( ""type"" , """" ) == ""audio"" :","if item [ ""guide_id"" ] not in self . _stations :",0.6714819922706515,0.8269827801858844
4,"def _check_script(self, script, directive): fo...",False,"if var . must_contain ( ""/"" ) :",if var . name in directive :,0.9070425121090596,0.8967386308733728
...,...,...,...,...,...,...
4995,"def _super_function(args): passed_class, passe...",False,"if isinstance ( pyclass , pyobjects . Abstract...",if pyclass is not None :,0.7213626744066333,0.7561289226114326
4996,"def get_data(row): data = [] for field_name, f...",True,if result :,if result :,1.0,1.0
4997,"def say(jarvis, s): """"""Reads what is typed.""""""...",False,if not voice_state :,if voice_state :,0.9257502295744646,0.891237063632542
4998,"def __import__(name, globals=None, locals=None...",True,"if ""*"" in fromlist :","if ""*"" in fromlist :",1.0,1.0


In [19]:
output_file.to_csv("testset-results.csv")