In [None]:
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from datasets import load_dataset
import pandas as pd
import re
from datasets import Dataset
import autopep8
import sacrebleu
import codebleu
import os
from datasets import Dataset
import torch
import evaluate
from codebleu import calc_codebleu
from tqdm import tqdm
import numpy as np

In [None]:
# ------------------------
# 1. Install Required Libraries
# ------------------------
#!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
#!pip install transformers datasets evaluate -q

In [2]:
#https://huggingface.co/Salesforce/codet5-small
# ------------------------------------------------------------------------
# 2. Load Dataset (CodeXGLUE - Code Translation Java <=> C#)
# ------------------------------------------------------------------------
data_dir = r"C:\Users\bentr\Downloads\Archive\Archive"

# CodeXGLUE is a benchmark dataset collection by Microsoft for code-related tasks.
# Here, we use the code-translation-python-java dataset.
csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
# Read the CSV files into DataFrames
test_dataset = load_dataset('csv', data_files=os.path.join(data_dir, csv_files[0]))['train']
train_dataset = load_dataset('csv', data_files=os.path.join(data_dir, csv_files[1]))['train']
validation_dataset = load_dataset('csv', data_files=os.path.join(data_dir, csv_files[2]))['train']

dataset = DatasetDict({
    'test': test_dataset,
    'train': train_dataset,
    'validation': validation_dataset
})
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['cleaned_method', 'target_block', 'tokens_in_method'],
        num_rows: 5000
    })
    train: Dataset({
        features: ['cleaned_method', 'target_block', 'tokens_in_method'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['cleaned_method', 'target_block', 'tokens_in_method'],
        num_rows: 5000
    })
})


In [6]:
model_checkpoint = "Salesforce/codet5-small"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(["<MASK>"]) #Imagine we need an extra token. This line adds the extra token to the vocabulary

model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32101, 512)

In [3]:
def mask_dataset(dataset, datatype):
    if datatype == "test" or "validation": max = 499
    if datatype == "train": max = 2499
    processed_methods = []
    processed_targets = []
    i = 0

    # Loop through the dataset and apply processing
    yes = 0
    no = 0
    while i <= max:
        # Get the current method and target block
            if (i + 1) % 250 == 0: print(f"Processed {i + 1}")
            flattened_method = dataset[datatype]["cleaned_method"][i]
            target = dataset[datatype]["target_block"][i]

        # Flatten the method by joining words with a single space
            flattened_method = " ".join(flattened_method.split())
            flattened_method = re.sub(r'\s*([=+\-*/%<>!&|^(),:{}\[\].])\s*', r'\1', flattened_method)

        # Normalize the target block
            target = re.sub(r'\s*([=+\-*/%<>!&|^(),:{}\[\].])\s*', r'\1', target)

        # Replace target with <MASK> in the flattened method
            if target not in flattened_method:
                no+=1
            if target in flattened_method:
                flattened_method = flattened_method.replace(target, "<MASK>")
                yes+=1
                processed_methods.append(flattened_method)
                processed_targets.append(target)
        # Append processed results
            i += 1
    print(yes)
    print(no)
    # Build Dataset (not DatasetDict)
    processed = Dataset.from_dict({
        'processed_target': processed_targets,
        'processed_method': processed_methods,
    })
    return processed
valid = mask_dataset(dataset, "validation")
test = mask_dataset(dataset, "test")
train = mask_dataset(dataset, "train")
print(valid)
print(train)
print(test)

Processed 250
Processed 500
499
1
Processed 250
Processed 500
500
0
Processed 250
Processed 500
Processed 750
Processed 1000
Processed 1250
Processed 1500
Processed 1750
Processed 2000
Processed 2250
Processed 2500
2499
1
Dataset({
    features: ['processed_target', 'processed_method'],
    num_rows: 499
})
Dataset({
    features: ['processed_target', 'processed_method'],
    num_rows: 2499
})
Dataset({
    features: ['processed_target', 'processed_method'],
    num_rows: 500
})


In [4]:
print("Validation size:", len(valid))
print("Test size:", len(test))
print("Train size:", len(train))

Validation size: 499
Test size: 500
Train size: 2499


In [7]:
def preprocess_function(dataset):
    inputs = dataset["processed_method"]
    targets = dataset["processed_target"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



train = train.map(preprocess_function, batched=True)
valid = valid.map(preprocess_function, batched = True)
test = test.map(preprocess_function, batched = True)
print(valid)
print(train)
print(test)
#print(tokenized_datasets)

Map: 100%|██████████| 2499/2499 [00:02<00:00, 1066.36 examples/s]
Map: 100%|██████████| 499/499 [00:00<00:00, 1005.82 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 980.19 examples/s]

Dataset({
    features: ['processed_target', 'processed_method', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 499
})
Dataset({
    features: ['processed_target', 'processed_method', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2499
})
Dataset({
    features: ['processed_target', 'processed_method', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})





In [9]:
# Convert to Pandas DataFrame
dfv = valid.to_pandas()
dftr = train.to_pandas()
dft = test.to_pandas()
# Save to CSV
dftr.to_csv("maskedtrain3_dataset.csv", index=False)
dft.to_csv("maskedtest3_dataset.csv", index=False)
dfv.to_csv("maskedvalid3_dataset.csv", index=False)

In [None]:
valid = Dataset.from_pandas(pd.read_csv("maskedvalid_dataset.csv"))
train = Dataset.from_pandas(pd.read_csv("maskedtrain_dataset.csv"))
test = Dataset.from_pandas(pd.read_csv("maskedtest_dataset.csv"))
print(valid)
print(train)
print(test)

In [10]:
# ------------------------------------------------------------------------
# 5. Define Training Arguments and Trainer
# ------------------------------------------------------------------------


training_args = TrainingArguments(
    output_dir="./codet5-finetuned2",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


In [11]:
# ------------------------
# 6. Train the Model
# ------------------------
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0582,0.045173
2,0.0432,0.04392
3,0.0322,0.04508
4,0.0245,0.045139


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=5000, training_loss=0.04956385655403137, metrics={'train_runtime': 475.8947, 'train_samples_per_second': 26.256, 'train_steps_per_second': 13.133, 'total_flos': 676438323757056.0, 'train_loss': 0.04956385655403137, 'epoch': 4.0})

In [12]:
save_path = "./codet5-small-finetuned2"

# Save model
trainer.save_model(save_path)

# Save tokenizer
tokenizer.save_pretrained(save_path)

('./codet5-small-finetuned2\\tokenizer_config.json',
 './codet5-small-finetuned2\\special_tokens_map.json',
 './codet5-small-finetuned2\\vocab.json',
 './codet5-small-finetuned2\\merges.txt',
 './codet5-small-finetuned2\\added_tokens.json')

In [13]:

save_path = "./codet5-small-finetuned2"
# Load the saved model
model = T5ForConditionalGeneration.from_pretrained(save_path)

# Load the saved tokenizer
tokenizer = RobertaTokenizer.from_pretrained(save_path)

In [17]:
# ------------------------
# 8. Test Code Translation
# ------------------------
input_code = test["processed_method"][0]
print(test["processed_target"][0])
inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)
outputs = model.generate(**inputs, max_length=256)
print("Generated If Statement:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

if ignore_timeouts and is_timeout(e):
Generated If Statement:
 if ignore_timeouts and is_timeout(e):


In [21]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

all_inputs = test["processed_method"]
batch_size = 8  # start small, increase if your GPU can handle it
decoded_outputs = []

for i in tqdm(range(0, len(all_inputs), batch_size)):
    batch = all_inputs[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)

    # Decode each output
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_outputs.extend(decoded_batch)

# Optional: print a few outputs
for i in range(5):

    print(f"\nInput: {all_inputs[i]}")
    print(f"Prediction: {decoded_outputs[i]}")

100%|██████████| 63/63 [00:19<00:00,  3.28it/s]


Input: def read(self,count=True,timeout=None,ignore_non_errors=True,ignore_timeouts=True):try:return self._read(count,timeout)except usb.USBError as e:if DEBUG_COMM:log.info("read:e.errno=%s e.strerror=%s e.message=%s repr=%s"%(e.errno,e.strerror,e.message,repr(e)))<MASK>return[]if ignore_non_errors and is_noerr(e):return[]raise
Prediction: if ignore_timeouts and is_timeout(e):

Input: def _cache_mem(curr_out,prev_mem,mem_len,reuse_len=None):"""cache hidden states into memory.""" if mem_len is None or mem_len==0:return None else:if reuse_len is not None and reuse_len>0:curr_out=curr_out[:reuse_len]<MASK>new_mem=curr_out[-mem_len:]else:new_mem=tf.concat([prev_mem,curr_out],0)[-mem_len:]new_mem.stop_gradient=True return new_mem
Prediction: if prev_mem is None:

Input: def filtered(gen):for example in gen:example_len=length_fn(example)# Checking max length boundary.if max_length is not None:<MASK>continue # Checking min length boundary.if min_length is not None:if example_len<min_length:




In [None]:
print(len(decoded_outputs))
print(decoded_outputs[1400])

500


IndexError: list index out of range

In [23]:
predictions = decoded_outputs
references = test["processed_target"]

In [28]:
# Evaluate 
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print("SacreBLEU Score: ", results)
language = "python"
# Compute CodeBLEU
#score = calc_codebleu(references, predictions, language)
#print("CodeBLEU Score: ", score)
res = calc_codebleu([[ref] for ref in references], predictions, lang="python")
print(res)
"""
SacreBLEU Score:  {'bleu': 0.8265168183793802, 'precisions': [0.9166666666666666, 0.8181818181818182, 0.8, 0.7777777777777778], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 12, 'reference_length': 12}
['def sum(a, b): return a + b']
['def add(a, b): return a + b']
{'codebleu': 0.8251908791628888, 'ngram_match_score': 0.6434588841607617, 'weighted_ngram_match_score': 0.6573046324907937, 'syntax_match_score': 1.0, 'dataflow_match_score': 1.0}
Exact Match Score: 0.00
"""
print(len(predictions))
print(len(references))
exact_match_score = np.mean([ref == pred for ref, pred in zip(references, predictions)])
print(f"Exact Match Score: {exact_match_score:.2f}")

SacreBLEU Score:  {'bleu': 0.35858156554860066, 'precisions': [0.6691988950276243, 0.43314255983350675, 0.35167464114832536, 0.2945518453427065], 'brevity_penalty': 0.8614215659736089, 'length_ratio': 0.8701923076923077, 'translation_length': 4344, 'reference_length': 4992}
{'codebleu': 0.2178627681932208, 'ngram_match_score': 0.16532385593454665, 'weighted_ngram_match_score': 0.176943220007281, 'syntax_match_score': 0.36447811447811446, 'dataflow_match_score': 0.16470588235294117}
500
500
Exact Match Score: 0.21
