## Load Dataset

In [2]:
import numpy as np
import pandas as pd

import json

In [3]:
file_path = 'Data/All_Data.json'

try:
    with open(file_path, 'r') as f:
        file_contents = f.read()
        # print(file_contents)  # Print the contents of the file

    data = json.loads(file_contents)
    # Process the JSON data here

except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
except FileNotFoundError:
    print(f"File not found: '{file_path}'")
except Exception as e:
    print("Error:", e)

In [4]:
dataset_df = pd.DataFrame.from_dict(data)

In [5]:
dataset_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,subject chat XMPP title updated updated xmpp u...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,IConnectContext Message IConnection SOContaine...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,ExceptionInInitializerError eclipse eclipse ge...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,deserialize handleAsynchEvent processAsynch Bi...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,Shared createObject ECF launching Group Win Cr...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...


### Split the dataset_df into train test

In [6]:
# split into train and test
from sklearn.model_selection import train_test_split

# split into train and test
train_df, test_df = train_test_split(dataset_df, test_size=0.15, random_state=42)

In [7]:
valid_df, test_df = train_test_split(test_df, test_size=0.35, random_state=42)

In [8]:
type(train_df)

pandas.core.frame.DataFrame

In [9]:
train_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2288,58110,[java/org/apache/jasper/compiler/ErrorDispatch...,tomcat70,UTF JDT Wrapper JspCompilationContext size Def...,– JSP compiler points error to wrong line num...,Created attachment 32888 [details]\nApache Tom...
2208,55656,[java/org/apache/catalina/startup/Catalina.jav...,tomcat70,patch realms Loader server thrown classes load...,– Server ClassLoader not used for Server crea...,Created attachment 30931 [details]\nproposed p...
1036,340338,[org.eclipse.jdt.ui/ui/org/eclipse/jdt/interna...,eclipse.jdt.ui,invoking overwrites select select Ctrl charAtB...,– [content assist] Proposal does not replace ...,3.1.\nInserting a proposal does not replace th...
457,21792,[org.eclipse.jdt.launching/launching/org/eclip...,eclipse.jdt.debug,argument Duser arguments passed argument dir e...,– vm arguments ending with a backslash cause ...,"When creating a launch configuration, if one s..."
1530,221019,[ui/org.eclipse.pde.core/src/org/eclipse/pde/i...,eclipse.pde.ui,bundle container entries Require Bug Bug class...,– Duplicated entries in classpath container,When a bundle is added as a Require-Bundle to ...


In [10]:
train_df.shape

(1972, 6)

## ML Works

### Evaluation Module

In [11]:
import evaluate
from evaluate import load, list_evaluation_modules
#list of metrics
print(list_evaluation_modules())

['precision', 'code_eval', 'roc_auc', 'cuad', 'xnli', 'rouge', 'pearsonr', 'mse', 'super_glue', 'comet', 'cer', 'sacrebleu', 'mahalanobis', 'wer', 'competition_math', 'f1', 'recall', 'coval', 'mauve', 'xtreme_s', 'bleurt', 'ter', 'accuracy', 'exact_match', 'indic_glue', 'spearmanr', 'mae', 'squad', 'chrf', 'glue', 'perplexity', 'mean_iou', 'squad_v2', 'meteor', 'bleu', 'wiki_split', 'sari', 'frugalscore', 'google_bleu', 'bertscore', 'matthews_correlation', 'seqeval', 'trec_eval', 'rl_reliability', 'angelina-wang/directional_bias_amplification', 'cpllab/syntaxgym', 'kaggle/ai4code', 'codeparrot/apps_metric', 'mfumanelli/geometric_mean', 'poseval', 'brier_score', 'abidlabs/mean_iou', 'abidlabs/mean_iou2', 'giulio98/codebleu', 'mase', 'mape', 'smape', 'dvitel/codebleu', 'NCSOFT/harim_plus', 'JP-SystemsX/nDCG', 'Drunper/metrica_tesi', 'jpxkqx/peak_signal_to_noise_ratio', 'jpxkqx/signal_to_reconstruction_error', 'hpi-dhc/FairEval', 'nist_mt', 'lvwerra/accuracy_score', 'character', 'charcut_

In [12]:
metric = load("bleu")

In [13]:
print(metric)

EvaluationModule(name: "bleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of or just a list of references for each translation.
    tokenizer : approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoot

In [14]:
test_pred = ["this is a test", "this is another test"]
test_true = ["this is a test", "this is another test"]
metric.compute(predictions=test_pred, references=test_true)

{'bleu': 1.0,
 'precisions': [1.0, 1.0, 1.0, 1.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0,
 'translation_length': 8,
 'reference_length': 8}

### transofrmers

In [15]:
import transformers

print(transformers.__version__)

4.31.0


In [16]:
# model_checkpoint = "ml6team/keyphrase-generation-t5-small-inspec"
model_checkpoint = "Salesforce/codet5-small"

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [18]:
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b", "Salesforce/codet5-small"]:
    prefix = "summarize: "
else:
    prefix = ""

In [19]:
train_df.columns

Index(['bug_id', 'ground_truth', 'repo', 'reformed_query', 'bug_title',
       'bug_description'],
      dtype='object')

In [20]:
max_input_length = 1024
max_target_length = 20


def preprocess_function(bug_description, reformed_query):
    # Tokenize the 'bug_description' (document)
    document_inputs = tokenizer(bug_description, max_length=max_input_length, truncation=True)

    # Tokenize the 'reformed_query' (keywords)
    keywords_inputs = tokenizer(reformed_query, max_length=max_target_length, truncation=True)

    # Combine both inputs in the 'model_inputs' dictionary
    model_inputs = {
        "input_ids": document_inputs["input_ids"],
        "attention_mask": document_inputs["attention_mask"],
        "labels": keywords_inputs["input_ids"],
    }

    return model_inputs

keyphrase_sep_token = ';'
def preprocess_function(samples):
    processed_samples = {"input_ids": [], "attention_mask": [], "labels": []}
    for i, sample in enumerate(samples['bug_description']):
        input_text = " ".join(sample)
        inputs = tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
        )

        keyphrases = sample['reformed_query']

        target_text = f" {keyphrase_sep_token} ".join(keyphrases)

        with tokenizer.as_target_tokenizer():
            targets = tokenizer(
                target_text,
                max_length=20,
                padding="max_length",
                truncation=True,
                return_tensors="pt",  # Ensure PyTorch tensors are returned
            )
            labels = targets.input_ids.clone()  # Clone the input_ids tensor to create the labels
            labels[labels == tokenizer.pad_token_id] = -100  # Set padding token to -100

        for key in inputs.keys():
            processed_samples[key].append(inputs[key])
        processed_samples["labels"].append(labels)

    return processed_samples

In [21]:
temp_df = train_df.head(3)

In [22]:
temp_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2288,58110,[java/org/apache/jasper/compiler/ErrorDispatch...,tomcat70,UTF JDT Wrapper JspCompilationContext size Def...,– JSP compiler points error to wrong line num...,Created attachment 32888 [details]\nApache Tom...
2208,55656,[java/org/apache/catalina/startup/Catalina.jav...,tomcat70,patch realms Loader server thrown classes load...,– Server ClassLoader not used for Server crea...,Created attachment 30931 [details]\nproposed p...
1036,340338,[org.eclipse.jdt.ui/ui/org/eclipse/jdt/interna...,eclipse.jdt.ui,invoking overwrites select select Ctrl charAtB...,– [content assist] Proposal does not replace ...,3.1.\nInserting a proposal does not replace th...


In [23]:
sr = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in temp_df.iterrows()]


In [24]:
type(sr)

list

In [25]:
print(sr)

[{'input_ids': [1, 6119, 6042, 3847, 5482, 28, 306, 6395, 65, 203, 1294, 807, 399, 362, 2574, 67, 27, 18, 20, 18, 9498, 300, 1068, 2605, 18, 2620, 203, 45, 364, 13212, 358, 1430, 326, 1142, 327, 1408, 3021, 16, 471, 804, 3118, 4074, 1634, 555, 358, 980, 1300, 468, 28, 1492, 353, 1967, 487, 326, 1142, 327, 3021, 18, 203, 1085, 30, 203, 5802, 2443, 553, 203, 32, 9, 36, 1363, 1363, 4705, 2218, 5159, 17, 28, 11, 5064, 2218, 955, 19, 2620, 11, 9, 34, 203, 32, 9, 36, 1363, 1930, 273, 296, 6290, 18, 1367, 25664, 738, 34, 203, 32, 9, 5, 203, 780, 1842, 1832, 203, 95, 203, 682, 32, 921, 34, 666, 921, 4818, 273, 446, 31, 203, 430, 261, 1098, 921, 4818, 631, 2011, 747, 666, 921, 4818, 18, 1467, 1435, 631, 20, 13, 203, 2463, 1408, 31, 203, 780, 272, 2040, 5554, 812, 273, 446, 31, 203, 1884, 261, 921, 23992, 294, 666, 921, 4818, 13, 203, 95, 203, 759, 87, 2040, 5554, 812, 273, 315, 3535, 17, 5149, 17, 1612, 797, 10951, 397, 261, 780, 13, 1612, 797, 397, 3552, 6446, 14432, 203, 97, 203, 6494, 353, 2

In [26]:
train_data = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in train_df.iterrows()]
test_data = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in test_df.iterrows()]
valid_data = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in valid_df.iterrows()]

In [27]:
from datasets import Dataset, concatenate_datasets
tokenized_datasets = {
    'train': train_data,
    'test': test_data,
    'validation': valid_data
}

### Fine tuning

In [28]:
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [51]:

# Adjust the batch size based on available GPU memory
batch_size = 16

# Set up the Seq2SeqTrainingArguments
model_name = model_checkpoint.split("/")[-1]
training_args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-query-codet5",  # Change the output directory name
    learning_rate=2e-5,
    evaluation_strategy = "epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    # push_to_hub=True,
)


In [30]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Trainer

In [52]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### Train

In [50]:
trainer = Seq2SeqTrainer(
    model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# train_results = trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [None]:
from tqdm.notebook import tqdm

# Initialize tqdm progress bar
progress_bar = tqdm(total=training_args.num_train_epochs, desc="Epoch")

# Train the model with tqdm progress bar
for epoch in range(training_args.num_train_epochs):
    trainer.train()
    progress_bar.update(1)

# Close the tqdm progress bar
progress_bar.close()

In [None]:
# Access training metrics after training
train_results = trainer.train()

In [None]:
print("Training Losses:", train_results.training_loss)
print("Evaluation Metrics:", train_results.metrics)

#### Save the model

In [53]:
output_directory = "./FineTunedModels/QueryCodeT5"

In [None]:
# Save the trained model and configuration
trainer.save_model(output_directory)

### Load the model

In [54]:
# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained(output_directory)

### Evaluate with test data set

In [55]:
# Evaluate the model on the test dataset
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

In [None]:
# see output with test data
print("Evaluation Metrics:", test_results.metrics)


In [66]:
# get input and output from test data
print(test_df.iloc[1]["bug_description"])
input_ids = tokenizer(test_df.iloc[1]["bug_description"], return_tensors="pt").input_ids
output = model.generate(input_ids, max_length=20, num_beams=10, early_stopping=True, top_k=50, num_return_sequences=5)
print(tokenizer.decode(output[0], skip_special_tokens=True))
print(len(output))

# iterate through output and print
for i in range(len(output)):
    print(tokenizer.decode(output[i], skip_special_tokens=True))

Created attachment 24618 [details]
Patch
These can be suppressed with the patch to follow.
Bug bug bug bug bug bug bug bug
5
Bug bug bug bug bug bug bug bug
cast bug bug bug bug bug bug bug
Bug bug bug bug bug bug bug
cast bug bug bug bug bug bug
Bug bug bug bug bug bug


In [57]:
# print the actual output
print(test_df.iloc[0]["reformed_query"])

print type analyzeCode Local expressions missing compiler System lambda
