## Load Dataset

In [1]:
import numpy as np
import pandas as pd

import json

In [4]:
file_path = '../Data/All_Data.json'

try:
    with open(file_path, 'r') as f:
        file_contents = f.read()
        # print(file_contents)  # Print the contents of the file

    data = json.loads(file_contents)
    # Process the JSON data here

except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
except FileNotFoundError:
    print(f"File not found: '{file_path}'")
except Exception as e:
    print("Error:", e)

In [5]:
dataset_df = pd.DataFrame.from_dict(data)

In [6]:
dataset_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,subject chat XMPP title updated updated xmpp u...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,IConnectContext Message IConnection SOContaine...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,ExceptionInInitializerError eclipse eclipse ge...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,deserialize handleAsynchEvent processAsynch Bi...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,Shared createObject ECF launching Group Win Cr...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...


### Split the dataset_df into train test

In [7]:
# split into train and test
from sklearn.model_selection import train_test_split

# split into train and test
train_df, test_df = train_test_split(dataset_df, test_size=0.15, random_state=42)

In [8]:
valid_df, test_df = train_test_split(test_df, test_size=0.35, random_state=42)

In [9]:
type(train_df)

pandas.core.frame.DataFrame

In [10]:
train_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2288,58110,[java/org/apache/jasper/compiler/ErrorDispatch...,tomcat70,UTF JDT Wrapper JspCompilationContext size Def...,– JSP compiler points error to wrong line num...,Created attachment 32888 [details]\nApache Tom...
2208,55656,[java/org/apache/catalina/startup/Catalina.jav...,tomcat70,patch realms Loader server thrown classes load...,– Server ClassLoader not used for Server crea...,Created attachment 30931 [details]\nproposed p...
1036,340338,[org.eclipse.jdt.ui/ui/org/eclipse/jdt/interna...,eclipse.jdt.ui,invoking overwrites select select Ctrl charAtB...,– [content assist] Proposal does not replace ...,3.1.\nInserting a proposal does not replace th...
457,21792,[org.eclipse.jdt.launching/launching/org/eclip...,eclipse.jdt.debug,argument Duser arguments passed argument dir e...,– vm arguments ending with a backslash cause ...,"When creating a launch configuration, if one s..."
1530,221019,[ui/org.eclipse.pde.core/src/org/eclipse/pde/i...,eclipse.pde.ui,bundle container entries Require Bug Bug class...,– Duplicated entries in classpath container,When a bundle is added as a Require-Bundle to ...


In [11]:
train_df.shape

(1972, 6)

## ML Works

### Evaluation Module

In [12]:
import evaluate
from evaluate import load, list_evaluation_modules
#list of metrics
print(list_evaluation_modules())

['precision', 'code_eval', 'roc_auc', 'cuad', 'xnli', 'rouge', 'pearsonr', 'mse', 'super_glue', 'comet', 'cer', 'sacrebleu', 'mahalanobis', 'wer', 'competition_math', 'f1', 'recall', 'coval', 'mauve', 'xtreme_s', 'bleurt', 'ter', 'accuracy', 'exact_match', 'indic_glue', 'spearmanr', 'mae', 'squad', 'chrf', 'glue', 'perplexity', 'mean_iou', 'squad_v2', 'meteor', 'bleu', 'wiki_split', 'sari', 'frugalscore', 'google_bleu', 'bertscore', 'matthews_correlation', 'seqeval', 'trec_eval', 'rl_reliability', 'angelina-wang/directional_bias_amplification', 'cpllab/syntaxgym', 'kaggle/ai4code', 'codeparrot/apps_metric', 'mfumanelli/geometric_mean', 'poseval', 'brier_score', 'abidlabs/mean_iou', 'abidlabs/mean_iou2', 'giulio98/codebleu', 'mase', 'mape', 'smape', 'dvitel/codebleu', 'NCSOFT/harim_plus', 'JP-SystemsX/nDCG', 'Drunper/metrica_tesi', 'jpxkqx/peak_signal_to_noise_ratio', 'jpxkqx/signal_to_reconstruction_error', 'hpi-dhc/FairEval', 'nist_mt', 'lvwerra/accuracy_score', 'character', 'charcut_

In [13]:
metric = load("bleu")

In [14]:
print(metric)

EvaluationModule(name: "bleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of or just a list of references for each translation.
    tokenizer : approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoot

In [15]:
test_pred = ["this is a test", "this is another test"]
test_true = ["this is a test", "this is another test"]
metric.compute(predictions=test_pred, references=test_true)

{'bleu': 1.0,
 'precisions': [1.0, 1.0, 1.0, 1.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0,
 'translation_length': 8,
 'reference_length': 8}

### transofrmers

In [16]:
from transformers import (
    Text2TextGenerationPipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

import transformers



print(transformers.__version__)

4.31.0


In [17]:
# model_checkpoint = "ml6team/keyphrase-generation-t5-small-inspec"
model_checkpoint = "ml6team/keyphrase-generation-t5-small-inspec"

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [19]:
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b", "Salesforce/codet5-small"]:
    prefix = "summarize: "
else:
    prefix = ""

In [20]:
train_df.columns

Index(['bug_id', 'ground_truth', 'repo', 'reformed_query', 'bug_title',
       'bug_description'],
      dtype='object')

In [69]:
max_input_length = 1024
max_target_length = 20

keyphrase_sep_token = ';'

def preprocess_function(bug_description, reformed_query):
    # Assuming you have the tokenizer initialized and named as 'tokenizer'

    document_inputs = tokenizer(
        bug_description,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )

    keyphrases = reformed_query.split()

    target_text = f" {keyphrase_sep_token} ".join(keyphrases)
    targets = tokenizer(
        target_text,
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
        # return_tensors="pt",  # Ensure PyTorch tensors are returned
    )
    labels = targets.input_ids  # Clone the input_ids tensor to create the labels
    labels[labels == tokenizer.pad_token_id] = -100  # Set padding token to -100
    labels = [label if label != tokenizer.pad_token_id else -100 for label in labels]

    model_inputs = {
        "input_ids": document_inputs["input_ids"],
        "attention_mask": document_inputs["attention_mask"],
        "labels": labels,
    }

    return model_inputs


In [70]:
temp_df = train_df.head(3)

In [71]:
temp_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2288,58110,[java/org/apache/jasper/compiler/ErrorDispatch...,tomcat70,UTF JDT Wrapper JspCompilationContext size Def...,– JSP compiler points error to wrong line num...,Created attachment 32888 [details]\nApache Tom...
2208,55656,[java/org/apache/catalina/startup/Catalina.jav...,tomcat70,patch realms Loader server thrown classes load...,– Server ClassLoader not used for Server crea...,Created attachment 30931 [details]\nproposed p...
1036,340338,[org.eclipse.jdt.ui/ui/org/eclipse/jdt/interna...,eclipse.jdt.ui,invoking overwrites select select Ctrl charAtB...,– [content assist] Proposal does not replace ...,3.1.\nInserting a proposal does not replace th...


In [72]:
sr = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in temp_df.iterrows()]

In [58]:
type(sr)

list

In [64]:
print(sr)

[{'input_ids': [6357, 26, 11352, 3538, 10927, 784, 221, 5756, 7, 908, 24263, 3059, 2138, 834, 26346, 5, 4241, 3, 18, 848, 52, 127, 934, 5, 10500, 27, 15687, 12, 9268, 8, 336, 1205, 96, 121, 2493, 6, 11, 446, 4274, 2890, 699, 500, 3505, 12, 689, 381, 1713, 927, 84, 19, 337, 38, 8, 336, 1205, 2493, 5, 3636, 10, 3, 14817, 14817, 18, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 3, 2, 1454, 1741, 543, 543, 8532, 9886, 2423, 31, 6675, 371, 6039, 31, 738, 25160, 2423, 31, 6327, 87, 10500, 31, 1454, 3155, 3, 2, 1454, 1741, 543, 4830, 3274, 3, 31, 27578, 5, 13780, 5, 1935, 31, 3, 1454, 3155, 3, 2, 1454, 55, 22341, 794, 41, 61, 3, 2, 6792, 2, 17057, 3155, 570, 17057, 21486, 15, 7, 3274, 206, 195, 117, 3, 99, 41, 3350, 17057, 21486, 15, 7, 2423, 2423, 29, 83, 40, 1820, 9175, 570, 17057, 21486, 15, 7, 5, 7991, 9960, 2423, 2423, 

In [73]:
train_data = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in train_df.iterrows()]
test_data = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in test_df.iterrows()]
valid_data = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in valid_df.iterrows()]

In [74]:
from datasets import Dataset, concatenate_datasets
tokenized_datasets = {
    'train': train_data,
    'test': test_data,
    'validation': valid_data
}

### Fine tuning

In [75]:
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [76]:

# Adjust the batch size based on available GPU memory
batch_size = 16

# Set up the Seq2SeqTrainingArguments
model_name = model_checkpoint.split("/")[-1]
training_args = Seq2SeqTrainingArguments(
    f"{model_name}-T5_keyphrase",  # Change the output directory name
    learning_rate=2e-5,
    evaluation_strategy = "epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    # push_to_hub=True,
)


In [77]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

### Trainer

In [78]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### Train

In [79]:
trainer = Seq2SeqTrainer(
    model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [80]:
train_results = trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


TypeError: type list doesn't define __round__ method

In [None]:
# from tqdm.notebook import tqdm
#
# # Initialize tqdm progress bar
# progress_bar = tqdm(total=training_args.num_train_epochs, desc="Epoch")
#
# # Train the model with tqdm progress bar
# for epoch in range(training_args.num_train_epochs):
#     trainer.train()
#     progress_bar.update(1)
#
# # Close the tqdm progress bar
# progress_bar.close()

In [None]:
# print("Training Losses:", train_results.training_loss)
# print("Evaluation Metrics:", train_results.metrics)

#### Save the model

In [53]:
output_directory = "../FineTunedModels/T5_keyphrase"

In [None]:
# Save the trained model and configuration
trainer.save_model(output_directory)


### Load the model

In [54]:
# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained(output_directory)


### Evaluate with test data set

In [55]:
# Evaluate the model on the test dataset
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

In [None]:
# see output with test data
print("Evaluation Metrics:", test_results.metrics)


## Generate Output

In [86]:
# Assuming you have a CUDA-capable device, check if it's available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the model to the appropriate device
model.to(device)

# Assuming 'text' is a string, convert it to a list to use the generator
text = test_df.iloc[1]["bug_description"]
keyphrases = generator([text])  # Convert 'text' to a list and pass it to the generator

print(keyphrases)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [57]:
# print the actual output
print(test_df.iloc[0]["reformed_query"])

print type analyzeCode Local expressions missing compiler System lambda


In [88]:
model.to('cpu')

# Move the input tensor to the CPU
input_ids = tokenizer(test_df.iloc[1]["bug_description"], return_tensors="pt").input_ids.to('cpu')

# Generate the output
output = model.generate(input_ids, max_length=20, num_beams=10, early_stopping=True, top_k=50, num_return_sequences=5)

# Decode and print the output
print(tokenizer.decode(output[0], skip_special_tokens=True))

patch applicationcontextfacade ; type safety ; unchecked cast 
