## Load Dataset

In [1]:
import numpy as np
import pandas as pd

import json

In [2]:
file_path = '../Data/All_Data.json'

# Parse the dataset straight from the file handle. UTF-8 is requested explicitly
# so decoding does not depend on the platform's default encoding.
# Each handler prints a short diagnosis and then re-raises: if the load fails,
# `data` is never bound, and letting the notebook continue would only surface a
# misleading NameError in the next cell.
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
    raise
except FileNotFoundError:
    print(f"File not found: '{file_path}'")
    raise
except Exception as e:
    print("Error:", e)
    raise

In [3]:
dataset_df = pd.DataFrame.from_dict(data)

In [4]:
dataset_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,subject chat XMPP title updated updated xmpp u...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,IConnectContext Message IConnection SOContaine...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,ExceptionInInitializerError eclipse eclipse ge...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,deserialize handleAsynchEvent processAsynch Bi...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,Shared createObject ECF launching Group Win Cr...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...


### Split the dataset_df into train test

In [5]:
# First split: 15% of the rows are held out, the remaining 85% form train_df.
from sklearn.model_selection import train_test_split

# random_state is fixed so the same rows are selected on every run.
train_df, test_df = train_test_split(dataset_df, test_size=0.15, random_state=42)

In [6]:
# Second split: the 15% hold-out is divided again, 65% into valid_df and 35% into test_df.
# NOTE(review): this rebinds test_df, so re-running this cell on its own splits the
# already-reduced test set again; re-run the previous split cell first.
valid_df, test_df = train_test_split(test_df, test_size=0.35, random_state=42)

In [7]:
type(train_df)

pandas.core.frame.DataFrame

In [8]:
train_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2288,58110,[java/org/apache/jasper/compiler/ErrorDispatch...,tomcat70,UTF JDT Wrapper JspCompilationContext size Def...,– JSP compiler points error to wrong line num...,Created attachment 32888 [details]\nApache Tom...
2208,55656,[java/org/apache/catalina/startup/Catalina.jav...,tomcat70,patch realms Loader server thrown classes load...,– Server ClassLoader not used for Server crea...,Created attachment 30931 [details]\nproposed p...
1036,340338,[org.eclipse.jdt.ui/ui/org/eclipse/jdt/interna...,eclipse.jdt.ui,invoking overwrites select select Ctrl charAtB...,– [content assist] Proposal does not replace ...,3.1.\nInserting a proposal does not replace th...
457,21792,[org.eclipse.jdt.launching/launching/org/eclip...,eclipse.jdt.debug,argument Duser arguments passed argument dir e...,– vm arguments ending with a backslash cause ...,"When creating a launch configuration, if one s..."
1530,221019,[ui/org.eclipse.pde.core/src/org/eclipse/pde/i...,eclipse.pde.ui,bundle container entries Require Bug Bug class...,– Duplicated entries in classpath container,When a bundle is added as a Require-Bundle to ...


In [9]:
train_df.shape

(1972, 6)

## ML Works

### Evaluation Module

In [10]:
# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'evaluate'", so the package must be
# installed in the kernel's environment before the metric cells can run.
import evaluate
from evaluate import load, list_evaluation_modules
# Print the names of the evaluation modules that can be loaded.
print(list_evaluation_modules())

ModuleNotFoundError: No module named 'evaluate'

In [None]:
# Load the BLEU metric; compute_metrics() later calls metric.compute() on decoded text.
metric = load("bleu")

In [None]:
print(metric)

In [None]:
# Sanity check of the loaded metric: predictions and references are identical strings.
test_pred = ["this is a test", "this is another test"]
test_true = ["this is a test", "this is another test"]
metric.compute(predictions=test_pred, references=test_true)

### Transformers

In [None]:
from transformers import (
    Text2TextGenerationPipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

import transformers



print(transformers.__version__)

In [None]:
# Checkpoint identifier shared by the tokenizer and model loading cells.
model_checkpoint = "ml6team/keyphrase-generation-t5-small-inspec"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Checkpoints listed here get the "summarize: " task prefix; every other
# checkpoint gets an empty prefix. ("t5-large" was previously misspelled as
# "t5-larg", so that checkpoint silently fell through to the empty prefix.)
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", "Salesforce/codet5-small"]:
    prefix = "summarize: "
else:
    prefix = ""

In [None]:
train_df.columns

In [None]:
max_input_length = 1024
max_target_length = 50

keyphrase_sep_token = ';'

def preprocess_function(bug_description, reformed_query):
    """Tokenize one (bug description, reformed query) pair for seq2seq training.

    Relies on the module-level ``tokenizer`` being initialised.

    Args:
        bug_description: Bug-report text used as the encoder input. It is
            padded/truncated to ``max_input_length`` tokens.
        reformed_query: Whitespace-separated query terms. The terms are joined
            with ``keyphrase_sep_token`` to build the target text, which is
            padded/truncated to ``max_target_length`` tokens.

    Returns:
        dict with the keys ``input_ids``, ``attention_mask`` and ``labels``.
        Each value is a flat list of ints for a single example (no batch
        dimension). Padding positions in ``labels`` are replaced by -100.
    """
    # No return_tensors here: each example must be a flat sequence so that the
    # data collator can stack examples into a (batch, seq_len) tensor itself.
    document_inputs = tokenizer(
        bug_description,
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )

    keyphrases = reformed_query.split()
    target_text = f" {keyphrase_sep_token} ".join(keyphrases)
    targets = tokenizer(
        target_text,
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    # Replace padding ids with -100 in a single pass over the token ids.
    pad_id = tokenizer.pad_token_id
    labels = [token_id if token_id != pad_id else -100 for token_id in targets["input_ids"]]

    model_inputs = {
        "input_ids": document_inputs["input_ids"],
        "attention_mask": document_inputs["attention_mask"],
        "labels": labels,
    }

    return model_inputs


In [None]:
temp_df = train_df.head(3)

In [None]:
temp_df.head()

In [None]:
sr = [preprocess_function(row["bug_description"], row["reformed_query"]) for _, row in temp_df.iterrows()]

In [None]:
type(sr)

In [None]:
print(sr)

In [None]:
def _tokenize_frame(frame):
    """Apply preprocess_function to every row of ``frame`` and return the results as a list."""
    return [
        preprocess_function(record["bug_description"], record["reformed_query"])
        for _, record in frame.iterrows()
    ]

# Same processing order as before: train, then test, then validation.
train_data = _tokenize_frame(train_df)
test_data = _tokenize_frame(test_df)
valid_data = _tokenize_frame(valid_df)

In [None]:
from datasets import Dataset, concatenate_datasets
# Collect the three tokenized splits under the names the trainer cells look up.
tokenized_datasets = dict(
    train=train_data,
    test=test_data,
    validation=valid_data,
)

### Fine tuning

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:

# Adjust the batch size based on available GPU memory
batch_size = 16

# Output directory is named after the last path component of the checkpoint id.
model_name = model_checkpoint.split("/")[-1]
training_args = Seq2SeqTrainingArguments(
    f"{model_name}-T5_keyphrase",  # output directory
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; a hard-coded True makes this cell
    # fail on CPU-only machines, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    # push_to_hub=True,
)


In [None]:
# Batch collator for the seq2seq trainer; label padding uses -100, the same value
# preprocess_function writes into padded label positions.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

### Trainer

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Decode predictions and references and score them with the loaded ``metric``.

    Relies on the module-level ``tokenizer``, ``metric``, ``nltk`` and ``np``.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict of metric values rounded to 4 decimals. Float scores are scaled
        by 100, integer values are reported unchanged, and list-valued entries
        are flattened into ``<key>_<position>`` items (1-based), each scaled
        by 100. ``gen_len`` is the mean number of non-padding tokens per
        prediction.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded, so map it to the pad id
    # in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put every sentence on its own line before scoring.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    raw_scores = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # A metric result may mix floats, integer counts and lists. Multiplying
    # a list by 100 repeats it and round() rejects it, so each value kind is
    # handled explicitly.
    result = {}
    for key, value in raw_scores.items():
        if isinstance(value, (list, tuple, np.ndarray)):
            for position, item in enumerate(value, start=1):
                result[f"{key}_{position}"] = item * 100
        elif isinstance(value, (int, np.integer)):
            result[key] = value
        else:
            result[key] = value * 100

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### Train

In [None]:
# Assemble the trainer: training uses the "train" split, per-epoch evaluation
# uses the "validation" split, and compute_metrics scores the generated text.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
train_results = trainer.train()

In [None]:
# from tqdm.notebook import tqdm
#
# # Initialize tqdm progress bar
# progress_bar = tqdm(total=training_args.num_train_epochs, desc="Epoch")
#
# # Train the model with tqdm progress bar
# for epoch in range(training_args.num_train_epochs):
#     trainer.train()
#     progress_bar.update(1)
#
# # Close the tqdm progress bar
# progress_bar.close()

In [None]:
# print("Training Losses:", train_results.training_loss)
# print("Evaluation Metrics:", train_results.metrics)

#### Save the model

In [None]:
output_directory = "../FineTunedModels/T5_keyphrase"

In [None]:
# Save the trained model and configuration
trainer.save_model(output_directory)


### Load the model

In [None]:
# Load the fine-tuned model
# model = AutoModelForSeq2SeqLM.from_pretrained(output_directory)


### Evaluate with test data set

In [None]:
# Evaluate the model on the test dataset
# test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])

In [None]:
# see output with test data
# print("Evaluation Metrics:", test_results.metrics)


## Generate Output

In [None]:
# Print the reference reformed query stored for the first test row.
# NOTE(review): the generation cells below run on test row 1, not row 0 —
# confirm which row is meant to be compared.
print(test_df.iloc[0]["reformed_query"])

In [None]:
# Run inference on the CPU, and force eval mode so dropout is disabled
# regardless of the state the training loop left the model in.
model.to('cpu')
model.eval()

# Tokenize with the same truncation limit that preprocess_function applies to
# training inputs, so inference sees inputs of the same maximum length.
input_ids = tokenizer(
    test_df.iloc[1]["bug_description"],
    truncation=True,
    max_length=max_input_length,
    return_tensors="pt",
).input_ids.to('cpu')

# Beam search with 10 beams, keeping the 5 best sequences.
output = model.generate(input_ids, max_length=40, num_beams=10, early_stopping=True, top_k=50, num_return_sequences=5)

# Decode and print the first returned sequence
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
print(test_df.iloc[1]["bug_description"])

In [None]:
print(test_df.iloc[1]["reformed_query"])

In [None]:
sample_description = test_df.iloc[1]["bug_description"]
print(sample_description)

# Truncate to the limit used for training inputs and place the ids on whatever
# device the model currently lives on.
input_ids = tokenizer(
    sample_description,
    truncation=True,
    max_length=max_input_length,
    return_tensors="pt",
).input_ids.to(model.device)
output = model.generate(input_ids, max_length=40, num_beams=60, early_stopping=True, top_k=50, num_return_sequences=5)

# Decode every returned sequence once, then print the first, the count, and the full list.
candidates = tokenizer.batch_decode(output, skip_special_tokens=True)
print(candidates[0])
print(len(output))

for candidate in candidates:
    print(candidate)