## Load Dataset

In [1]:
import numpy as np
import pandas as pd

import json

In [2]:
file_path = 'Data/All_Data.json'

# Parse the dataset file into `data`.
# Known failure modes are reported with a short message and then re-raised:
# swallowing them would leave `data` undefined and make the next cell fail
# with an unrelated NameError. Any other exception propagates unchanged.
try:
    # Explicit UTF-8 so decoding does not depend on the platform default
    # encoding (the bug reports contain non-ASCII characters).
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
    raise
except FileNotFoundError:
    print(f"File not found: '{file_path}'")
    raise

In [3]:
# Build the working DataFrame from the parsed JSON payload.
dataset_df = pd.DataFrame(data)

In [4]:
# Preview the first rows of the loaded dataset.
dataset_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,subject chat XMPP title updated updated xmpp u...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,IConnectContext Message IConnection SOContaine...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,ExceptionInInitializerError eclipse eclipse ge...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,deserialize handleAsynchEvent processAsynch Bi...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,Shared createObject ECF launching Group Win Cr...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...


### Split `dataset_df` into train and test sets

In [5]:
# Hold out 15% of the rows as the test set; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(dataset_df, test_size=0.15, random_state=42)

In [6]:
# Sanity check: the split result is still a pandas DataFrame.
type(train_df)

pandas.core.frame.DataFrame

In [7]:
# Preview the training split (rows are shuffled, so the original index is out of order).
train_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2288,58110,[java/org/apache/jasper/compiler/ErrorDispatch...,tomcat70,UTF JDT Wrapper JspCompilationContext size Def...,– JSP compiler points error to wrong line num...,Created attachment 32888 [details]\nApache Tom...
2208,55656,[java/org/apache/catalina/startup/Catalina.jav...,tomcat70,patch realms Loader server thrown classes load...,– Server ClassLoader not used for Server crea...,Created attachment 30931 [details]\nproposed p...
1036,340338,[org.eclipse.jdt.ui/ui/org/eclipse/jdt/interna...,eclipse.jdt.ui,invoking overwrites select select Ctrl charAtB...,– [content assist] Proposal does not replace ...,3.1.\nInserting a proposal does not replace th...
457,21792,[org.eclipse.jdt.launching/launching/org/eclip...,eclipse.jdt.debug,argument Duser arguments passed argument dir e...,– vm arguments ending with a backslash cause ...,"When creating a launch configuration, if one s..."
1530,221019,[ui/org.eclipse.pde.core/src/org/eclipse/pde/i...,eclipse.pde.ui,bundle container entries Require Bug Bug class...,– Duplicated entries in classpath container,When a bundle is added as a Require-Bundle to ...


In [8]:
# (rows, columns) of the training split.
train_df.shape

(1972, 6)

## Modeling

In [8]:
# Record the installed transformers version so the run can be reproduced.
import transformers

print(transformers.__version__)

4.30.2


In [9]:
# Checkpoint identifier used to load both the tokenizer and the seq2seq model.
model_checkpoint = "ml6team/keyphrase-generation-t5-small-inspec"

In [10]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
# The stock T5 checkpoints get the "summarize: " task prefix; every other
# checkpoint gets an empty prefix.
# Fixed: the list previously contained the misspelling "t5-larg", so the
# "t5-large" checkpoint would never have received the prefix.
if model_checkpoint in ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"):
    prefix = "summarize: "
else:
    prefix = ""

In [13]:
# List the available columns before choosing the model input and target fields.
train_df.columns

Index(['bug_id', 'ground_truth', 'repo', 'reformed_query', 'bug_title',
       'bug_description'],
      dtype='object')

In [15]:
max_input_length = 1024
max_target_length = 20

def preprocess_function(df):
    """Tokenize bug descriptions (model inputs) and reformed queries (targets).

    Args:
        df: DataFrame with text columns ``bug_description`` and
            ``reformed_query``.

    Returns:
        The tokenizer output for the descriptions, truncated/padded to
        ``max_input_length``, with an added ``"labels"`` entry holding the
        token ids of the queries truncated/padded to ``max_target_length``.
        Padding positions in ``"labels"`` are set to -100.
    """
    inputs = df["bug_description"].tolist()
    targets = df["reformed_query"].tolist()

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    label_ids = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")["input_ids"]

    # Labels are padded to a fixed length, so mask the padding with -100 (the
    # index ignored by the loss). Otherwise the model is also trained to
    # predict pad tokens, which skews the loss for short targets.
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

In [15]:
# Three-row sample used to smoke-test preprocess_function before running it on the full split.
temp_df = train_df.head(3)

In [16]:
# Show the sample rows.
temp_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2288,58110,[java/org/apache/jasper/compiler/ErrorDispatch...,tomcat70,UTF JDT Wrapper JspCompilationContext size Def...,– JSP compiler points error to wrong line num...,Created attachment 32888 [details]\nApache Tom...
2208,55656,[java/org/apache/catalina/startup/Catalina.jav...,tomcat70,patch realms Loader server thrown classes load...,– Server ClassLoader not used for Server crea...,Created attachment 30931 [details]\nproposed p...
1036,340338,[org.eclipse.jdt.ui/ui/org/eclipse/jdt/interna...,eclipse.jdt.ui,invoking overwrites select select Ctrl charAtB...,– [content assist] Proposal does not replace ...,3.1.\nInserting a proposal does not replace th...


In [17]:
# Tokenize the three-row sample.
sr = preprocess_function(temp_df)


In [18]:
# Check what kind of object the tokenizer-based preprocessing returns.
type(sr)

transformers.tokenization_utils_base.BatchEncoding

In [19]:
# NOTE(review): this dumps every padded token id for the sample, so the output is very long.
print(sr)

{'input_ids': [[6357, 26, 11352, 3538, 10927, 784, 221, 5756, 7, 908, 24263, 3059, 2138, 834, 26346, 5, 4241, 3, 18, 848, 52, 127, 934, 5, 10500, 27, 15687, 12, 9268, 8, 336, 1205, 96, 121, 2493, 6, 11, 446, 4274, 2890, 699, 500, 3505, 12, 689, 381, 1713, 927, 84, 19, 337, 38, 8, 336, 1205, 2493, 5, 3636, 10, 3, 14817, 14817, 18, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 3, 2, 1454, 1741, 543, 543, 8532, 9886, 2423, 31, 6675, 371, 6039, 31, 738, 25160, 2423, 31, 6327, 87, 10500, 31, 1454, 3155, 3, 2, 1454, 1741, 543, 4830, 3274, 3, 31, 27578, 5, 13780, 5, 1935, 31, 3, 1454, 3155, 3, 2, 1454, 55, 22341, 794, 41, 61, 3, 2, 6792, 2, 17057, 3155, 570, 17057, 21486, 15, 7, 3274, 206, 195, 117, 3, 99, 41, 3350, 17057, 21486, 15, 7, 2423, 2423, 29, 83, 40, 1820, 9175, 570, 17057, 21486, 15, 7, 5, 7991, 9960, 2423, 2423, 

In [13]:
# Tokenize the full training split.
tokenized_train_df = preprocess_function(train_df)

In [21]:
# Inspect the first two tokenized training examples (token ids, attention
# mask and labels). Requires tokenized_train_df, i.e. the result of
# preprocess_function(train_df).
for i in range(2):
    print("Example", i + 1)
    for caption, field in (
        ("Inputs:", "input_ids"),
        ("Attention Mask:", "attention_mask"),
        ("Labels:", "labels"),
    ):
        print(caption, tokenized_train_df[field][i])
    print()


Example 1
Inputs: [6357, 26, 11352, 3538, 10927, 784, 221, 5756, 7, 908, 24263, 3059, 2138, 834, 26346, 5, 4241, 3, 18, 848, 52, 127, 934, 5, 10500, 27, 15687, 12, 9268, 8, 336, 1205, 96, 121, 2493, 6, 11, 446, 4274, 2890, 699, 500, 3505, 12, 689, 381, 1713, 927, 84, 19, 337, 38, 8, 336, 1205, 2493, 5, 3636, 10, 3, 14817, 14817, 18, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 14817, 3, 2, 1454, 1741, 543, 543, 8532, 9886, 2423, 31, 6675, 371, 6039, 31, 738, 25160, 2423, 31, 6327, 87, 10500, 31, 1454, 3155, 3, 2, 1454, 1741, 543, 4830, 3274, 3, 31, 27578, 5, 13780, 5, 1935, 31, 3, 1454, 3155, 3, 2, 1454, 55, 22341, 794, 41, 61, 3, 2, 6792, 2, 17057, 3155, 570, 17057, 21486, 15, 7, 3274, 206, 195, 117, 3, 99, 41, 3350, 17057, 21486, 15, 7, 2423, 2423, 29, 83, 40, 1820, 9175, 570, 17057, 21486, 15, 7, 5, 7991, 9960, 2423, 242

In [14]:
# Tokenize the held-out test split with the same preprocessing as the training split.
tokenized_test_df = preprocess_function(test_df)

### Fine-tuning

In [16]:
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model from the same checkpoint the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [19]:
# Training configuration for the fine-tuning run.
batch_size = 16
# Use the last path component of the checkpoint id as the base of the output name.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    # Output directory. NOTE(review): the "-finetuned-xsum" suffix looks like
    # a leftover from a summarization example; confirm before pushing.
    f"{model_name}-finetuned-xsum",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Generate sequences during evaluation so generation metrics can be computed.
    predict_with_generate=True,
    # Mixed precision is only usable on CUDA devices; enabling it
    # unconditionally raises on CPU-only machines.
    fp16=torch.cuda.is_available(),
    # NOTE(review): pushing to the Hub requires being logged in to Hugging Face.
    push_to_hub=True,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)