In [8]:
import pandas as pd
import os

In [9]:
# Root of the local checkout of the query-reformulation dataset.
# Backslashes are escaped explicitly: sequences such as '\\R' or '\\Q' written with a
# single backslash in a plain string literal are invalid escapes that newer Python
# versions warn about.
# The trailing separator is kept so the sub-paths below can be appended directly.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
base_path = 'D:\\Research\\Data\\QueryReformulaiton\\Forgotten_rule_query_reformulation-master\\'
# base_path already ends with a separator, so none is prepended to the sub-folders
# (prepending one produced a doubled separator in the resulting path).
ground_truth_path = base_path + 'GroundTruth'
reformulated_query_path = base_path + 'NrOptimal-GA\\Query'
bug_report_path = base_path + 'BugReports\\ALL'

In [10]:
# Read the folder names found directly under the bug-report root.
# os.listdir() returns entries in arbitrary order and also lists plain files,
# so keep only directories and sort them to make every run reproducible.
bug_report_folders = sorted(
    entry
    for entry in os.listdir(bug_report_path)
    if os.path.isdir(os.path.join(bug_report_path, entry))
)

In [11]:
# Alias (same list object, not a copy): the bug-report folder names are reused by the
# loops below as the per-repository sub-folder name under each data root.
bug_repos = bug_report_folders

In [None]:
# For every repository in bug_repos, read its ground-truth files (each repository has several).
# Build a dataframe with the columns: bug_id (the file name without its extension), ground_truth (the lines of that file) and repo.




## First dataframe with bug_id and ground truth

In [12]:
# Build one row per ground-truth file, located at <ground_truth_path>/<repo>/<bug_id>.<ext>.
# Rows are collected in a plain list and the DataFrame is created once at the end:
# growing a frame with pd.concat inside the loop re-copies it on every iteration.
ground_truth_records = []
for repo in bug_repos:
    cur_repo_path = os.path.join(ground_truth_path, repo)
    # sorted() because os.listdir() gives no ordering guarantee
    for file_name in sorted(os.listdir(cur_repo_path)):
        # the file name without its extension is the bug id
        bug_id, _ = os.path.splitext(file_name)
        # NOTE(review): the platform default encoding is used - confirm it matches the dataset files
        with open(os.path.join(cur_repo_path, file_name), 'r') as f:
            # One bug can have several ground truths, one per line. Surrounding
            # whitespace is stripped and whitespace-only lines are skipped so a
            # trailing blank line does not become an empty ground-truth entry.
            ground_truth = [line.strip() for line in f if line.strip()]
        ground_truth_records.append(
            {'bug_id': bug_id, 'ground_truth': ground_truth, 'repo': repo}
        )

# columns= keeps the expected schema even when no file was found
ground_truth_df = pd.DataFrame(ground_truth_records, columns=['bug_id', 'ground_truth', 'repo'])

In [13]:
# Preview the first rows to sanity-check the bug_id / ground_truth / repo columns.
ground_truth_df.head()

Unnamed: 0,bug_id,ground_truth,repo
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf


## Second dataframe with bug_id and reformulated query

In [14]:
# Build one row per reformulated query.
# Every repository has its own query file named `which_file`; each of its lines
# holds the bug id, a tab, and then the query.
which_file = 'GA-ALL-QE-K10-1.txt'
# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a frame with pd.concat inside the loop re-copies it on every iteration.
reformed_query_records = []
for repo in bug_repos:
    cur_query_file = os.path.join(reformulated_query_path, repo, which_file)
    # NOTE(review): the platform default encoding is used - confirm it matches the dataset files
    with open(cur_query_file, 'r') as f:
        for line in f:
            # tolerate blank lines (e.g. at the end of the file) instead of
            # failing on the missing second field
            if not line.strip():
                continue
            fields = line.split('\t')
            reformed_query_records.append(
                # strip() removes the trailing newline from the query
                {'bug_id': fields[0], 'reformed_query': fields[1].strip()}
            )

# columns= keeps the expected schema even when no query was read
reformed_query_df = pd.DataFrame(reformed_query_records, columns=['bug_id', 'reformed_query'])




In [15]:
# Preview the first rows to sanity-check the bug_id / reformed_query columns.
reformed_query_df.head()

Unnamed: 0,bug_id,reformed_query
0,209410,handling HTMLified Bug comments properties bot...
1,211585,parsing Discovery authority naming stumbles pa...
2,318086,ecf discovery
3,238976,remote Bug shared side originator editor disco...
4,193415,disconnect press Account Contacts menu logged ...


## Third dataframe with bug_id and bug report

In [16]:
# Build one row per bug-report file, located at <bug_report_path>/<repo>/<bug_id>.<ext>.
# The first line of a report is its title; everything after it is the description.
# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a frame with pd.concat inside the loop re-copies it on every iteration.
bug_report_records = []
for repo in bug_repos:
    cur_repo_path = os.path.join(bug_report_path, repo)
    # sorted() because os.listdir() gives no ordering guarantee
    for file_name in sorted(os.listdir(cur_repo_path)):
        # the file name without its extension is the bug id
        bug_id, _ = os.path.splitext(file_name)
        # NOTE(review): the platform default encoding is used - confirm it matches the dataset files
        with open(os.path.join(cur_repo_path, file_name), 'r') as f:
            bug_title = f.readline().strip()
            bug_description = f.read()

        # Remove a leading 'Bug <bug_id>' (or bare '<bug_id>') marker from the title.
        # Only the start of the title is touched: a global str.replace would also
        # delete the same digits wherever they happen to occur later in the title.
        # NOTE(review): whatever follows the id (the previews show a dash separator)
        # is left in place, exactly as before.
        for prefix in ('Bug ' + bug_id, bug_id):
            if bug_title.startswith(prefix):
                bug_title = bug_title[len(prefix):]
                break

        bug_report_records.append(
            {'bug_id': bug_id, 'bug_title': bug_title, 'bug_description': bug_description}
        )

# columns= keeps the expected schema even when no report was found
bug_report_df = pd.DataFrame(bug_report_records, columns=['bug_id', 'bug_title', 'bug_description'])

In [17]:
# Preview the first rows to sanity-check the bug_id / bug_title / bug_description columns.
bug_report_df.head()

Unnamed: 0,bug_id,bug_title,bug_description
0,112599,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...


## Merge the three dataframes

In [18]:
# merge the three dataframes based on 'bug_id'

In [19]:
# Join the ground truth with the reformulated queries on their shared 'bug_id' column
# (default join type, so only bug ids present in both frames are kept).
merged_df = ground_truth_df.merge(reformed_query_df, on='bug_id')

In [20]:
# Join all three frames on 'bug_id'.
# The result is rebuilt from the three source frames rather than from the previous
# value of merged_df, so re-running this cell always gives the same columns
# (merging merged_df with bug_report_df a second time would produce
# bug_title_x / bug_title_y style duplicates).
# NOTE(review): the join key is bug_id alone - if two repositories can share a bug id,
# their rows would cross-match; confirm ids are unique across repositories.
merged_df = (
    ground_truth_df
    .merge(reformed_query_df, on='bug_id')
    .merge(bug_report_df, on='bug_id')
)

In [21]:
# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,subject chat XMPP title updated updated xmpp u...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,IConnectContext Message IConnection SOContaine...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,ExceptionInInitializerError eclipse eclipse ge...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,deserialize handleAsynchEvent processAsynch Bi...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,Shared createObject ECF launching Group Win Cr...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...


In [22]:
# (number of rows, number of columns) of the merged frame.
merged_df.shape

(2332, 6)

## Save the merged dataframe as a json file

In [23]:
# convert each row of the dataframe to a json object and save it in a json file


In [24]:
import json


In [25]:
# Turn the frame into a list with one dict per row, keyed by column name.
# (Despite its name, merged_df_dict is a list of dicts.)
merged_df_dict = merged_df.to_dict(orient='records')

In [33]:
# Save the list of dictionaries as a single JSON file.
output_path = 'Data/All_Data.json'
# Create the output folder on first run; open() would otherwise fail with
# FileNotFoundError when 'Data' does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    json.dump(merged_df_dict, f)

#### read the json file and convert it to a dataframe

In [34]:
import json

file_path = 'Data/All_Data.json'

# Load the saved records back into `data`.
# Failures are reported and then re-raised: if they were only printed, `data` would
# stay undefined (or keep a stale value from an earlier run) and the next cell would
# fail or silently use outdated content.
try:
    with open(file_path, 'r') as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
    raise
except FileNotFoundError:
    print(f"File not found: '{file_path}'")
    raise

#### convert the list of dictionaries to a dataframe

In [35]:
# Rebuild a dataframe from the loaded records: each dict becomes one row and
# its keys become the column names.
df = pd.DataFrame(data)

In [36]:
# Preview the first rows of the frame rebuilt from the JSON file.
df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,subject chat XMPP title updated updated xmpp u...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,IConnectContext Message IConnection SOContaine...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,ExceptionInInitializerError eclipse eclipse ge...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,deserialize handleAsynchEvent processAsynch Bi...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,Shared createObject ECF launching Group Win Cr...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...
