In [1]:
import json
import pandas as pd

In [6]:
# load json dictionary from file
file_path = '../../../Data/Augmented/Cleaned_newLine_Data.json'
try:
    with open(file_path, 'r') as f:
        file_contents = f.read()
        # print(file_contents)  # Print the contents of the file

    data = json.loads(file_contents)
    # Process the JSON data here

except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
except FileNotFoundError:
    print(f"File not found: '{file_path}'")
except Exception as e:
    print("Error:", e)

In [8]:
# convert dictionary to dataframe
dataset = pd.DataFrame.from_dict(data)

In [9]:
dataset.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,[updated xmpp xmpp chat updated updated room x...,[XMPP] Room subject does not get updated in x...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[Suspended Context client Connection Container...,ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,[Application Container Standalone Factory Cont...,Standalone ClientApplication is breaks in lin...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[handleAsynchEvent bins handleSharedObjectMess...,deserializeSharedObjectMessage with custom Cl...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[group Group Thread SOManager crash load share...,"The ""send file"" functionality fails and launc...",>>> Environment: WinXP + Java ..0_06 + Eclipse...


## Now check query effectiveness and add to a new column effective queries

In [11]:
# create new empty column of list type for effective queries
dataset['effective_queries'] = [[] for _ in range(len(dataset))]

In [12]:
dataset.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description,effective_queries
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,[updated xmpp xmpp chat updated updated room x...,[XMPP] Room subject does not get updated in x...,When updated remotely by xmpp server title of ...,[]
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[Suspended Context client Connection Container...,ECF Generic provider thread interlock,We see the following problem while running an ...,[]
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,[Application Container Standalone Factory Cont...,Standalone ClientApplication is breaks in lin...,The standalone org.eclipse.ecf.provider.app.Cl...,[]
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[handleAsynchEvent bins handleSharedObjectMess...,deserializeSharedObjectMessage with custom Cl...,when sending a instance of a custom Class in a...,[]
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[group Group Thread SOManager crash load share...,"The ""send file"" functionality fails and launc...",>>> Environment: WinXP + Java ..0_06 + Eclipse...,[]


In [16]:
from IR.Searcher.Index_Searcher import Index_Searcher

# to check effective query, we need to search with the each reformed_query and if the result is within the top 10 results, then it is effective
searcher = Index_Searcher()

In [31]:
count_effective = 0
count_each_row_effective = 0

# iterate over each row and check if query is effective
for index, row in dataset.iterrows():
    queries = row['reformed_query']
    ground_truths = row['ground_truth'] # ground truth answer(s). it's a list

    row_effective = False

    effective_queries = []
    for query in queries:
        # print(f'query: {query}')
        # print(type(query))
        # search with the query
        results = searcher.search(query, top_K_results=10)
        # check if the query is effective. it is effective if the ground truth answer(s) is within the top 10 results
        for ground_truth in ground_truths:
            if ground_truth in results:
                # print('Ground truths')
                # print(ground_truth)
                #
                # print('Results:')
                # print(results)

                # if effective, add the query to the effective_queries column
                effective_queries.append(query)

                count_effective += 1

                if(row_effective == False):
                    count_each_row_effective += 1
                    row_effective = True

    # add the effective queries to the effective_queries column
    dataset.at[index, 'effective_queries'] = effective_queries

    if(count_effective % 20 == 0):
        print(f'count_effective: {count_effective}')
        print(f'index: {index} - count_each_row_effective: {count_each_row_effective}\n')

# create a log file and print the results
with open('../../../Data/Augmented/effective_queries_log_top_10.txt', 'a') as f:
    f.write(f'Total Dataset Size: {len(dataset)}\n')
    f.write(f'count_effective: {count_effective} - count_each_row_effective: {count_each_row_effective}\n\n')
    f.write(f'Data with effective queries: {count_effective}\n')
    f.write(f'Data with not effective queries: {len(dataset) - count_each_row_effective}\n\n')
    f.write(f'Mean effective query per data point: {count_effective / len(dataset)}\n')
    f.write(f'Percentage of effective queries in each row: {count_each_row_effective / len(dataset)}\n')
    f.write(f'Percentage of not effective queries in each row: {(len(dataset) - count_each_row_effective) / len(dataset)}\n')



count_effective: 0
index: 0 - count_each_row_effective: 0

count_effective: 20
index: 4 - count_each_row_effective: 4

count_effective: 20
index: 5 - count_each_row_effective: 4

count_effective: 60
index: 17 - count_each_row_effective: 12

count_effective: 80
index: 28 - count_each_row_effective: 17

count_effective: 80
index: 29 - count_each_row_effective: 17

count_effective: 80
index: 30 - count_each_row_effective: 17

count_effective: 80
index: 31 - count_each_row_effective: 17

count_effective: 120
index: 38 - count_each_row_effective: 24

count_effective: 260
index: 79 - count_each_row_effective: 54

count_effective: 320
index: 92 - count_each_row_effective: 65

count_effective: 440
index: 122 - count_each_row_effective: 89

count_effective: 580
index: 145 - count_each_row_effective: 109

count_effective: 600
index: 149 - count_each_row_effective: 113

count_effective: 620
index: 153 - count_each_row_effective: 117

count_effective: 640
index: 157 - count_each_row_effective: 121

In [32]:
dataset.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description,effective_queries
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,[updated xmpp xmpp chat updated updated room x...,[XMPP] Room subject does not get updated in x...,When updated remotely by xmpp server title of ...,[]
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[Suspended Context client Connection Container...,ECF Generic provider thread interlock,We see the following problem while running an ...,[IConnectContext Message IConnection SOContain...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,[Application Container Standalone Factory Cont...,Standalone ClientApplication is breaks in lin...,The standalone org.eclipse.ecf.provider.app.Cl...,[Application Container Standalone Factory Cont...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[handleAsynchEvent bins handleSharedObjectMess...,deserializeSharedObjectMessage with custom Cl...,when sending a instance of a custom Class in a...,[handleAsynchEvent bins handleSharedObjectMess...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[group Group Thread SOManager crash load share...,"The ""send file"" functionality fails and launc...",>>> Environment: WinXP + Java ..0_06 + Eclipse...,[group Group Thread SOManager crash load share...


In [33]:
dataset.tail()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description,effective_queries
2315,59908,[java/org/apache/tomcat/websocket/FutureToSend...,tomcat70,[Unit Close TimeUnit websoket TimeoutException...,Tomcat reports empty(null) close reason if se...,If Tomcat tryes to send large enough message t...,[Unit Close TimeUnit websoket TimeoutException...
2316,59923,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[Queue fair problems problems fairQueue invali...,The default value of validationInterval attri...,The validationInterval attribute is seconds b...,[Queue fair problems problems fairQueue invali...
2317,60008,[java/org/apache/catalina/filters/CorsFilter.j...,tomcat70,[CORS Content POST View server Origin access f...,Tomcat CORS filter not allowing origin with f...,The CORS filter not allowing request and retur...,[CORS Content POST View server Origin access f...
2318,60041,[java/org/apache/catalina/loader/LocalStrings....,tomcat70,[entry Failed lastJarAccessed Manifest getJarE...,NPE in WebappClassLoaderBase,"After deploy war in tomcat, delete the jar in ...",[entry Failed lastJarAccessed Manifest getJarE...
2319,60043,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[log care closing documentation suspectTimeout...,suspectTimeout does not work as expected with...,Already posted this to the mailinglist here: h...,[log care closing documentation suspectTimeout...


# save the dataset to a json file

In [34]:
# copy the dataframe to a new dataframe
dataset_copy = dataset.copy()

In [35]:
# drop the rows with empty effective_queries
dataset_copy = dataset_copy[dataset_copy['effective_queries'].map(lambda d: len(d)) > 0]

In [36]:
dataset_copy.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description,effective_queries
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[Suspended Context client Connection Container...,ECF Generic provider thread interlock,We see the following problem while running an ...,[IConnectContext Message IConnection SOContain...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,[Application Container Standalone Factory Cont...,Standalone ClientApplication is breaks in lin...,The standalone org.eclipse.ecf.provider.app.Cl...,[Application Container Standalone Factory Cont...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[handleAsynchEvent bins handleSharedObjectMess...,deserializeSharedObjectMessage with custom Cl...,when sending a instance of a custom Class in a...,[handleAsynchEvent bins handleSharedObjectMess...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[group Group Thread SOManager crash load share...,"The ""send file"" functionality fails and launc...",>>> Environment: WinXP + Java ..0_06 + Eclipse...,[group Group Thread SOManager crash load share...
6,168382,[framework/bundles/org.eclipse.ecf.remoteservi...,ecf,[runIt executor Actual Render Executor Prepare...,[Regression] Exception is thrown out when pre...,Description: Exception is thrown out when prev...,[prepare report Tcp executor StandardContext A...


In [37]:
len(dataset_copy)

1738

# save the both datasets to a json file

In [39]:
# save the dataset to a json file
dataset_copy.to_json('../../../Data/Augmented/cleaned_effective.json', orient='records')
dataset.to_json('../../../Data/Augmented/Reference_cleaned_effective_for_stat.json', orient='records')