In [1]:
import numpy
import pandas as pd
import nltk
import re
import json

In [2]:
file_path = '../../Data/Augmented/All_Data.json'

try:
    with open(file_path, 'r') as f:
        file_contents = f.read()
        # print(file_contents)  # Print the contents of the file

    data = json.loads(file_contents)
    # Process the JSON data here

except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)
except FileNotFoundError:
    print(f"File not found: '{file_path}'")
except Exception as e:
    print("Error:", e)

In [3]:
dataset_df = pd.DataFrame.from_dict(data)

In [4]:
dataset_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,[updated xmpp xmpp chat updated updated room x...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[Suspended Context client Connection Container...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,[Application Container Standalone Factory Cont...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[handleAsynchEvent bins handleSharedObjectMess...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[group Group Thread SOManager crash load share...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java 1.5.0_06 + Eclip...


In [6]:
dataset_df.tail()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2315,59908,[java/org/apache/tomcat/websocket/FutureToSend...,tomcat70,[Unit Close TimeUnit websoket TimeoutException...,– Tomcat reports empty(null) close reason if ...,If Tomcat tryes to send large enough message t...
2316,59923,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[Queue fair problems problems fairQueue invali...,– The default value of validationInterval att...,The validationInterval attribute is 30 seconds...
2317,60008,[java/org/apache/catalina/filters/CorsFilter.j...,tomcat70,[CORS Content POST View server Origin access f...,– Tomcat CORS filter not allowing origin with...,The CORS filter not allowing request and retur...
2318,60041,[java/org/apache/catalina/loader/LocalStrings....,tomcat70,[entry Failed lastJarAccessed Manifest getJarE...,– NPE in WebappClassLoaderBase,"After deploy war in tomcat, delete the jar in ..."
2319,60043,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[log care closing documentation suspectTimeout...,– suspectTimeout does not work as expected wi...,Already posted this to the mailinglist here:\n...


# Data Cleaning

In [7]:
# load stopwords from nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stops_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asifs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asifs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def replace_words(arr, old_words, new_word):
    return [new_word if word in old_words else word for word in arr]



In [9]:
# define a function to clean the text
# def clean_text(text, remove_stop_words=False):
#    # Replace newlines with spaces
#     text = text.replace('\n', ' ')
#
#     # Tokenize the text into words
#     words = word_tokenize(text)
#
#     if(remove_stop_words):
#         words_without_stopwords = [word for word in words if word.lower() not in stops_words]
#     else:
#         words_without_stopwords = words
#
#     old_words = ['``', "''"]
#     words_without_stopwords = replace_words(words_without_stopwords, old_words, '"')
#     # Remove words that are only numbers
#     words_cleaned = [word for word in words_without_stopwords if not word.isdigit()]
#
#     # Join the words back into a cleaned sentence by iterating;
#     # this also removes any extra spaces
#     cleaned_text = ' '.join(words_cleaned)
#
#     return cleaned_text

In [10]:
def remove_number_tokens(text):
    # Use regular expression to remove tokens made up of only numbers
    pattern = r'\b[0-9]+\b'  # Matches one or more digits (numbers) only
    return re.sub(pattern, '', text)

In [11]:
def clean_text(text, remove_stop_words=False):
   # Replace newlines with spaces
    cleaned_text = text.replace('\n', ' ')

    cleaned_text = remove_number_tokens(cleaned_text)
    # Tokenize the text into words
    # words = word_tokenize(text)
    #
    # if(remove_stop_words):
    #     words_without_stopwords = [word for word in words if word.lower() not in stops_words]
    # else:
    #     words_without_stopwords = words
    #
    # old_words = ['``', "''"]
    # words_without_stopwords = replace_words(words_without_stopwords, old_words, '"')
    # # Remove words that are only numbers
    # words_cleaned = [word for word in words_without_stopwords if not word.isdigit()]
    #
    # # Join the words back into a cleaned sentence by iterating;
    # # this also removes any extra spaces
    # cleaned_text = ' '.join(words_cleaned)

    return cleaned_text

In [12]:
# clean the text on the dataset_df dataframe column 'bug_description'
dataset_df['bug_description'] = dataset_df['bug_description'].apply(clean_text)

In [13]:
dataset_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,[updated xmpp xmpp chat updated updated room x...,– [XMPP] Room subject does not get updated in...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[Suspended Context client Connection Container...,– ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,[Application Container Standalone Factory Cont...,– Standalone ClientApplication is breaks in l...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[handleAsynchEvent bins handleSharedObjectMess...,– deserializeSharedObjectMessage with custom ...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[group Group Thread SOManager crash load share...,"– The ""send file"" functionality fails and lau...",>>> Environment: WinXP + Java ..0_06 + Eclipse...


In [14]:
dataset_df.tail()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2315,59908,[java/org/apache/tomcat/websocket/FutureToSend...,tomcat70,[Unit Close TimeUnit websoket TimeoutException...,– Tomcat reports empty(null) close reason if ...,If Tomcat tryes to send large enough message t...
2316,59923,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[Queue fair problems problems fairQueue invali...,– The default value of validationInterval att...,The validationInterval attribute is seconds b...
2317,60008,[java/org/apache/catalina/filters/CorsFilter.j...,tomcat70,[CORS Content POST View server Origin access f...,– Tomcat CORS filter not allowing origin with...,The CORS filter not allowing request and retur...
2318,60041,[java/org/apache/catalina/loader/LocalStrings....,tomcat70,[entry Failed lastJarAccessed Manifest getJarE...,– NPE in WebappClassLoaderBase,"After deploy war in tomcat, delete the jar in ..."
2319,60043,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[log care closing documentation suspectTimeout...,– suspectTimeout does not work as expected wi...,Already posted this to the mailinglist here: h...


In [15]:
def remove_dash(text):
    if (text.startswith(' –')):
        return text[2:]
    if (text.startswith('–')):
        return text[1:]

In [16]:
# Replace dash at the starting of the 'bug_title' column
dataset_df['bug_title'] = dataset_df['bug_title'].apply(remove_dash)

In [17]:
dataset_df.head()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
0,112599,[providers/bundles/org.eclipse.ecf.provider.xm...,ecf,[updated xmpp xmpp chat updated updated room x...,[XMPP] Room subject does not get updated in x...,When updated remotely by xmpp server title of ...
1,125572,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[Suspended Context client Connection Container...,ECF Generic provider thread interlock,We see the following problem while running an ...
2,134483,[framework/bundles/org.eclipse.ecf/src/org/ecl...,ecf,[Application Container Standalone Factory Cont...,Standalone ClientApplication is breaks in lin...,The standalone org.eclipse.ecf.provider.app.Cl...
3,146622,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[handleAsynchEvent bins handleSharedObjectMess...,deserializeSharedObjectMessage with custom Cl...,when sending a instance of a custom Class in a...
4,147269,[framework/bundles/org.eclipse.ecf.provider/sr...,ecf,[group Group Thread SOManager crash load share...,"The ""send file"" functionality fails and launc...",>>> Environment: WinXP + Java ..0_06 + Eclipse...


In [18]:
dataset_df.tail()

Unnamed: 0,bug_id,ground_truth,repo,reformed_query,bug_title,bug_description
2315,59908,[java/org/apache/tomcat/websocket/FutureToSend...,tomcat70,[Unit Close TimeUnit websoket TimeoutException...,Tomcat reports empty(null) close reason if se...,If Tomcat tryes to send large enough message t...
2316,59923,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[Queue fair problems problems fairQueue invali...,The default value of validationInterval attri...,The validationInterval attribute is seconds b...
2317,60008,[java/org/apache/catalina/filters/CorsFilter.j...,tomcat70,[CORS Content POST View server Origin access f...,Tomcat CORS filter not allowing origin with f...,The CORS filter not allowing request and retur...
2318,60041,[java/org/apache/catalina/loader/LocalStrings....,tomcat70,[entry Failed lastJarAccessed Manifest getJarE...,NPE in WebappClassLoaderBase,"After deploy war in tomcat, delete the jar in ..."
2319,60043,[modules/jdbc-pool/src/main/java/org/apache/to...,tomcat70,[log care closing documentation suspectTimeout...,suspectTimeout does not work as expected with...,Already posted this to the mailinglist here: h...


In [19]:
type(dataset_df)

pandas.core.frame.DataFrame

In [20]:
copy_dataset_df = dataset_df.copy()

In [21]:
# convert the dataframe to a list of dictionaries
dataset_df = dataset_df.to_dict('records')



In [22]:
type(dataset_df)

list

In [23]:
# save the list of dictionaries as a json file
with open('../../Data/Augmented/Cleaned_newLine_Data.json', 'w') as f:
    json.dump(dataset_df, f)