# Setup and Preparation



In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read and write
# files through paths relative to a folder inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Data cleaning, preparation, and train/test split
df_android = pd.read_csv('AndroidFields.csv')
# Discard Android reports whose Status is exactly 'New'.
mask_new = df_android['Status'] == 'New'
df_android = df_android[~mask_new]
# reset_index moves the old index into an 'index' column, which the drop below removes again.
df_android.reset_index(inplace = True)
# From Frank: we do need some of the features so don't drop them. For example, BugID and MergeID are used to locate duplicates and their original bugs.
df_android = df_android.drop(['crypto', 'general', 'java', 'networking', 'MasterID','Product','Component','Priority','PriorityNumber','Version','OpenDate','CloseDate','Stars','VersionNumber','Summary','index'],axis=1)
# Keep a copy of the original Status values in a separate 'Label' column.
df_android['Label'] = df_android['Status']

# Eclipse: same steps, but no 'New' filtering, and 'Product'/'Component' are kept.
df_eclipse = pd.read_csv('EclipseFields.csv')
df_eclipse.reset_index(inplace = True)
df_eclipse = df_eclipse.drop(['crypto', 'general', 'java', 'networking', 'MasterID','Priority','PriorityNumber','Version','OpenDate','CloseDate','Stars','VersionNumber','Summary','index'],axis=1)
df_eclipse['Label'] = df_eclipse['Status']

# 70/30 train/test split with a fixed seed so the split is repeatable.
android_train, android_test = train_test_split(df_android, test_size=0.3, random_state=42)
eclipse_train, eclipse_test = train_test_split(df_eclipse, test_size=0.3, random_state=42) # From Frank: Added this for eclipse dataset

# android_train.dropna()
# # android_test.dropna()
# eclipse_train.dropna()
# eclipse_test.dropna()
# train.to_csv("bug_train.csv")
# test.to_csv("bug_test.csv")


# Preliminary Preprocessing

Includes tasks such as lowercasing, punctuation removal, stop-word removal, stemming, and removal of repeated words.

Temporary block for some preprocessing

In [None]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import nltk
# Tokenizer data used by word_tokenize.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# on older releases this request just reports that the package is unknown.
nltk.download('punkt_tab')
# Stop-word lists used by preprocessing_text.
nltk.download('stopwords')

# Temporary preprocessing
# From Frank: Personally I think we should keep this. Bert Tokenizer doesn't really
#             do these things. It looks like BERT tokenizer will process the text
#             for it to only be better utilized by BERT. So they are doing different jobs.
# Cache so the stop-word list is loaded once instead of once per report.
_STOP_WORD_CACHE = {}


def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, loading them on first use."""
  if 'english' not in _STOP_WORD_CACHE:
    _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORD_CACHE['english']


def preprocessing_text(text):
  """Normalise one report's text into a space-separated string of unique stems.

  Steps: tokenize, keep purely alphabetic tokens, lowercase, drop English stop
  words, Porter-stem, and remove repeated stems.

  Args:
    text: raw report text (a string).

  Returns:
    The surviving stems joined by single spaces, in order of first appearance.
  """
  # Tokenization
  tokens = word_tokenize(text)

  # Lowercasing and punctuation removal
  tokens = [word.lower() for word in tokens if word.isalpha()]

  # Stop word removal
  stop_words = _english_stop_words()
  tokens = [word for word in tokens if word not in stop_words]

  # Stemming (Word root extraction)
  stemmer = PorterStemmer()
  tokens = [stemmer.stem(word) for word in tokens]

  # Remove repetitive words (not included in the original paper).
  # dict.fromkeys keeps first-occurrence order; the previous list(set(...)) produced
  # an arbitrary order that changes between kernel sessions, so exports were not reproducible.
  tokens = list(dict.fromkeys(tokens))

  return ' '.join(tokens) # put words back into a sentence


# Topic Modelling and Feature Selection

Perform topic modelling for all bug reports: first group the reports by their status/resolution, then split each group into 10 topics, and finally run feature selection within each topic separately.

At the end of this block, there are two dataframes that can be used for training/evaluating.





In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from scipy.stats import pearsonr, chi2_contingency
from sklearn.preprocessing import LabelEncoder

# Topic modelling should be performed on reports that have the same status/resolution.
# This method will categorize all reports based on their status/resolution.
# It returns a list of dfs, each of which has the same status/resolution.
def categorize_reports(df, status_list):
  """Split reports into one DataFrame per status, plus a final "orphan" frame.

  Each status category holds the reports whose lower-cased 'Status' equals that
  status, together with the duplicate reports whose 'MergeID' points at the
  'BugID' of one of those reports.

  Args:
    df: DataFrame with 'Status', 'BugID' and 'MergeID' columns.
    status_list: lower-case status names, one category per entry.

  Returns:
    A list of len(status_list) + 1 DataFrames. The last one holds the orphan
    duplicates: duplicates whose master report is in none of the categories.
  """
  status_lower = df['Status'].str.lower()
  is_duplicate = status_lower == 'duplicate'
  # Tracks duplicates already attached to at least one status category.
  claimed_duplicates = pd.Series(False, index=df.index)

  categories = []
  for status in status_list:
    has_status = status_lower == status
    duplicate_of_status = is_duplicate & df['MergeID'].isin(df.loc[has_status, 'BugID'])
    categories.append(pd.DataFrame(df[has_status | duplicate_of_status]))
    claimed_duplicates = claimed_duplicates | duplicate_of_status

  # Add orphan reports. The previous version tested only against the last status
  # in the list (leaked loop variable), so duplicates of every other status were
  # emitted twice and an empty status_list raised NameError.
  categories.append(pd.DataFrame(df[is_duplicate & ~claimed_duplicates]))
  return categories

# Perform topic modelling for one category
# It returns ten dfs in a list in which the reports share the same topic
def topic_modelling(category_df, num_topics=10, passes=1, random_state=42):
  """Split one category of reports into per-topic DataFrames using gensim LDA.

  Args:
    category_df: DataFrame with an 'all_textual_data' column of
      whitespace-separated tokens.
    num_topics: number of LDA topics, and therefore of returned frames.
    passes: LDA training passes (raise for better convergence, lower for speed).
    random_state: seed passed to LdaModel so the assignment is repeatable.

  Returns:
    A list of num_topics DataFrames (copies). Entry i holds the reports whose
    most probable topic is i; entries may be empty.
  """
  documents = category_df['all_textual_data'].tolist()
  tokenized_documents = [document.split() for document in documents]

  # LDA cannot be trained without any terms (empty category, or every report
  # reduced to an empty string). Put whatever rows exist in the first topic.
  if not any(tokenized_documents):
    no_rows = category_df.iloc[0:0]
    return [category_df.copy()] + [no_rows.copy() for _ in range(num_topics - 1)]

  # Create a gensim Dictionary and the bag-of-words corpus
  gensim_dict = Dictionary(tokenized_documents)
  corpus = [gensim_dict.doc2bow(tokens) for tokens in tokenized_documents]

  # Perform topic modeling using LDA from gensim
  lda_model = LdaModel(corpus=corpus, id2word=gensim_dict, num_topics=num_topics,
                       passes=passes, random_state=random_state)

  # Collect the row positions of the documents assigned to each topic.
  rows_per_topic = [[] for _ in range(num_topics)]
  for row_position, bag_of_words in enumerate(corpus):
    # minimum_probability=0.0 keeps every topic in the result, so max() never
    # sees an empty list even when num_topics is large.
    topics = lda_model.get_document_topics(bag_of_words, minimum_probability=0.0)
    dominant_topic_id, _ = max(topics, key=lambda pair: pair[1])
    rows_per_topic[dominant_topic_id].append(row_position)

  # Create separate DataFrames for each topic
  return [category_df.iloc[rows].copy() for rows in rows_per_topic]

# The first part of feature selection
# Get a dictionary that has scores for all words.
# Words with a nan score will be removed.
def get_words_with_high_scores(sentences, labels):
    """Score every word by a chi-square test of word presence against label.

    For each word a contingency table is built with two rows (reports that
    contain the word / reports that do not) and one column per observed label.

    Args:
        sentences: iterable of report texts (whitespace-separated words).
        labels: iterable of hashable labels, parallel to `sentences`.

    Returns:
        dict mapping word -> chi-square statistic, ordered from highest to
        lowest absolute score. Words with a NaN score are left out.
    """
    # Count, per label, how many reports there are and how many contain each word.
    label_totals = {}
    docs_with_word = {}
    for sentence, label in zip(sentences, labels):
        label_totals[label] = label_totals.get(label, 0) + 1
        # dict.fromkeys: count each word once per report, keeping first-seen order.
        for word in dict.fromkeys(sentence.lower().split()):
            per_label = docs_with_word.setdefault(word, {})
            per_label[label] = per_label.get(label, 0) + 1

    label_order = list(label_totals)

    # Chi-Square test for each word
    word_scores = {}
    for word, per_label in docs_with_word.items():
        present = [per_label.get(label, 0) for label in label_order]
        absent = [label_totals[label] - count for label, count in zip(label_order, present)]

        if sum(absent) == 0:
            # The word is in every report: the "absent" row is all zeros, which
            # chi2_contingency rejects, and the word cannot discriminate anyway.
            score = 0.0
        else:
            score, _, _, _ = chi2_contingency([present, absent])
        word_scores[word] = score

    # Filter dictionary to remove non-numeric scores (nan value)
    filtered_scores = {k: v for k, v in word_scores.items() if isinstance(v, (int, float)) and not np.isnan(v)}
    # Sort words by their absolute scores; ties keep first-seen order
    sorted_word_scores = {k: v for k, v in sorted(filtered_scores.items(), key=lambda item: abs(item[1]), reverse=True)}
    return sorted_word_scores


# Function to filter words based on scores
# It will remove the words that are not in the score dictionary (with nan score)
# And then it will keep 50% of the words that have the highest scores
# Returns a new sentence with selected words.
def filter_words_by_score(sentence, scores_dict):
    """Keep the best-scoring half of a sentence's scored words.

    Words without an entry in `scores_dict` are discarded first. The rest are
    ranked by (score, word) in descending order and the top
    floor(n / 2) are returned, joined by single spaces.
    """
    ranked = [(scores_dict[token], token) for token in sentence.split() if token in scores_dict]
    ranked.sort(reverse=True)
    keep_count = len(ranked) // 2
    return ' '.join(token for _, token in ranked[:keep_count])


# This method will perform topic modelling and feature selection
# It returns a new df for training/fine-tuning
# df is the pre-processed dataset, dataset is 'android' or 'eclipse'
def topic_modelling_and_feature_selection(df, dataset):
  """Run categorisation, topic modelling and word filtering over a dataset.

  Args:
    df: pre-processed reports with 'all_textual_data' and 'Status' columns
      (plus whatever categorize_reports needs).
    dataset: 'android' (case-insensitive) selects the Android status list;
      any other value selects the Eclipse list.

  Returns:
    One DataFrame with all topic frames concatenated and a fresh index. A
    report's text is replaced by its filtered version only when that version
    still has more than two words.
  """
  android_statuses = ["assigned", "declined", "futurerelease", "needsinfo", "question", "released", "reviewed", "spam", "unassigned", "unreproducible", "workingasintended"]
  eclipse_statuses = ["fixed", "wontfix", "invalid", "worksforme", "not_eclipse"]
  status_list = android_statuses if dataset.lower() == 'android' else eclipse_statuses
  categories = categorize_reports(df, status_list)

  processed_topics = []
  progress = tqdm(categories, desc=f'Topic Modelling and Feature Selection for {dataset}', unit='category')
  for category_df in progress:
    for topic_df in topic_modelling(category_df):
      # Encode the topic's Status values as integers for the scorer.
      encoded_labels = LabelEncoder().fit_transform(topic_df['Status'].tolist())
      word_scores = get_words_with_high_scores(topic_df['all_textual_data'], encoded_labels)

      # Keep 50% of the words, but only overwrite when more than two remain.
      for row_label in topic_df.index:
        reduced_text = filter_words_by_score(topic_df.at[row_label, 'all_textual_data'], word_scores)
        if len(reduced_text.split()) > 2:
          topic_df.at[row_label, 'all_textual_data'] = reduced_text
      processed_topics.append(topic_df)

  return pd.concat(processed_topics, ignore_index=True)

In [None]:
import pandas as pd
import regex as re  # Using 'regex' module for pattern matching
from imblearn.over_sampling import ADASYN
from sklearn.feature_extraction.text import TfidfVectorizer

# Method for changing non-duplicate reports' labels to nondup
def label_duplicates(status):
    """Return 'duplicate' when `status` is that word in any letter case, else 'nondup'."""
    is_duplicate = status.lower() == 'duplicate'
    return 'duplicate' if is_duplicate else 'nondup'

# Function to check if text is in English
def is_english(text):
    """Return True only for strings made up entirely of ASCII characters.

    "English" here just means "contains no character outside \\x00-\\x7F";
    non-string values are reported as False.
    """
    if not isinstance(text, str):
        return False  # Anything that is not a string is rejected
    return text.isascii()

# remove rows that is not english
def remove_non_english_rows(df, text_column):
  """Return the rows whose text is a non-blank string accepted by is_english.

  Args:
    df: input DataFrame; it is not modified.
    text_column: name of the column holding the text to check.

  Returns:
    A new DataFrame with only the accepted rows and a fresh 0..n-1 index.
  """
  # The earlier version called df.dropna() and discarded the result, so it never
  # dropped anything; missing text is already rejected by the string check below.
  # A boolean mask replaces the temporary 'is_english' column, which was being
  # assigned and dropped in place on a filtered slice.
  keep = df[text_column].apply(
      lambda value: isinstance(value, str) and value.strip() != '' and is_english(value)
  ).astype(bool)

  return df[keep].reset_index(drop=True)

# Change the column names and the labels
def change_column_names_and_labels(df):
    """Rename the text/status columns and encode the labels, modifying `df` in place.

    'all_textual_data' becomes 'text' and 'Status' becomes 'label'. In 'label',
    'duplicate' is replaced by 1 and 'nondup' by 0; any other value is kept.
    The same DataFrame object is returned.
    """
    label_codes = {'duplicate': 1, 'nondup': 0}

    df.rename(columns={'all_textual_data': 'text', 'Status': 'label'}, inplace=True)

    original_labels = df['label']
    encoded_labels = original_labels.map(label_codes)
    # Values missing from label_codes come back as NaN; restore the original there.
    df['label'] = encoded_labels.fillna(original_labels)

    return df

# Oversampling
def oversample_textual_data(df, text_column, label_column, enabled=False):
  """Optionally oversample the minority class with ADASYN on TF-IDF features.

  Oversampling used to be switched off by a hard-coded `if True: return df`,
  which left the rest of the body unreachable. It is now an explicit,
  default-off switch, so existing calls still get `df` back unchanged.

  Args:
    df: DataFrame holding the text and label columns.
    text_column: name of the text column.
    label_column: name of the label column.
    enabled: when False (default) return `df` untouched.

  Returns:
    `df` itself when disabled. When enabled, a NEW DataFrame with one column per
    TF-IDF vocabulary term plus `label_column`. There is no text column, because
    the synthetic rows exist only in TF-IDF space.
    NOTE(review): that shape does not match the 'text'/'label' frames the rest of
    this notebook exports, and the enabled path has not been exercised - confirm
    before turning it on.
  """
  if not enabled:
    return df

  # Extract text and labels from the DataFrame
  texts = df[text_column]
  labels = df[label_column]

  # Convert text data to numerical representations (TF-IDF)
  vectorizer = TfidfVectorizer()
  texts_tfidf = vectorizer.fit_transform(texts)

  # Apply ADASYN to oversample the minority class
  adasyn = ADASYN(random_state=42)
  texts_resampled, labels_resampled = adasyn.fit_resample(texts_tfidf, labels)

  # The resampled matrix is sparse, so build the frame with the sparse
  # constructor; get_feature_names_out() replaces the removed get_feature_names().
  oversampled_df = pd.DataFrame.sparse.from_spmatrix(
      texts_resampled, columns=vectorizer.get_feature_names_out())
  # np.asarray avoids index alignment if the labels come back as a Series.
  oversampled_df[label_column] = np.asarray(labels_resampled)

  return oversampled_df


In [None]:
"""
  Android dataset (train and test set) ready for use
  (to add additional data, drop extra columns, then they can be used
   for normal BERT model training)
   From Frank: P.S. Already exported two CSV files ready to use. Ask me (Frank) to share them if you beat me to it
"""
# train
# NOTE: df_train is the same object as android_train, so the new column is added to it too.
df_train = android_train
# Join title and description with a space; without it the last word of the title
# and the first word of the description fuse into a single token.
df_train['all_textual_data'] = df_train['Title'].astype(str) + " " + df_train['Description'].astype(str)
df_train['all_textual_data'] = df_train['all_textual_data'].apply(preprocessing_text)
processed_df_train_android = topic_modelling_and_feature_selection(df_train, 'android')
# Apply the function to update the 'status' column
processed_df_train_android['Status'] = processed_df_train_android['Status'].apply(lambda x: label_duplicates(x))
# test
# NOTE(review): feature selection on the test split uses the test labels - confirm this is intended.
df_test = android_test
df_test['all_textual_data'] = df_test['Title'].astype(str) + " " + df_test['Description'].astype(str)
df_test['all_textual_data'] = df_test['all_textual_data'].apply(preprocessing_text)
processed_df_test_android = topic_modelling_and_feature_selection(df_test, 'android')
# Apply the function to update the 'status' column
processed_df_test_android['Status'] = processed_df_test_android['Status'].apply(lambda x: label_duplicates(x))

# Keep only the text and the label for the exported frames.
android_train_processed = processed_df_train_android[['all_textual_data', 'Status']].copy()
android_test_processed = processed_df_test_android[['all_textual_data', 'Status']].copy()

android_train_processed = remove_non_english_rows(android_train_processed, 'all_textual_data')
android_test_processed = remove_non_english_rows(android_test_processed, 'all_textual_data')
# Assign the result back (it was previously stored in a stray variable `c` and only
# worked because the helper also modifies its argument in place).
android_train_processed = change_column_names_and_labels(android_train_processed)
android_test_processed = change_column_names_and_labels(android_test_processed)

#oversample training set
android_train_processed = oversample_textual_data(android_train_processed, 'text', 'label')

android_train_processed.to_csv("android_train.csv")
android_test_processed.to_csv("android_test.csv")

In [None]:
"""
  From Frank: Timed out when running this block. Will try it tomorrow

  Eclipse dataset (train and test set) ready for use
  (to add additional data, drop extra columns, then they can be used
   for normal BERT model training)
"""
# Eclipse Dataset
# NOTE: this cell rebinds eclipse_train / eclipse_test to the processed frames, so it
# cannot be re-run without first re-running the train/test split cell.
# train
df_train_eclipse = eclipse_train
# Join title and description with a space; without it the last word of the title
# and the first word of the description fuse into a single token.
df_train_eclipse['all_textual_data'] = df_train_eclipse['Title'].astype(str) + " " + df_train_eclipse['Description'].astype(str)
df_train_eclipse['all_textual_data'] = df_train_eclipse['all_textual_data'].apply(preprocessing_text)
processed_df_train_eclipse = topic_modelling_and_feature_selection(df_train_eclipse, 'eclipse')
# Apply the function to update the 'status' column
processed_df_train_eclipse['Status'] = processed_df_train_eclipse['Status'].apply(lambda x: label_duplicates(x))
eclipse_train = processed_df_train_eclipse[['all_textual_data', 'Status']].copy()

eclipse_train = remove_non_english_rows(eclipse_train, 'all_textual_data')
eclipse_train = change_column_names_and_labels(eclipse_train)
#oversample training set
eclipse_train = oversample_textual_data(eclipse_train, 'text', 'label')
eclipse_train.to_csv("eclipse_train.csv")
# test
# NOTE(review): feature selection on the test split uses the test labels - confirm this is intended.
df_test_eclipse = eclipse_test
df_test_eclipse['all_textual_data'] = df_test_eclipse['Title'].astype(str) + " " + df_test_eclipse['Description'].astype(str)
df_test_eclipse['all_textual_data'] = df_test_eclipse['all_textual_data'].apply(preprocessing_text)
processed_df_test_eclipse = topic_modelling_and_feature_selection(df_test_eclipse, 'eclipse')
# Apply the function to update the 'status' column
processed_df_test_eclipse['Status'] = processed_df_test_eclipse['Status'].apply(lambda x: label_duplicates(x))
eclipse_test = processed_df_test_eclipse[['all_textual_data', 'Status']].copy()

eclipse_test = remove_non_english_rows(eclipse_test, 'all_textual_data')
eclipse_test = change_column_names_and_labels(eclipse_test)
eclipse_test.to_csv("eclipse_test.csv")

# Adding Additional Data

Append other report fields (including numerical data if possible) to the textual data,

and prepare the final dataframes so they are ready to use.

In [None]:
# TODO: add the code
# TODO: If we're adding oversampling, we can do it here.

# For Android
features_to_add = ['Type']

# Only get the textual data + labels and save them as the final dataframes to use for training.
# The extra features are appended to these copies rather than to processed_df_*_android,
# so re-running this cell does not append the same features a second time.
android_train_additional_data = processed_df_train_android[['all_textual_data', 'Status']].copy()
android_test_additional_data = processed_df_test_android[['all_textual_data', 'Status']].copy()

for feature_to_add in features_to_add:
  android_train_additional_data['all_textual_data'] = android_train_additional_data['all_textual_data'] + " " + processed_df_train_android[feature_to_add].astype(str)
  android_test_additional_data['all_textual_data'] = android_test_additional_data['all_textual_data'] + " " + processed_df_test_android[feature_to_add].astype(str)

# Apply the function to update the 'status' column
android_train_additional_data['Status'] = android_train_additional_data['Status'].apply(lambda x: label_duplicates(x))
android_test_additional_data['Status'] = android_test_additional_data['Status'].apply(lambda x: label_duplicates(x))

android_train_additional_data = remove_non_english_rows(android_train_additional_data, 'all_textual_data')
android_test_additional_data = remove_non_english_rows(android_test_additional_data, 'all_textual_data')

android_train_additional_data = change_column_names_and_labels(android_train_additional_data)
android_test_additional_data = change_column_names_and_labels(android_test_additional_data)

#oversample training set
android_train_additional_data = oversample_textual_data(android_train_additional_data, 'text', 'label')

android_train_additional_data.to_csv("android_train_additional_data.csv")
android_test_additional_data.to_csv("android_test_additional_data.csv")


In [None]:
# For Eclipse
features_to_add = ['Product', 'Component', 'Type']

# Build the final frames from copies and append the extra features to those copies,
# not to processed_df_*_eclipse, so re-running this cell does not append them twice.
eclipse_train_additional_data = processed_df_train_eclipse[['all_textual_data', 'Status']].copy()
eclipse_test_additional_data = processed_df_test_eclipse[['all_textual_data', 'Status']].copy()

for feature_to_add in features_to_add:
  eclipse_train_additional_data['all_textual_data'] = eclipse_train_additional_data['all_textual_data'] + " " + processed_df_train_eclipse[feature_to_add].astype(str)
  eclipse_test_additional_data['all_textual_data'] = eclipse_test_additional_data['all_textual_data'] + " " + processed_df_test_eclipse[feature_to_add].astype(str)

# Apply the function to update the 'status' column
eclipse_train_additional_data['Status'] = eclipse_train_additional_data['Status'].apply(lambda x: label_duplicates(x))
eclipse_test_additional_data['Status'] = eclipse_test_additional_data['Status'].apply(lambda x: label_duplicates(x))

eclipse_train_additional_data = remove_non_english_rows(eclipse_train_additional_data, 'all_textual_data')
eclipse_test_additional_data = remove_non_english_rows(eclipse_test_additional_data, 'all_textual_data')

eclipse_train_additional_data = change_column_names_and_labels(eclipse_train_additional_data)
eclipse_test_additional_data = change_column_names_and_labels(eclipse_test_additional_data)

#oversample training set
eclipse_train_additional_data = oversample_textual_data(eclipse_train_additional_data, 'text', 'label')

eclipse_train_additional_data.to_csv("eclipse_train_additional_data.csv")
eclipse_test_additional_data.to_csv("eclipse_test_additional_data.csv")