Install NLTK

In [None]:
!pip install nltk

Take Datasets

positive_samples.csv with
Project_id,BIC_Hash,BIC_Message,BIC_Author,BR_Title,BR_Description,BFX_Hash,BFX_Message,BFX_Author


and Create

bug_reports.csv  that contains columns:
bug_report, related_commit_message, commit_hash

all_commits.csv that contains columns:
commit_message, commit_hash

Generation of Negative Pairs and Labeled Data

In [12]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split

# Load positive_samples.csv
positive_samples = pd.read_csv('positive_samples.csv')

# Splitting data for each project
unique_projects = positive_samples['Project_id'].unique()
train_frames = []
test_frames = []
all_commits = pd.DataFrame()

for filename in glob.glob('Project All Commits CSVs/*.csv'):
    project_df = pd.read_csv(filename, usecols=['Commit Hash', 'Commit Message'])
    all_commits = pd.concat([all_commits, project_df])

for project_id in unique_projects:
    project_data = positive_samples[positive_samples['Project_id'] == project_id]
    train, test = train_test_split(project_data, test_size=0.2, random_state=42)
    train_frames.append(train)
    test_frames.append(test)

# Combine split data into training and testing datasets
train_data = pd.concat(train_frames)
test_data = pd.concat(test_frames)

# Create bug_reports_train.csv and bug_reports_test.csv
bug_reports_train = train_data[['BR_Title', 'BR_Description', 'BIC_Message', 'BIC_Hash']]
bug_reports_train['bug_report'] = bug_reports_train['BR_Title'] + ' ' + bug_reports_train['BR_Description']
bug_reports_train = bug_reports_train[['bug_report', 'BIC_Message', 'BIC_Hash']]
bug_reports_train.columns = ['bug_report', 'related_commit_message', 'commit_hash']
bug_reports_train.to_csv('bug_reports_train.csv', index=False)

bug_reports_test = test_data[['BR_Title', 'BR_Description', 'BIC_Message', 'BIC_Hash', 'BFX_Author']]
bug_reports_test['bug_report'] = bug_reports_test['BR_Title'] + ' ' + bug_reports_test['BR_Description']
bug_reports_test = bug_reports_test[['bug_report', 'BIC_Message', 'BIC_Hash', 'BFX_Author']]
bug_reports_test.columns = ['bug_report', 'related_commit_message', 'bug_inducing_commit_hash', 'bug_fixing_developer']
bug_reports_test.to_csv('bug_reports_test.csv', index=False)

# Filter all commits for training and testing
all_commits = all_commits.drop_duplicates(subset=['Commit Hash'])
all_commits.columns = ['commit_hash', 'commit_message']
training_commit_hashes = set(train_data['BIC_Hash'])
testing_commit_hashes = set(test_data['BIC_Hash'])

# Ensure no overlap in commit hashes between training and testing
all_commits_train = all_commits[~all_commits['commit_hash'].isin(testing_commit_hashes)]
all_commits_test = all_commits[all_commits['commit_hash'].isin(testing_commit_hashes)]

# Save all_commits_train.csv and all_commits_test.csv
all_commits_train.to_csv('all_commits_train.csv', index=False)
all_commits_test.to_csv('all_commits_test.csv', index=False)

# Reporting the ratio
print(f"Training data ratio (bug reports): {len(bug_reports_train) / len(positive_samples):.2f}")
print(f"Testing data ratio (bug reports): {len(bug_reports_test) / len(positive_samples):.2f}")
print(f"Training data ratio (all commits): {len(all_commits_train) / len(all_commits):.2f}")
print(f"Testing data ratio (all commits): {len(all_commits_test) / len(all_commits):.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bug_reports_train['bug_report'] = bug_reports_train['BR_Title'] + ' ' + bug_reports_train['BR_Description']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bug_reports_test['bug_report'] = bug_reports_test['BR_Title'] + ' ' + bug_reports_test['BR_Description']


Training data ratio (bug reports): 0.80
Testing data ratio (bug reports): 0.20
Training data ratio (all commits): 0.96
Testing data ratio (all commits): 0.04


In [13]:
import pandas as pd

# Load your datasets
bug_reports_df = pd.read_csv('bug_reports_train.csv')  # Contains columns: bug_report, related_commit_message, commit_hash
all_commits_df = pd.read_csv('all_commits_train.csv')  # Contains columns: commit_message, commit_hash

# Creating Positive Pairs
positive_pairs = bug_reports_df[['bug_report', 'related_commit_message', 'commit_hash']]
positive_pairs['is_related'] = 1

# Creating Negative Pairs
negative_pairs_list = []

for _, row in bug_reports_df.iterrows():
    bug_report = row['bug_report']
    related_commit_hash = row['commit_hash']

    # Randomly select commit messages which are not related
    unrelated_commits = all_commits_df[all_commits_df['commit_hash'] != related_commit_hash].sample(n=1)

    for _, unrelated_row in unrelated_commits.iterrows():
        negative_pairs_list.append([bug_report, unrelated_row['commit_message'], unrelated_row['commit_hash'], 0])

negative_pairs = pd.DataFrame(negative_pairs_list, columns=['bug_report', 'commit_message', 'commit_hash', 'is_related'])

# Combining Positive and Negative Pairs
combined_df = pd.concat([positive_pairs, negative_pairs]).reset_index(drop=True)

# Shuffle the dataset
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

# Save to CSV
combined_df.to_csv('combined_pairs_with_hashes.csv', index=False)


Preprocessing Functions

In [14]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Convert to lowercase
    text = str(text).lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special chars and numbers
    text = re.sub(r'\W+|\d+', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and stemming
    filtered_words = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return " ".join(filtered_words)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yahyaelnouby/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yahyaelnouby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Siamese Network

In [17]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Model, optimizers

# Read your combined dataset with commit hashes
combined_df = pd.read_csv('combined_pairs_with_hashes.csv')


combined_df['bug_report'] = combined_df['bug_report'].apply(preprocess_text)
combined_df['commit_message'] = combined_df['commit_message'].apply(preprocess_text)

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([combined_df['bug_report'], combined_df['commit_message']]))

max_len = 295  # Adjust based on your data
seq_bug_report = pad_sequences(tokenizer.texts_to_sequences(combined_df['bug_report']), maxlen=max_len)
seq_commit_message = pad_sequences(tokenizer.texts_to_sequences(combined_df['commit_message']), maxlen=max_len)

labels = combined_df['is_related'].values

# Siamese Network Architecture
def create_model():
    input = layers.Input(shape=(max_len,))
    x = layers.Embedding(len(tokenizer.word_index) + 1, 128)(input)
    x = layers.LSTM(64)(x)
    return Model(input, x)

bug_report_model = create_model()
commit_message_model = create_model()

input_bug_report = layers.Input(shape=(max_len,))
input_commit_message = layers.Input(shape=(max_len,))

encoded_bug_report = bug_report_model(input_bug_report)
encoded_commit_message = commit_message_model(input_commit_message)

distance = layers.Lambda(lambda x: tf.norm(x[0] - x[1], axis=1))([encoded_bug_report, encoded_commit_message])
model = Model([input_bug_report, input_commit_message], distance)

# Contrastive Loss Function
def contrastive_loss(y_true, y_pred):
    margin = 1
    y_true = tf.cast(y_true, tf.float32)
    square_pred = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(y_true * square_pred + (1 - y_true) * margin_square)

# Compile model
model.compile(optimizer=optimizers.Adam(), loss=contrastive_loss)

# Train model
model.fit([seq_bug_report, seq_commit_message], labels, epochs=10, batch_size=32)

def find_related_commit(all_commits_csv, bug_report_text, k=5):
    # Load all commits
    all_commits_df = pd.read_csv(all_commits_csv)

    # Preprocess the bug report text
    processed_bug_report = preprocess_text(bug_report_text)
    bug_seq = pad_sequences(tokenizer.texts_to_sequences([processed_bug_report]), maxlen=max_len)

    # Preprocess and pad all commit messages
    all_commits_df['commit_message'] = all_commits_df['commit_message'].apply(preprocess_text)
    all_commit_seqs = pad_sequences(tokenizer.texts_to_sequences(all_commits_df['commit_message']), maxlen=max_len)

    # Calculate similarities
    similarities = model.predict([np.tile(bug_seq, (len(all_commit_seqs), 1)), all_commit_seqs])

    # Find top 'k' similar commit hashes and messages
    top_k_indices = np.argsort(similarities, axis=0)[:k].flatten()
    top_k_hashes = all_commits_df.iloc[top_k_indices]['commit_hash'].values
    top_k_messages = all_commits_df.iloc[top_k_indices]['commit_message'].values

    return list(zip(top_k_hashes, top_k_messages))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
model.save("siamese.h5")

  saving_api.save_model(


In [None]:
import csv
with open("bug_reports_test.csv", 'r') as file:
    # Create a CSV reader
    csv_reader = csv.DictReader(file)

    
    # Iterate through each row in the CSV file
    k = 0
    recall_at_10 = 0
    for row in csv_reader:
      bug_report = row["bug_report"]
      top_k_commits = find_related_commit('all_commits_test.csv', bug_report, k=10)
      print("doğru: ", row["related_commit_message"])
      for commit_hash, commit_message in top_k_commits:
            i=0
            print("found: ", commit_message)
            if row["bug_inducing_commit_hash"] == commit_hash:
                print("found bic")
                i=1
            recall_at_10+=i
    k+=1
            
    recall_at_10 = recall_at_10/k
    print(recall_at_10)