## 1. Preprocess the dataset: preprocess_dataset.py

In [6]:
import pandas as pd

# Load the CSV file
data = pd.read_csv("file.csv")

# Extract relevant columns
bug_data = data[["LongName", "Number of Bugs"]]

# Handle missing values if needed
bug_data = bug_data.dropna()

# Normalize the data if needed
# ...

# Print the preprocessed dataset
print(bug_data.head())

                                            LongName  Number of Bugs
0  tool/src/org/antlr/v4/semantics/BasicSemanticC...               1
1  tool/src/org/antlr/v4/analysis/LeftRecursiveRu...               1
2  tool/src/org/antlr/v4/misc/EscapeSequenceParsi...               0
3    tool/test/org/antlr/v4/test/TestParserExec.java               2
4  runtime/Java/src/org/antlr/v4/runtime/Parser.java               0


## 2. Feature engineering: feature_engineering.py

In [9]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenize the 'LongName' column
bug_data['Tokens'] = bug_data['LongName'].apply(word_tokenize)

# Extract additional features if needed
# ...

# Print the updated dataset
print(bug_data.head())

                                            LongName  Number of Bugs  \
0  tool/src/org/antlr/v4/semantics/BasicSemanticC...               1   
1  tool/src/org/antlr/v4/analysis/LeftRecursiveRu...               1   
2  tool/src/org/antlr/v4/misc/EscapeSequenceParsi...               0   
3    tool/test/org/antlr/v4/test/TestParserExec.java               2   
4  runtime/Java/src/org/antlr/v4/runtime/Parser.java               0   

                                              Tokens  
0  [tool/src/org/antlr/v4/semantics/BasicSemantic...  
1  [tool/src/org/antlr/v4/analysis/LeftRecursiveR...  
2  [tool/src/org/antlr/v4/misc/EscapeSequencePars...  
3  [tool/test/org/antlr/v4/test/TestParserExec.java]  
4  [runtime/Java/src/org/antlr/v4/runtime/Parser....  


## 3. Prepare the training data: prepare_training_data.py


In [10]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(bug_data, test_size=0.2, random_state=42)

# Prepare input and output data
train_input = train_data['Tokens'].tolist()
train_output = train_data['Number of Bugs'].tolist()

val_input = val_data['Tokens'].tolist()
val_output = val_data['Number of Bugs'].tolist()

# Print the training and validation data sizes
print("Training data size:", len(train_input))
print("Validation data size:", len(val_input))

Training data size: 56070
Validation data size: 14018


## 4. Train the NLP model: train_model.py

In [12]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer


# Tokenize the input sequences and pad them to have the same length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_input)

train_sequences = tokenizer.texts_to_sequences(train_input)
val_sequences = tokenizer.texts_to_sequences(val_input)

max_seq_length = max(len(seq) for seq in train_sequences)
train_input = pad_sequences(train_sequences, maxlen=max_seq_length)
val_input = pad_sequences(val_sequences, maxlen=max_seq_length)

# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_seq_length))
model.add(LSTM(units=64))
model.add(Dense(units=1, activation='linear'))

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(train_input, train_output, validation_data=(val_input, val_output), epochs=10, batch_size=32)

2023-06-13 01:11:15.351206: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-13 01:11:15.353968: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

## 5. Evaluate the model: evaluate_model.py


In [None]:
# Evaluate the model on the validation set
loss = model.evaluate(val_input, val_output)
print("Validation loss:", loss)

## 6. Generate bug reports: generate_bug_reports.py


In [13]:
# Provide a new input for bug report generation
new_input = ["Example input for bug report generation"]

# Preprocess the new input
new_input_sequence = tokenizer.texts_to_sequences(new_input)
new_input_padded = pad_sequences(new_input_sequence, maxlen=max_seq_length)

# Use the trained model to generate bug reports
predicted_output = model.predict(new_input_padded)

# Print the predicted bug reports
print("Predicted bug reports:", predicted_output)

Predicted bug reports: [[0.00206707]]
