In [1]:
import re
from typing import List, Dict
from collections import Counter
import spacy
import nltk
from nltk.corpus import stopwords

# Ensure necessary resources are available.
# Fetch the NLTK 'stopwords' corpus (quiet mode) so that
# stopwords.words('english') in TextPreprocessor.__init__ can read it.
nltk.download('stopwords', quiet=True)
# Module-level spaCy pipeline shared by TextPreprocessor.lemmatize; the
# "parser" and "ner" components are disabled at load time.
# NOTE(review): presumably disabled because only lemmas are consumed - confirm.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

class TextPreprocessor:
    """
    Normalizes raw text into lemma tokens and maintains a word <-> index vocabulary.

    Attributes:
        stop_words: English stop words (from NLTK) dropped during lemmatization.
        vocab: Known lemmas, most frequent first (filled by build_vocabulary).
        word_to_idx: Maps each lemma in ``vocab`` to its position.
        idx_to_word: Inverse of ``word_to_idx``.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.vocab: List[str] = []
        self.word_to_idx: Dict[str, int] = {}
        self.idx_to_word: Dict[int, str] = {}

    def clean_text(self, text: str) -> str:
        """
        Lower-cases and strips the text, then deletes every character that is
        neither a word character nor whitespace.
        """
        text = text.lower().strip()
        # Punctuation is removed, not replaced, so "a - b" becomes "a  b"
        # (note the doubled space that lemmatize() has to cope with).
        text = re.sub(r'[^\w\s]', '', text)
        return text

    def lemmatize(self, text: str) -> List[str]:
        """
        Runs the module-level spaCy pipeline and returns the lemma of every
        token that is neither whitespace nor a stop word.
        """
        doc = nlp(text)
        # spaCy emits standalone tokens for runs of extra spaces and newlines;
        # without the is_space filter those would end up as vocabulary entries
        # and TF-IDF features.
        return [
            token.lemma_
            for token in doc
            if not token.is_space and token.text not in self.stop_words
        ]

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenizes and lemmatizes the cleaned text into words.
        """
        text = self.clean_text(text)
        return self.lemmatize(text)

    def build_vocabulary(self, texts: List[str]):
        """
        Builds vocabulary from a list of input texts using frequency-based ordering.

        Replaces ``vocab``, ``word_to_idx`` and ``idx_to_word``; index 0 is the
        most frequent lemma across all texts.
        """
        word_freq = Counter()
        for text in texts:
            word_freq.update(self.tokenize(text))

        self.vocab = [word for word, _ in word_freq.most_common()]
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

In [2]:
import math
import numpy as np
from typing import List, Dict, Tuple
from collections import Counter

class TFIDFVectorizer:
    """
    TF-IDF featurizer that delegates tokenization and vocabulary handling to a
    TextPreprocessor and stores one smoothed IDF weight per seen term.
    """

    def __init__(self, preprocessor: TextPreprocessor):
        """
        Stores the given preprocessor and starts with an empty IDF table.
        """
        self.idf: Dict[str, float] = {}
        self.preprocessor = preprocessor

    def fit(self, texts: List[str]):
        """
        Rebuilds the preprocessor's vocabulary from `texts` and computes a
        smoothed IDF weight, log((N + 1) / (df + 1)) + 1, for every term.
        """
        self.preprocessor.build_vocabulary(texts)

        # Count, per term, the number of documents that contain it at least once.
        document_frequency = Counter()
        for document in texts:
            distinct_terms = set(self.preprocessor.tokenize(document))
            document_frequency.update(distinct_terms)

        total_documents = len(texts)
        weights = {}
        for term, containing_docs in document_frequency.items():
            weights[term] = math.log((total_documents + 1) / (containing_docs + 1)) + 1
        self.idf = weights

    def transform(self, text: str) -> np.ndarray:
        """
        Returns the TF-IDF vector of one document; positions follow the
        preprocessor's vocabulary order and unknown terms are ignored.
        """
        terms = self.preprocessor.tokenize(text)
        occurrences = Counter(terms)
        features = np.zeros(len(self.preprocessor.vocab))

        for term, times_seen in occurrences.items():
            position = self.preprocessor.word_to_idx.get(term)
            if position is None:
                # Term was never seen during fit: it has no slot in the vector.
                continue
            term_frequency = times_seen / len(terms)
            features[position] = term_frequency * self.idf.get(term, 0)

        return features

    def get_vocab_size(self) -> int:
        """
        Number of entries in the preprocessor's vocabulary.
        """
        vocabulary = self.preprocessor.vocab
        return len(vocabulary)


class NeuralNetwork:
    """
    Feed-forward network with one hidden layer and sigmoid activations on both
    layers, trained one sample at a time by gradient descent on squared error.

    Weight matrices are stored as (outputs, inputs), so a forward pass is
    ``weights @ vector + bias``.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """
        Initializes the neural network using Xavier initialization.

        Args:
            input_size: Length of each input vector.
            hidden_size: Number of hidden units.
            output_size: Number of output units.

        Raises:
            ValueError: If any of the sizes is smaller than 1.
        """
        for label, size in (("input_size", input_size),
                            ("hidden_size", hidden_size),
                            ("output_size", output_size)):
            if size < 1:
                raise ValueError(f"{label} must be a positive integer, got {size}")

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Xavier/Glorot normal initialization: std = sqrt(2 / (fan_in + fan_out)).
        # (Scaling by sqrt(2 / fan_in) alone is He initialization, which is
        # meant for ReLU layers rather than the sigmoids used here.)
        hidden_std = np.sqrt(2.0 / (input_size + hidden_size))
        output_std = np.sqrt(2.0 / (hidden_size + output_size))
        self.hidden_weights = np.random.randn(hidden_size, input_size) * hidden_std
        self.output_weights = np.random.randn(output_size, hidden_size) * output_std
        self.hidden_bias = np.zeros(hidden_size)
        self.output_bias = np.zeros(output_size)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Element-wise logistic function; the input is clipped to [-500, 500]."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))  # Prevent overflow

    def sigmoid_derivative(self, x: np.ndarray) -> np.ndarray:
        """
        Derivative of the sigmoid expressed through its output.

        `x` must already be a sigmoid activation s, not the pre-activation
        value; the result is s * (1 - s).
        """
        return x * (1 - x)

    def forward(self, inputs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Runs one input vector through the network.

        Returns:
            (hidden_outputs, final_outputs): the activations of the hidden and
            the output layer; both are needed by backward().
        """
        hidden_sum = np.dot(self.hidden_weights, inputs) + self.hidden_bias
        hidden_outputs = self.sigmoid(hidden_sum)
        output_sum = np.dot(self.output_weights, hidden_outputs) + self.output_bias
        final_outputs = self.sigmoid(output_sum)
        return hidden_outputs, final_outputs

    def backward(self, inputs: np.ndarray, hidden_outputs: np.ndarray,
                 final_outputs: np.ndarray, targets: np.ndarray, learning_rate: float = 0.1):
        """
        Performs one gradient step for a single sample, updating weights and
        biases in place.

        Args:
            inputs: The vector that was passed to forward().
            hidden_outputs: Hidden activations returned by forward().
            final_outputs: Output activations returned by forward().
            targets: Desired output vector, same length as final_outputs.
            learning_rate: Step size applied to every update.
        """
        output_errors = targets - final_outputs
        output_deltas = output_errors * self.sigmoid_derivative(final_outputs)
        # Back-propagate through the output weights *before* they are updated.
        hidden_errors = np.dot(self.output_weights.T, output_deltas)
        hidden_deltas = hidden_errors * self.sigmoid_derivative(hidden_outputs)

        # Errors are (target - output), so adding the scaled deltas descends
        # the squared-error surface.
        self.output_weights += learning_rate * np.outer(output_deltas, hidden_outputs)
        self.output_bias += learning_rate * output_deltas
        self.hidden_weights += learning_rate * np.outer(hidden_deltas, inputs)
        self.hidden_bias += learning_rate * hidden_deltas


class BugResolver:
    """
    Suggests a known solution for a bug description by classifying its TF-IDF
    vector with a small neural network (one output unit per distinct solution).
    """

    # Upper bound on the hidden-layer width chosen in prepare_data.
    MAX_HIDDEN_UNITS = 150

    def __init__(self, preprocessor: TextPreprocessor):
        """
        Initializes the BugResolver with:
        - A TF-IDF vectorizer for text processing.
        - A neural network.
        - A mapping between numeric solution labels and actual solutions.
        """
        self.vectorizer = TFIDFVectorizer(preprocessor)
        self.network = None  # Neural network will be initialized in prepare_data
        self.solution_mapping: Dict[int, str] = {}  # Maps numeric indices to solution texts

    def prepare_data(self, bug_reports: List[Dict[str, str]]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prepares training data by converting bug descriptions into numerical vectors
        and mapping solutions to one-hot encoded labels.

        Refits the vectorizer, rebuilds ``solution_mapping`` and (re)creates the
        network whenever its dimensions no longer match the data.

        Args:
            bug_reports: Dicts with a "description" and a "solution" entry.

        Returns:
            (X, y): X has one TF-IDF row per report, y the matching one-hot rows.

        Raises:
            ValueError: If `bug_reports` is empty or no description yields a token.
        """
        if not bug_reports:
            raise ValueError("bug_reports must contain at least one report")

        descriptions = [report["description"] for report in bug_reports]
        solutions = [report["solution"] for report in bug_reports]

        self.vectorizer.fit(descriptions)

        input_size = self.vectorizer.get_vocab_size()
        if input_size == 0:
            raise ValueError("No usable tokens found in the bug descriptions")

        # dict.fromkeys de-duplicates while keeping first-seen order, so label
        # indices are identical on every run (set() order varies with string
        # hash randomization).
        unique_solutions = list(dict.fromkeys(solutions))
        solution_to_idx = {sol: idx for idx, sol in enumerate(unique_solutions)}
        self.solution_mapping = dict(enumerate(unique_solutions))
        num_solutions = len(unique_solutions)

        # The vocabulary and label set were just rebuilt; a network left over
        # from an earlier call is only reusable if its dimensions still match.
        if (self.network is None
                or self.network.input_size != input_size
                or self.network.output_size != num_solutions):
            # At least one hidden unit, even for a one-word vocabulary.
            hidden_size = max(1, min(self.MAX_HIDDEN_UNITS, input_size // 2))
            self.network = NeuralNetwork(input_size, hidden_size, num_solutions)

        X = np.array([self.vectorizer.transform(desc) for desc in descriptions])
        y = np.zeros((len(solutions), num_solutions))

        for row, sol in enumerate(solutions):
            y[row, solution_to_idx[sol]] = 1.0

        return X, y

    def train(self, bug_reports: List[Dict[str, str]], epochs: int = 100,
              learning_rate: float = 0.1):
        """
        Trains the neural network using the provided bug reports.

        Args:
            bug_reports: Dicts with a "description" and a "solution" entry.
            epochs: Number of full passes over the data.
            learning_rate: Step size forwarded to NeuralNetwork.backward.

        Prints the mean squared error per sample every 10th epoch.
        """
        X, y = self.prepare_data(bug_reports)

        for epoch in range(epochs):
            total_error = 0

            for features, target in zip(X, y):
                hidden_outputs, final_outputs = self.network.forward(features)
                self.network.backward(features, hidden_outputs, final_outputs,
                                      target, learning_rate)
                # Error is measured on the outputs from before this sample's update.
                total_error += np.sum((target - final_outputs) ** 2)

            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Error: {total_error/len(X):.4f}")

    def predict(self, bug_description: str) -> str:
        """
        Predicts the solution for a given bug description.

        Raises:
            RuntimeError: If called before the resolver has been trained.
        """
        if self.network is None:
            raise RuntimeError("BugResolver.predict() called before train()")

        input_vector = self.vectorizer.transform(bug_description)
        _, outputs = self.network.forward(input_vector)
        # Plain int so the lookup key has the same type as the mapping's keys.
        predicted_idx = int(np.argmax(outputs))
        return self.solution_mapping[predicted_idx]

In [7]:
## Do not change anything from here


# Mount Google Drive at /content/drive; needs the google.colab package.
# NOTE(review): presumably only runnable inside Google Colab - confirm.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Path of the Excel dataset on the mounted drive; read as a module-level
# global by load_data_from_excel().
file_path = "/content/drive/My Drive/base-nlp-model-dataset.xlsx"

def load_data_from_excel():
    """Loads data from an Excel file stored in Google Drive.

    Reads the workbook at the module-level ``file_path`` with the openpyxl
    engine, keeps the "description" and "input" columns, drops rows where
    either is missing and prints the number of remaining reports.

    Returns:
        A list of dicts, each with a "description" and a "solution" key
        (the latter holding the value of the "input" column).

    Raises:
        ValueError: If the sheet lacks a "description" or "input" column.
    """
    df = pd.read_excel(file_path, engine="openpyxl")

    # Check column names
    expected_columns = {"description", "input"}
    if not expected_columns.issubset(df.columns):
        raise ValueError(f"Expected columns {expected_columns}, but got {df.columns}")

    # Convert DataFrame to a list of dictionaries
    bug_reports = df[["description", "input"]].dropna().to_dict(orient="records")

    # Rename keys for consistency
    for report in bug_reports:
        # Pops and re-inserts "description" under the same key: the value is unchanged.
        report["description"] = report.pop("description")
        # The "input" column is exposed to the rest of the notebook as "solution".
        report["solution"] = report.pop("input")

    print(f"Total bug reports loaded: {len(bug_reports)}")
    return bug_reports

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
def main():
    """
    End-to-end demo: loads the bug reports, trains a BugResolver on them for
    50 epochs and prints the solution predicted for one sample description.
    """

    # Load the bug reports from the Excel dataset
    full_dataset = load_data_from_excel()

    # Initialize the BugResolver (TF-IDF + Neural Network)
    resolver = BugResolver(TextPreprocessor())

    # Train the resolver using the loaded dataset
    # NOTE(review): the recorded run below shows the mean error staying near
    # 0.99 through all 50 epochs, which suggests the model barely fits the
    # training data - treat the printed prediction with caution.
    resolver.train(full_dataset, epochs=50)

    # Sample bug description to test prediction
    bug = "Description of problem: When I am changing the special permissions (suid, sgid and sticky) in Fedora with the numeric method (chmod 2755 for example), is possible add the special permissions with numbers, but isn't possible to clear permissions. for example: is possible do chmod 4755 to add suid but if I use chmod 0755 the permission suid isn't remove. In other form, with chmod u-s the permission clear right. The problem is that using the numeric method only is possible add permission specials"

    # Predict the most relevant solution using the trained model
    solution = resolver.predict(bug)

    # Print the predicted solution
    print(solution)

# Run the demo only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()

Total bug reports loaded: 141
Epoch 10/50, Error: 0.9964
Epoch 20/50, Error: 0.9935
Epoch 30/50, Error: 0.9930
Epoch 40/50, Error: 0.9928
Epoch 50/50, Error: 0.9927
$ uname -i
