### Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
import nlpaug.augmenter.word as naw
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from imblearn.combine import SMOTETomek
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
import os
import pickle

### Loading the Dataset

In [None]:
# Location of the bug-report CSV. When the BUG_DATASET_PATH environment variable is set it
# overrides the original author's local path, so the notebook can run on other machines
# without editing this cell; otherwise the behaviour is unchanged.
file_path = os.environ.get(
    'BUG_DATASET_PATH',
    r'C:\Users\buket\Desktop\THESISDOCUMENTS\Twoclasses_withNLP\Bug_dataset.csv',
)
data = pd.read_csv(file_path)

# Quick look at the first rows to confirm the file loaded as expected.
print(data.head())

### Priority Column Preprocessing
##### Depending on the approach, change the mapping below and keep or remove the line `data = data[data['priority'] != 'P3']` (it drops the P3 rows)

In [None]:
# Remove the rows whose priority label is not used for training.
data = data[data['priority'] != '--']  # always drop '--'
data = data[data['priority'] != 'P3']  # depending on the approach keep or erase

# Boolean filtering returns a slice of the original frame; take an explicit copy so the
# column assignments below (and in later cells) write to an independent DataFrame instead
# of triggering pandas' SettingWithCopyWarning.
data = data.copy()

# Map the raw priority labels to integer class ids.
# Depending on the approach change this mapping.
priority_mapping = {
    'P1': 0,
    'P2': 0,
    'P4': 1,
    'P5': 2,
}

# NOTE: any label that is not a key of `priority_mapping` becomes NaN here.
data['priority'] = data['priority'].map(priority_mapping)

print(data['priority'].value_counts())

unique_count = data['priority'].nunique()
print(f"Number of unique values: {unique_count}")

### Handling Missing and Categorical Values in Severity

In [None]:
print(data['severity'].value_counts(dropna=False))

In [None]:
# Raw severity distribution, missing values included.
print(data['severity'].value_counts(dropna=False))

# A row without a severity cannot be mapped, so it is discarded.
data = data.dropna(subset=['severity'])

print(data['severity'].value_counts())
print(f"Updated dataset shape: {data.shape}")

# Numeric severity scale, grouped by level (1 = critical ... 4 = minor).
# Each level lists the S-code first, followed by the textual labels that share its value.
severity_mapping = {
    # Critical
    'S1': 1, 'critical': 1, 'blocker': 1,
    # Major
    'S2': 2, 'major': 2,
    # Normal
    'S3': 3, 'normal': 3,
    # Minor
    'S4': 4, 'minor': 4, 'trivial': 4,
}

# Labels that are not keys of the mapping end up as NaN in the new column.
data['severity_mapped'] = data['severity'].map(severity_mapping)

In [None]:
print(data[['severity', 'severity_mapped']].head())

print(data['severity_mapped'].unique())
print(data['severity_mapped'].value_counts(dropna=False))

### Converting and Extracting Features from Creation Time

In [None]:
# Parse the creation timestamps into pandas datetimes.
data['creation_time'] = pd.to_datetime(data['creation_time'])

print(data['creation_time'].head())
print(data['creation_time'].dtypes)

# The most recent creation time in the dataset acts as "now" for the age calculation.
reference_date = data['creation_time'].max()
print(f"Reference date: {reference_date}")

# Age of every bug in whole days, measured back from the reference date.
elapsed = reference_date - data['creation_time']
data['bug_age'] = elapsed.dt.days
print(data[['creation_time', 'bug_age']].head())



### Handling Product Column

In [None]:
# Number of bugs reported per product.
product_counts = data['product'].value_counts()

# Products with fewer bugs than this are merged into a single 'Other' category.
threshold = 100

# Look up each row's product frequency in one vectorised pass. A missing product has no
# entry in `product_counts`; it gets a count of 0 and is grouped under 'Other' instead of
# raising a KeyError.
row_counts = data['product'].map(product_counts).fillna(0)
data['product'] = data['product'].where(row_counts >= threshold, 'Other')

# Recalculate value counts after grouping smaller categories.
updated_product_counts = data['product'].value_counts()

# Percentage of bugs grouped under "Other" (0 when no product fell below the threshold).
other_percentage = (updated_product_counts.get('Other', 0) / len(data)) * 100

# Print explicitly: a bare expression in the middle of a cell is never displayed.
print(updated_product_counts)
print(other_percentage)

unique_count = data['product'].nunique()
print(f"Number of unique values: {unique_count}")



### Handling Textual Columns
##### Not dropping the missing values in the description column here since they will be merged with the summary column

In [None]:
# Missing descriptions are replaced by an empty string so they can still be concatenated.
print(f"Missing values in 'description': {data['description'].isnull().sum()}")
data['description'] = data['description'].fillna('')
print(f"Missing values in 'description': {data['description'].isnull().sum()}")

# Merge the 'summary' and 'description' columns into one text field.
# A missing summary is treated as empty text too: string + NaN would otherwise yield NaN,
# and the regex-based cleaning applied later fails on non-string values.
data['merged_summary_description'] = data['summary'].fillna('') + " " + data['description']

# Verify the new column
print(data[['summary', 'description', 'merged_summary_description']].head())

print(f"Missing values in 'merged_summary_description': {data['merged_summary_description'].isnull().sum()}")

### Dropping Unnecessary Columns

In [13]:
# Remove the identifier together with the raw columns that earlier cells already turned
# into derived columns (merged text, bug age, mapped severity).
columns_to_drop = ['summary', 'creation_time', 'id', 'description', 'severity']
data = data.drop(columns=columns_to_drop)


In [None]:
data.columns

### Text Cleaning: Replacing Contractions and Removing Special Characters

**1. Replacing Contractions:**

In [15]:
# Define a regular expression pattern with replacements
# Stem of each supported contraction (the part before the apostrophe) -> expanded form.
_CONTRACTION_EXPANSIONS = {
    "don": "do not",
    "isn": "is not",
    "hasn": "has not",
    "doesn": "does not",
    "haven": "have not",
    "aren": "are not",
    "couldn": "could not",
    "can": "can not",
}

# Compiled once at import time instead of on every call.
# The character class accepts the acute accent used originally as well as the ASCII
# apostrophe, the typographic apostrophe and the backtick.
# re.ASCII limits case-insensitive matching to ASCII letters, so the lower-cased stem is
# always one of the dictionary keys above.
_CONTRACTION_PATTERN = re.compile(
    r"(" + "|".join(_CONTRACTION_EXPANSIONS) + r")[´'’`]t",
    flags=re.IGNORECASE | re.ASCII,
)


def replace_contractions(text):
    """
    Expand common negative contractions (e.g. "don't" -> "do not").

    Matching is case-insensitive and accepts several apostrophe variants
    (´ ' ’ `). The expansion is always written in lower case.

    Args:
        text (str): Input text.

    Returns:
        str: Text with every supported contraction replaced by its expanded form.
    """
    return _CONTRACTION_PATTERN.sub(
        lambda match: _CONTRACTION_EXPANSIONS[match.group(1).lower()],
        text,
    )

# Apply the function to the DataFrame column
data['merged_summary_description'] = data['merged_summary_description'].apply(replace_contractions)

**2. Cleaning Special Characters and Numbers:**

In [16]:

def clean_special_characters_and_numbers(text):
    """
    Strip everything except ASCII letters from a text.

    Every character that is neither an ASCII letter nor whitespace is turned
    into a space; runs of whitespace are then collapsed to a single space and
    the result is trimmed at both ends.

    Args:
        text (str): Input text.

    Returns:
        str: Text consisting only of letters separated by single spaces.
    """
    return re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z\s]', ' ', text)).strip()


def _clean_text(text):
    """
    Clean a single value, coercing it to a string first when necessary.

    Args:
        text (str): Input text; any non-string value is converted with str().

    Returns:
        str: Text with special characters and numbers removed.
    """
    as_string = text if isinstance(text, str) else str(text)
    return clean_special_characters_and_numbers(as_string)


def preprocessor(text_or_series):
    """
    Remove special characters and numbers from one text or from a whole Series.

    Args:
        text_or_series (str or pd.Series): A single text or a pandas Series of texts.

    Returns:
        str or pd.Series: The cleaned text, or a Series with every element cleaned.
    """
    # Anything that is not a Series is handled as one piece of text.
    if not isinstance(text_or_series, pd.Series):
        return _clean_text(text_or_series)

    # Series: clean element by element.
    return text_or_series.apply(_clean_text)


In [17]:
# Copy instead of aliasing: with `data_preprocessed = data` both names refer to the same
# DataFrame, so writing the cleaned text below would silently overwrite `data` as well.
data_preprocessed = data.copy()
data_preprocessed['merged_summary_description'] = preprocessor(data['merged_summary_description'])

In [None]:
data_preprocessed['merged_summary_description']

In [None]:
data_preprocessed.columns

In [None]:
print(data_preprocessed['priority'].value_counts())

### Train-Test Split

In [None]:
# Separate the target label from the predictors.
y = data_preprocessed['priority']
X = data_preprocessed.drop(columns=['priority'])

# Hold out 20% of the rows for testing; stratify on the label with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Report the shape of every split for verification.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split.shape}")


In [None]:
print(y_test.value_counts())

### Applying NLP augmentation only to train data
##### Skip (comment out) this section when running the approach without NLP augmentation

In [23]:
train_data_for_NLP = pd.concat([X_train, y_train], axis=1)

In [None]:
print(train_data_for_NLP['priority'].value_counts())

In [25]:

def augment_minority_class(df, text_column, label_column, aug_n=1):
    """
    Augment text samples of the minority class using WordNet synonyms.

    The class with the fewest rows in ``label_column`` is treated as the
    minority class. For each of its rows, ``aug_n`` new rows are created that
    are identical to the original except for ``text_column``, which holds a
    synonym-augmented version of the text.

    Args:
        df (pd.DataFrame): Input dataset with text and labels.
        text_column (str): Name of the column containing text data.
        label_column (str): Name of the column containing class labels.
        aug_n (int): Number of augmented samples to create per row.

    Returns:
        pd.DataFrame: Original dataset followed by the augmented samples,
        with a fresh 0..n-1 index.
    """
    # Identify the minority class and the rows that belong to it.
    minority_class = df[label_column].value_counts().idxmin()
    minority_class_rows = df[df[label_column] == minority_class]

    # One augmenter is reused for every row; only its `aug_max` is adjusted per text.
    synonym_aug = naw.SynonymAug(aug_src='wordnet')

    def dynamic_synonym_augmentation(sentence, n=1):
        """
        Perform synonym-based augmentation with a length-dependent word budget.

        Args:
            sentence (str): Input text to augment.
            n (int): Number of augmented samples to generate.

        Returns:
            list: List of augmented text samples.
        """
        if not sentence or not isinstance(sentence, str):
            return [sentence] * n  # Nothing to augment: repeat the value unchanged

        num_words = len(sentence.split())

        # Upper bound on the number of words that may be replaced.
        if num_words <= 3:
            aug_max = 1  # At most 1 word for short texts
        elif num_words <= 20:
            aug_max = max(1, int(0.2 * num_words))  # 20% of words for medium-length texts
        else:
            aug_max = max(1, int(0.1 * num_words))  # 10% of words for long texts

        synonym_aug.aug_max = aug_max

        augmented_sentences = synonym_aug.augment(sentence, n=n)
        # Guard against a bare string being returned for a single sample (reportedly the
        # case in older nlpaug releases when n == 1 — verify against the installed
        # version). Without this the caller would iterate over the string's characters
        # and create one row per character.
        if isinstance(augmented_sentences, str):
            augmented_sentences = [augmented_sentences]
        return augmented_sentences

    augmented_rows = []
    # iterrows() always yields exactly one Series per row, whereas `df.loc[label]` returns
    # a whole DataFrame when index labels are duplicated.
    for _, original_row in minority_class_rows.iterrows():
        augmented_versions = dynamic_synonym_augmentation(original_row[text_column], n=aug_n)

        for aug_text in augmented_versions:
            augmented_row = original_row.copy()  # Keep every other column as is
            augmented_row[text_column] = aug_text  # Replace only the text column
            augmented_rows.append(augmented_row)

    # Nothing was generated (e.g. empty input): return the data with the same fresh index
    # the concat below would produce, without concatenating an empty frame.
    if not augmented_rows:
        return df.reset_index(drop=True)

    augmented_df = pd.DataFrame(augmented_rows)

    # Combine original data with augmented data
    return pd.concat([df, augmented_df], ignore_index=True)


In [None]:
train_data_for_NLP.shape

In [27]:
Augmented_Train_Data = augment_minority_class(train_data_for_NLP, text_column='merged_summary_description', label_column='priority', aug_n=1)

In [None]:
print(Augmented_Train_Data['priority'].value_counts())

In [None]:
print(Augmented_Train_Data.isnull().sum())

In [None]:
Augmented_Train_Data.columns

In [31]:
X_Train_Augmented = Augmented_Train_Data.drop(['priority'], axis=1)
y_Train_Augmented = Augmented_Train_Data['priority']


In [None]:
X_Train_Augmented.shape

In [None]:
X_Train_Augmented['merged_summary_description']

In [None]:
y_Train_Augmented.shape

In [None]:
print(y_Train_Augmented.value_counts())

### Tokenization and Lemmatization

In [36]:
# Shared NLTK helpers used by the tokenisation / lemmatisation step.
stop_words = set(stopwords.words('english'))  # set for fast membership tests
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


In [37]:
def tokenizer_lemmetizer(text):
    """
    Tokenize a text, drop stop words and lemmatize the remaining tokens.

    Uses the module-level `tokenizer`, `stop_words` and `lemmatizer`.

    Args:
        text (str): Input text.

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    kept_tokens = (tok for tok in tokenizer.tokenize(text) if tok not in stop_words)
    return ' '.join(map(lemmatizer.lemmatize, kept_tokens))



In [None]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)


In [39]:
# Apply the same tokenisation / lemmatisation to the training and the test text.
X_Train_Augmented['merged_summary_description'] = X_Train_Augmented['merged_summary_description'].apply(tokenizer_lemmetizer)

# X_test is a slice produced by train_test_split; take an explicit copy first so the
# column assignment writes to an independent frame and pandas does not raise
# SettingWithCopyWarning.
X_test = X_test.copy()
X_test['merged_summary_description'] = X_test['merged_summary_description'].apply(tokenizer_lemmetizer)



In [40]:
# From here on the augmented data is the training set.
# X_test and y_test are used exactly as produced by the split, so they need no rebinding.
X_train, y_train = X_Train_Augmented, y_Train_Augmented

In [None]:

print(y_train.value_counts())
print(y_test.value_counts())

In [None]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

### Preprocessing Pipeline with TF-IDF 

**1. Model-Based Methods for Addressing Class Imbalance**

In [None]:
# Setup for the "model-based" imbalance experiments: no resampling step is used here;
# Random Forest and Logistic Regression are given class_weight='balanced' instead.
os.makedirs("results_tfidf_none", exist_ok=True)  # Create a folder to save results


# Features, grouped by the transformer that handles them
text_feature = 'merged_summary_description'
numeric_features = ['bug_age', 'severity_mapped']
categorical_features = ['product']

# Define transformers
# TF-IDF over unigrams and bigrams, vocabulary capped at 10,000 terms; terms that occur in
# more than 80% of the documents are ignored (max_df=0.8).
text_transformer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2), max_df=0.8)
numeric_transformer = StandardScaler()
# handle_unknown='ignore': a product seen only at prediction time does not raise an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers in a ColumnTransformer
# The text column is given as a plain string (not a list), which is what TfidfVectorizer
# needs to receive a 1-D sequence of documents.
# NOTE(review): this rebinds the name `preprocessor`, which earlier in the notebook refers
# to the text-cleaning function; re-running the cleaning cell after this one will fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ]
)
# Candidate models and their hyper-parameter grids. The 'classifier__' prefix addresses
# the pipeline step named 'classifier'.
classifiers = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'param_grid': {
            'classifier__n_estimators': [100, 300],  
            'classifier__max_depth': [None, 10],  
            'classifier__min_samples_split': [5, 10], 
            'classifier__min_samples_leaf': [1, 4],  
            'classifier__max_features': ['sqrt'],  
            'classifier__bootstrap': [True],  
            'classifier__criterion': ['gini', 'entropy']  
        }
    },
    'XGBoost': {
        # num_class=3 must match the number of distinct values in the priority mapping.
        'model': XGBClassifier(eval_metric="mlogloss", random_state=42, 
                               objective='multi:softmax', num_class=3),
        'param_grid': {
            'classifier__n_estimators': [100, 300],  
            'classifier__max_depth': [6, 15],  
            'classifier__learning_rate': [0.01, 0.1],  
            'classifier__subsample': [0.6, 0.8],  
            'classifier__colsample_bytree': [0.8, 1.0], 
            'classifier__gamma': [0, 0.1, 0.2],  
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, class_weight='balanced', 
                                    multi_class='multinomial', max_iter=1000),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],  
            'classifier__penalty': ['l2'],  
            'classifier__solver': ['lbfgs'],  
            # The grid value overrides the max_iter=1000 given to the constructor above.
            'classifier__max_iter': [500, 1000],  
        }
    }
}
# Tune every classifier with a grid search, evaluate the best pipeline on the held-out
# test set, then save both the fitted pipeline and its classification report.
for name, config in classifiers.items():
    print(f"Training and tuning {name}...")
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', config['model'])
    ])

    # The target has three classes (0, 1, 2). The plain 'f1' scorer is binary-only and
    # fails on multiclass labels, which leaves every candidate with a NaN score and makes
    # the "best" parameters arbitrary. Use the macro-averaged F1 instead, the same metric
    # the resampling experiments in this notebook use.
    grid_search = GridSearchCV(
        pipeline,
        param_grid=config['param_grid'],
        scoring='f1_macro',
        cv=3,  # 3-fold cross-validation
        verbose=2,
        n_jobs=-1
    )

    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)

    # Best parameters and evaluation on the untouched test split
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Generate classification report
    report = classification_report(y_test, y_pred, digits=3)
    print(f"\nResults for {name} :\n")
    print(report)
    print("=" * 50)

    # Save the fitted pipeline (preprocessing + classifier)
    model_filename = f"results_tfidf_none/_{name}_model.pkl"
    with open(model_filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print(f"Model saved to {model_filename}")

    # Save the classification report; no resampling is used here, so that field is blank
    report_filename = f"results_tfidf_none/_{name}_report.txt"
    with open(report_filename, 'w') as report_file:
        report_file.write(f"Resampling Technique: \n")
        report_file.write(f"Classifier: {name}\n")
        report_file.write(f"Best Parameters: {grid_search.best_params_}\n\n")
        report_file.write("Classification Report:\n")
        report_file.write(report)
    print(f"Classification report saved to {report_filename}")


**2. Resampling Techniques Applied for Addressing Class Imbalance**

In [None]:
# Setup for the resampling experiments: the classifiers below carry no class weights;
# class imbalance is handled by a resampling step inside the pipeline instead.
# Ensure output directory exists
os.makedirs("results_tfidf", exist_ok=True)

# Features, grouped by the transformer that handles them
text_feature = 'merged_summary_description'
numeric_features = ['bug_age', 'severity_mapped']
categorical_features = ['product']

# Define transformers
# TF-IDF over unigrams and bigrams, vocabulary capped at 10,000 terms; terms that occur in
# more than 80% of the documents are ignored (max_df=0.8).
text_transformer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2), max_df=0.8)
numeric_transformer = StandardScaler()
# handle_unknown='ignore': a product seen only at prediction time does not raise an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers into a ColumnTransformer
# NOTE(review): this rebinds the name `preprocessor`, which earlier in the notebook refers
# to the text-cleaning function.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
    ]
)


# Resampling strategies to compare.
# For SMOTE a dict sampling_strategy gives the desired number of samples per class after
# over-sampling.
# NOTE(review): the counts are hard-coded — presumably chosen for this dataset's class
# sizes; re-check them whenever the priority mapping or the augmentation step changes.
resamplers = {
    'SMOTE': SMOTE(sampling_strategy={0: 5000, 1: 5800, 2: 7072}, k_neighbors=5, random_state=42),
    
    # For SMOTETomek, pass the SMOTE instance as an argument (Tomek Links is applied automatically)
    'SMOTETomek': SMOTETomek(smote=SMOTE(sampling_strategy={0: 5000, 1: 5800, 2: 7072}, k_neighbors=5, random_state=42), random_state=42),
    
    # Tomek Links - 'auto' leaves the choice of classes to clean to imbalanced-learn's default
    'Tomek Links': TomekLinks(sampling_strategy='auto'),
}


# Candidate models and their hyper-parameter grids. The 'classifier__' prefix addresses
# the pipeline step named 'classifier'.
classifiers = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'classifier__n_estimators': [100, 300], 
            'classifier__max_depth': [None, 10],  
            'classifier__min_samples_split': [5, 10],  
            'classifier__min_samples_leaf': [1, 4],  
            'classifier__max_features': ['sqrt'], 
            'classifier__bootstrap': [True],  
            'classifier__criterion': ['gini', 'entropy'] 
        }
    },
    'XGBoost': {
        # num_class=3 must match the number of distinct values in the priority mapping.
        'model': XGBClassifier(random_state=42, objective='multi:softmax', num_class=3),
        'param_grid': {
            'classifier__n_estimators': [100, 300],  
            'classifier__max_depth': [6, 15],  
            'classifier__learning_rate': [0.01, 0.1], 
            'classifier__subsample': [0.6, 0.8], 
            'classifier__colsample_bytree': [0.8, 1.0], 
            'classifier__gamma': [0, 0.1, 0.2],  
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],  
            'classifier__penalty': ['l2'],  
            'classifier__solver': ['lbfgs'],  
            # The grid value overrides the max_iter=1000 given to the constructor above.
            'classifier__max_iter': [500, 1000],  
        }
    }
}

# For every (resampler, classifier) pair: tune with a grid search, evaluate the best
# pipeline on the held-out test set, then save the fitted pipeline and its report.
# Loop through resampling techniques
for resampler_name, resampler in resamplers.items():
    print(f"\nUsing Resampling Technique: {resampler_name}")
    
    # Display class distribution before resampling
    print(f"Class distribution before resampling: {Counter(y_train)}")
    
    # Loop through classifiers
    for name, config in classifiers.items():
        print(f"\nTraining and tuning {name} with {resampler_name}...")
        
        # Create pipeline with resampling integrated
        # The imbalanced-learn pipeline runs the resampler only while fitting, so each
        # validation fold and the test set are scored on their original distribution.
        pipeline = ImbPipeline([
            ('preprocessor', preprocessor),
            ('resampler', resampler),
            ('classifier', config['model']),
        ])
        
        # GridSearchCV
        # Macro-averaged F1 gives every class the same weight regardless of its size.
        grid_search = GridSearchCV(
            pipeline,
            param_grid=config['param_grid'],
            scoring='f1_macro',
            cv=3,
            verbose=2,
            n_jobs=1
        )
        
        grid_search.fit(X_train, y_train)

         
        # Evaluate the model
        print(f"Best parameters for {name} with {resampler_name}: {grid_search.best_params_}")
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        
        # Classification report
        report = classification_report(y_test, y_pred, digits=3)
        print(f"\nResults for {name} with {resampler_name}:\n")
        print(report)
        
        # Save the model
        # The whole fitted pipeline is pickled, not just the classifier.
        model_filename = f"results_tfidf/{resampler_name}_{name}_model.pkl"
        with open(model_filename, 'wb') as model_file:
            pickle.dump(best_model, model_file)
        print(f"Model saved to {model_filename}")
        
        # Save the classification report
        report_filename = f"results_tfidf/{resampler_name}_{name}_report.txt"
        with open(report_filename, 'w') as report_file:
            report_file.write(f"Resampling Technique: {resampler_name}\n")
            report_file.write(f"Classifier: {name}\n")
            report_file.write(f"Best Parameters: {grid_search.best_params_}\n\n")
            report_file.write("Classification Report:\n")
            report_file.write(report)
        print(f"Classification report saved to {report_filename}")
