### Importing Necessary Libraries

In [None]:
import os
import pickle
import re
import string
import unicodedata
from collections import Counter
from datetime import datetime

import emoji
import matplotlib.pyplot as plt
import nlpaug.augmenter.word as naw
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from datasets import Dataset
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import TomekLinks
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from xgboost import XGBClassifier

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')


### Loading the Dataset

In [None]:
# Location of the bug-report CSV. The default is the original author's local
# path; set the BUG_DATASET_PATH environment variable to run the notebook on
# another machine without editing this cell.
file_path = os.environ.get('BUG_DATASET_PATH', r'C:\Users\buket\Desktop\new_v1\Bug_dataset.csv')
data = pd.read_csv(file_path)

# Quick look at the first rows to confirm the file was parsed as expected
print(data.head())

### Optional: Downsample the Dataset to Prevent Kernel Crashes (Leave Commented Out If Not Needed)

In [None]:
# #In case of a kernel crash : 
# # Define the reduction percentage
# reduction_percentage = 0.2

# # Function to reduce the dataset size by class in the target column in place
# def reduce_dataset_inplace(data, target_column, reduction_percentage):
#     frames = []
#     for class_label in data[target_column].unique():
#         class_data = data[data[target_column] == class_label]
#         sample_size = int(len(class_data) * (1 - reduction_percentage))
#         reduced_class_data = class_data.sample(n=sample_size, random_state=42)
#         frames.append(reduced_class_data)
#     return pd.concat(frames, ignore_index=True)

# # Reduce the dataset in place
# data = reduce_dataset_inplace(data, target_column="priority", reduction_percentage=reduction_percentage)

# # View the distribution of the 'priority' column after reduction
# priority_distribution = data["priority"].value_counts()

# # Print the priority distribution
# print("Priority column distribution after reduction:")
# print(priority_distribution)


### Priority Column Preprocessing
##### Depending on the approach, adjust the priority mapping and decide whether P3 rows are dropped or kept (if they are kept, P3 must also be given a class in the mapping)

In [None]:
# Priority labels that are removed before mapping: '--' is always dropped;
# 'P3' is dropped for the current approach (remove it from this list AND add
# it to priority_mapping to keep it).
excluded_priorities = ['--', 'P3']

# Define the mapping logic: priority label -> integer class id
priority_mapping = {
    'P1': 0,
    'P2': 0,
    'P4': 1,
    'P5': 2
}

is_excluded = data['priority'].isin(excluded_priorities)
mapped_priority = data['priority'].map(priority_mapping)

# Labels that are neither excluded nor present in the mapping (this includes
# missing priorities) would turn into NaN targets, which cannot be used for
# stratified splitting or training. Report them and drop them explicitly.
is_unmapped = mapped_priority.isna() & ~is_excluded
if is_unmapped.any():
    print(f"Dropping {int(is_unmapped.sum())} rows whose priority is not in priority_mapping")

rows_to_keep = mapped_priority.notna() & ~is_excluded
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original DataFrame (avoids SettingWithCopyWarning).
data = data[rows_to_keep].copy()
data['priority'] = mapped_priority[rows_to_keep].astype(int)

print(data['priority'].value_counts())

unique_count = data['priority'].nunique()
print(f"Number of unique values: {unique_count}")

### Handling Missing and Categorical Values in Severity

In [None]:
print(data['severity'].value_counts(dropna=False))

In [None]:
print(data['severity'].value_counts(dropna=False))

# Drop rows with NaN values in the 'severity' column
data = data.dropna(subset=['severity'])

print(data['severity'].value_counts())
print(f"Updated dataset shape: {data.shape}")

# Severity mapping based on the website
severity_mapping = {
    'S2': 2,       # Major
    'S1': 1,       # Critical
    'S3': 3,       # Normal
    'S4': 4,       # Minor
    'normal': 3,   # Normal
    'major': 2,    # Major
    'minor': 4,    # Minor
    'critical': 1, # Critical
    'trivial': 4,  # Minor
    'blocker': 1   # Critical
}

data['severity_mapped'] = data['severity'].map(severity_mapping)

In [None]:
print(data['priority'].value_counts(dropna=False))

In [None]:
print(data[['severity', 'severity_mapped']].head())

print(data['severity_mapped'].unique())
print(data['severity_mapped'].value_counts(dropna=False))

### Converting and Extracting Features from Creation Time

In [None]:
# Parse the 'creation_time' column into pandas datetimes
data['creation_time'] = pd.to_datetime(data['creation_time'])

creation_time = data['creation_time']
print(creation_time.head())
print(creation_time.dtypes)

# The most recent creation time in the dataset acts as the reference date
reference_date = creation_time.max()
print(f"Reference date: {reference_date}")

# Bug age = whole days between the reference date and the creation time
age_delta = reference_date - creation_time
data['bug_age'] = age_delta.dt.days
print(data[['creation_time', 'bug_age']].head())



### Handling Product Column

In [None]:
product_counts = data['product'].value_counts()

# Define the threshold for grouping smaller categories
threshold = 100

# Per-row frequency of that row's product. Rows with a missing product get NaN
# here (value_counts ignores NaN), and NaN >= threshold is False, so they are
# grouped under 'Other' instead of raising a KeyError on lookup.
product_frequency = data['product'].map(product_counts)
data['product'] = data['product'].where(product_frequency >= threshold, 'Other')

# Recalculate value counts after grouping smaller categories
updated_product_counts = data['product'].value_counts()

# Percentage of bugs grouped under "Other"; .get() covers the case where no
# product fell below the threshold and 'Other' does not exist.
other_percentage = (updated_product_counts.get('Other', 0) / len(data)) * 100

# Printed explicitly: a bare expression in the middle of a cell shows nothing
print(updated_product_counts)
print(f"Percentage of bugs grouped under 'Other': {other_percentage:.2f}%")

unique_count = data['product'].nunique()
print(f"Number of unique values: {unique_count}")



### Handling Textual Columns
##### Not dropping the missing values in the description column here since they will be merged with the summary column

In [None]:
# Replace missing text with empty strings in BOTH columns. String
# concatenation with NaN yields NaN, so a missing summary would otherwise
# produce a missing merged text even when the description is present.
for text_column in ('summary', 'description'):
    print(f"Missing values in '{text_column}': {data[text_column].isnull().sum()}")
    data[text_column] = data[text_column].fillna('')
    print(f"Missing values in '{text_column}': {data[text_column].isnull().sum()}")

# Merge the 'summary' and 'description' columns
data['merged_summary_description'] = data['summary'] + " " + data['description']

# Verify the new column
print(data[['summary', 'description', 'merged_summary_description']].head())

print(f"Missing values in 'merged_summary_description': {data['merged_summary_description'].isnull().sum()}")

### Dropping Unnecessary Columns

In [11]:
# Columns that are no longer needed once the derived features exist
columns_to_drop = ['summary', 'creation_time', 'id', 'description', 'severity']
data = data.drop(columns=columns_to_drop)


### Text Cleaning: Replacing Contractions and Removing Special Characters

**1. Replacing Contractions:**

In [12]:
# Apostrophe characters accepted inside a contraction: straight quote,
# typographic right quote, acute accent and backtick.
_APOSTROPHES = "'\u2019\u00b4`"

# Contractions (written with a straight apostrophe, lower case) and their
# expanded forms.
_CONTRACTIONS = {
    "don't": "do not",
    "isn't": "is not",
    "hasn't": "has not",
    "doesn't": "does not",
    "haven't": "have not",
    "aren't": "are not",
    "couldn't": "could not",
    "can't": "can not",
}

# Compiled once at import time; each key matches with any accepted apostrophe.
_CONTRACTION_PATTERN = re.compile(
    "|".join(key.replace("'", f"[{_APOSTROPHES}]") for key in _CONTRACTIONS),
    flags=re.IGNORECASE,
)

# Translation table that folds every accepted apostrophe to the straight one,
# so a matched contraction can be looked up in _CONTRACTIONS.
_APOSTROPHE_TABLE = str.maketrans({char: "'" for char in _APOSTROPHES})


def replace_contractions(text):
    """Expand the known English contractions found in *text*.

    Matching is case-insensitive and accepts ', ’, ´ or ` as the apostrophe.
    The replacement is always the lower-case expansion (e.g. "Don't" becomes
    "do not").

    Args:
        text: The string to process.

    Returns:
        str: The text with contractions expanded; an empty string when *text*
        is not a string (same convention as the other cleaning helpers).
    """
    if not isinstance(text, str):
        return ""

    def _expand(match):
        matched = match.group()
        key = matched.lower().translate(_APOSTROPHE_TABLE)
        # Fall back to the matched text if lower-casing produced something
        # that is not a known key, rather than raising KeyError.
        return _CONTRACTIONS.get(key, matched)

    return _CONTRACTION_PATTERN.sub(_expand, text)

# Expand contractions in every merged text, element by element
merged_text = data['merged_summary_description']
data['merged_summary_description'] = merged_text.map(replace_contractions)

**2. Cleaning Special Characters and Numbers:**

In [13]:
# Cleaning functions
def remove_html_tags(text):
    """Return *text* with HTML markup stripped, keeping only its text content.

    The input is parsed with BeautifulSoup's "html.parser". Non-string input
    yields an empty string.
    """
    if isinstance(text, str):
        return BeautifulSoup(text, "html.parser").get_text()
    return ""

# Anything starting with "http", "www" or "https" up to the next whitespace
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+", flags=re.MULTILINE)


def remove_urls(text):
    """Delete URL-like tokens from *text*.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        return _URL_PATTERN.sub('', text)
    return ""

# local-part @ domain . alphabetic-suffix
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+")


def remove_emails(text):
    """Delete e-mail addresses from *text*.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        return _EMAIL_PATTERN.sub('', text)
    return ""

# Code points U+0000-U+001F and U+007F-U+009F
_CONTROL_CHAR_PATTERN = re.compile(r'[\x00-\x1f\x7f-\x9f]')


def remove_control_characters(text):
    """Delete control characters from *text*.

    Note that the removed range includes tab, newline and carriage return;
    they are deleted, not replaced by a space. Non-string input yields an
    empty string.
    """
    if isinstance(text, str):
        return _CONTROL_CHAR_PATTERN.sub('', text)
    return ""

def remove_emojis(text):
    """Return *text* with every emoji replaced by nothing.

    Delegates to ``emoji.replace_emoji``. Non-string input yields an empty
    string.
    """
    if isinstance(text, str):
        return emoji.replace_emoji(text, replace="")
    return ""

# Deletion table for every character of string.punctuation except ',' and
# '.', built once instead of on every call.
_PUNCTUATION_TO_STRIP = str.maketrans(
    '', '', string.punctuation.replace(",", "").replace(".", "")
)


def remove_punctuation_except_commas_periods(text):
    """Delete ASCII punctuation from *text*, keeping commas and periods.

    Args:
        text: The string to clean.

    Returns:
        str: The text without punctuation other than ',' and '.'; an empty
        string when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    return text.translate(_PUNCTUATION_TO_STRIP)

def remove_non_ascii(text):
    """Drop every character of *text* whose code point is 128 or above.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        # Encoding to ASCII while ignoring errors discards exactly the
        # characters that are not ASCII.
        return text.encode('ascii', 'ignore').decode('ascii')
    return ""

def normalize_unicode(text):
    """Return *text* in Unicode normalization form NFKC.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        normalized = unicodedata.normalize('NFKC', text)
        return normalized
    return ""

def normalize_whitespace(text):
    """Collapse each run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed. Non-string input yields an
    empty string.
    """
    if isinstance(text, str):
        words = text.split()
        return " ".join(words)
    return ""

def _clean_text(text):
    """Run one string through every cleaning step, in order."""
    steps = (
        remove_html_tags,
        remove_urls,
        remove_emails,
        # Turn newlines/tabs into single spaces BEFORE control characters are
        # stripped; otherwise the words on either side of a line break would
        # be glued together when the newline is deleted.
        normalize_whitespace,
        remove_control_characters,
        remove_emojis,
        # NFKC must run before the ASCII filter: applied afterwards it has
        # nothing left to normalize, and compatibility characters (e.g.
        # ligatures, full-width letters) would be dropped instead of converted.
        normalize_unicode,
        remove_punctuation_except_commas_periods,
        remove_non_ascii,
        # Final pass: the removals above can leave double spaces behind.
        normalize_whitespace,
    )
    for step in steps:
        text = step(text)
    return text


def preprocessor(text_or_series):
    """Clean a single text or every text in a pandas Series.

    Args:
        text_or_series: A ``str`` or a ``pd.Series`` of texts.

    Returns:
        A cleaned ``pd.Series`` when given a Series (missing values become
        empty strings, other non-string values are converted with ``str``),
        a cleaned ``str`` when given a string, and an empty string for any
        other input type.
    """
    if isinstance(text_or_series, pd.Series):
        return text_or_series.apply(
            # Missing values must not be turned into the literal text "nan"
            lambda value: _clean_text("" if pd.isna(value) else str(value))
        )
    if isinstance(text_or_series, str):
        return _clean_text(text_or_series)
    return ""


In [None]:
data_preprocessed = data
data_preprocessed['merged_summary_description'] = preprocessor(data['merged_summary_description'])

In [15]:
# data_preprocessed.to_excel("test_v2.xlsx", sheet_name="Sheet1", index=False)

In [None]:
data_preprocessed.columns

In [17]:
# # Export the cleaned DataFrame to an Excel file
# file_path = "Preprocessed_Cleaned_Data.xlsx"
# data_preprocessed.to_excel(file_path, index=True)

### Train-Test Split

In [None]:
# Target (y) is the 'priority' column; features (X) are all remaining columns
y = data_preprocessed['priority']
X = data_preprocessed.drop(columns=['priority'])

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Output shapes for verification
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split.shape}")


### Applying NLP augmentation only to train data
##### Comment out this section when running the approach without NLP augmentation

In [19]:
train_data_for_NLP = pd.concat([X_train, y_train], axis=1)

In [None]:
print(train_data_for_NLP['priority'].value_counts())

In [21]:
def _augment_text(augmenter, sentence, n):
    """Return a list of up to *n* augmented versions of *sentence*."""
    # Nothing to augment in missing or blank text
    if not isinstance(sentence, str) or not sentence.strip():
        return []

    num_words = len(sentence.split())

    # Dynamically set the maximum number of words to augment
    if num_words <= 3:
        aug_max = 1  # Augment at most 1 word for short texts
    elif num_words <= 20:
        aug_max = max(1, int(0.2 * num_words))  # 20% of words for medium-length texts
    else:
        aug_max = max(1, int(0.1 * num_words))  # 10% of words for long texts

    # Update the augmenter with the dynamic `aug_max`
    augmenter.aug_max = aug_max

    augmented = augmenter.augment(sentence, n=n)

    # Depending on the library version, `augment` may hand back a plain string
    # instead of a list; iterating a string would yield one "sample" per
    # character, so always normalize to a list.
    if augmented is None:
        return []
    if isinstance(augmented, str):
        return [augmented]
    return list(augmented)


def augment_minority_class(df, text_column, label_column, aug_n=1, augmenter=None):
    """
    Augment text samples of the minority class using synonyms.

    Every row of the least frequent class is duplicated up to `aug_n` times
    with its text replaced by an augmented version; all other columns are
    copied unchanged. Rows whose text is missing or blank are not augmented.

    Args:
        df (pd.DataFrame): Input dataset with text and labels.
        text_column (str): Name of the column containing text data.
        label_column (str): Name of the column containing class labels.
        aug_n (int): Number of augmented samples to create per row.
        augmenter: Optional augmenter object exposing `augment(text, n=...)`
            and a writable `aug_max` attribute. Defaults to a WordNet-based
            `naw.SynonymAug`.

    Returns:
        pd.DataFrame: Original dataset followed by the augmented samples,
        with a fresh 0..n-1 index.
    """
    if augmenter is None:
        # Initialize synonym augmenter
        augmenter = naw.SynonymAug(aug_src='wordnet')

    # Identify the minority class and the rows belonging to it
    minority_class = df[label_column].value_counts().idxmin()
    minority_class_rows = df[df[label_column] == minority_class]

    # Collect, for every augmented text, the position of the row it came from.
    # Working by position (not index label) stays correct even when the index
    # contains duplicate labels.
    source_positions = []
    augmented_texts = []
    for position, original_text in enumerate(minority_class_rows[text_column]):
        for aug_text in _augment_text(augmenter, original_text, aug_n):
            source_positions.append(position)
            augmented_texts.append(aug_text)

    # Copy the source rows in one go (keeps column dtypes) and replace only
    # the text column.
    augmented_df = minority_class_rows.iloc[source_positions].copy()
    augmented_df[text_column] = augmented_texts

    # Combine original data with augmented data
    return pd.concat([df, augmented_df], ignore_index=True)


In [None]:
train_data_for_NLP.shape

In [23]:
Augmented_Train_Data = augment_minority_class(train_data_for_NLP, text_column='merged_summary_description', label_column='priority', aug_n=1)

In [None]:
print(Augmented_Train_Data['priority'].value_counts())

In [25]:
X_Train_Augmented = Augmented_Train_Data.drop(['priority'], axis=1)
y_Train_Augmented = Augmented_Train_Data['priority']


In [None]:
X_Train_Augmented['merged_summary_description']

In [None]:
print(y_Train_Augmented.value_counts())

In [32]:
X_train = X_Train_Augmented
X_test=X_test
y_train = y_Train_Augmented
y_test = y_test

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

### Fine Tuning The BERT Model

In [34]:
# # Combine train and test datasets
# feature_data = pd.concat([X_train, X_test], axis=0)

# # Combine labels if needed
# label_data = pd.concat([y_train, y_test], axis=0)

# # Verify the result
# print(feature_data.shape)
# print(label_data.shape)

In [35]:
# from datasets import Dataset
# from sklearn.model_selection import train_test_split

# # Split the data into train and validation sets
# # train_texts, val_texts, train_labels, val_labels = train_test_split(
# #     feature_data['merged_summary_description'], 
# #     label_data, 
# #     test_size=0.2, 
# #     stratify=label_data, 
# #     random_state=42
# # )

# train_texts = X_train['merged_summary_description']
# train_labels = y_train

# val_texts = X_test['merged_summary_description']
# val_labels = y_test

# # Create Hugging Face Dataset objects
# train_dataset = Dataset.from_dict({
#     'text': train_texts.tolist(),
#     'label': train_labels.tolist()
# })

# val_dataset = Dataset.from_dict({
#     'text': val_texts.tolist(),
#     'label': val_labels.tolist()
# })


In [36]:
# from transformers import AutoTokenizer

# # Load BERT tokenizer
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# # Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# # Tokenize datasets
# train_dataset = train_dataset.map(tokenize_function, batched=True)
# val_dataset = val_dataset.map(tokenize_function, batched=True)

# # Set dataset format for PyTorch
# train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
# val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


In [37]:
# num_labels = data_preprocessed['priority'].nunique()
# print(num_labels)

In [38]:
# from transformers import AutoModelForSequenceClassification

# # Define the number of labels (classes)
# num_labels = data_preprocessed['priority'].nunique()

# # Load the model
# model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


In [39]:
# from transformers import TrainingArguments, Trainer

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./bert-finetuned",          # Directory to save the model
#     evaluation_strategy="epoch",           # Evaluate every epoch
#     save_strategy="epoch",                 # Save checkpoint every epoch
#     logging_dir="./logs",                  # Directory for logs
#     logging_steps=10,                      # Log every 10 steps
#     per_device_train_batch_size=16,        # Batch size for training
#     per_device_eval_batch_size=32,         # Batch size for evaluation
#     num_train_epochs=3,                    # Number of epochs
#     learning_rate=5e-5,                    # Learning rate
#     weight_decay=0.01,                     # Weight decay
#     save_total_limit=2,                    # Limit the number of saved checkpoints
#     seed=42,                               # Random seed
#     load_best_model_at_end=True            # Load the best model at the end of training
# )

# # Define the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
# )


In [40]:
#trainer.train()

In [41]:
# model.save_pretrained("./bert-finetuned")
# tokenizer.save_pretrained("./bert-finetuned")

### Generating Embeddings

In [42]:
# # Evaluate on validation set
# results = trainer.evaluate()
# print(results)

In [43]:
# from transformers import AutoModel

# # Load the fine-tuned model
# fine_tuned_model = AutoModel.from_pretrained("./bert-finetuned")

# Generate embeddings as before using your `generate_bert_embeddings` function.

In [44]:
# from transformers import AutoTokenizer

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("./bert-finetuned")

# # Define your function to generate embeddings
# def generate_bert_embeddings(texts, tokenizer, model, batch_size=32, max_length=512):
#     model.eval()  # Set the model to evaluation mode
#     embeddings = []

#     with torch.no_grad():
#         for i in range(0, len(texts), batch_size):
#             batch = texts[i:i + batch_size]
#             inputs = tokenizer(
#                 batch,
#                 padding=True,
#                 truncation=True,
#                 max_length=max_length,
#                 return_tensors="pt"
#             )
#             outputs = model(**inputs)
#             cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token embeddings
#             embeddings.extend(cls_embeddings)

#     return np.vstack(embeddings)


In [45]:
# # Convert datasets to lists
# train_texts = X_train['merged_summary_description'].tolist()
# test_texts = X_test['merged_summary_description'].tolist()

# # Generate embeddings
# train_embeddings = generate_bert_embeddings(train_texts, tokenizer, fine_tuned_model)
# test_embeddings = generate_bert_embeddings(test_texts, tokenizer, fine_tuned_model)

# # Check shapes
# print("Train embeddings shape:", train_embeddings.shape)
# print("Test embeddings shape:", test_embeddings.shape)


In [46]:
# import numpy as np

# # Save train and test embeddings to .npy files
# np.save('finetuned_train_embeddings.npy', train_embeddings)
# np.save('finetuned_test_embeddings.npy', test_embeddings)

# print("Embeddings saved to 'train_embeddings.npy' and 'test_embeddings.npy'.")


In [None]:
# Load the embeddings that were previously saved to disk
train_embeddings = np.load('finetuned_train_embeddings.npy')
test_embeddings = np.load('finetuned_test_embeddings.npy')

print("Train Embeddings Shape:", train_embeddings.shape)
print("Test Embeddings Shape:", test_embeddings.shape)

# Check if the number of embeddings matches the number of samples
print("Number of training samples:", len(X_train))
print("Number of test samples:", len(X_test))

# Fail here with a clear message instead of later inside np.hstack. Only the
# row counts can be verified; the saved files must also have been generated
# from these rows in this order.
for split_name, embeddings, features in (
    ("train", train_embeddings, X_train),
    ("test", test_embeddings, X_test),
):
    if len(embeddings) != len(features):
        raise ValueError(
            f"{split_name} embeddings have {len(embeddings)} rows but the "
            f"{split_name} set has {len(features)} samples; regenerate the "
            f"embeddings for the current data."
        )

### Combining Embeddings with Preprocessed Numerical and Categorical Features

In [48]:
# 1. Define the preprocessing for numerical and categorical features
numerical_features = ['bug_age', 'severity_mapped']
categorical_features = ['product']

# Numerical transformations
scaler = StandardScaler()

# Categorical transformations; sparse_output=False returns a dense array so it
# can be stacked next to the embeddings. Categories unseen during fit are
# encoded as all zeros (handle_unknown='ignore').
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Equivalent ColumnTransformer, kept for reference. It is deliberately NOT
# named `preprocessor`: that name belongs to the text-cleaning function defined
# earlier, and rebinding it here would break re-running the text-cleaning cell.
feature_preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_features),
        ('cat', ohe, categorical_features)
    ])

# 2. Fit on the training data only, then apply the fitted transformers to the
# test data
X_train_numerical = X_train[numerical_features]
X_test_numerical = X_test[numerical_features]

# Scale numerical features
numerical_features_train = scaler.fit_transform(X_train_numerical)
numerical_features_test = scaler.transform(X_test_numerical)

# Apply OneHotEncoder to the categorical 'product' feature
categorical_features_train = ohe.fit_transform(X_train[categorical_features])
categorical_features_test = ohe.transform(X_test[categorical_features])

# 3. Concatenate the embeddings with transformed numerical and categorical features
X_train_combined = np.hstack((train_embeddings, numerical_features_train, categorical_features_train))
X_test_combined = np.hstack((test_embeddings, numerical_features_test, categorical_features_test))

# Now X_train_combined and X_test_combined include BERT embeddings, numerical, and categorical features

# Now X_train_combined and X_test_combined include BERT embeddings, numerical, and categorical features


In [49]:
X_train = X_train_combined
X_test = X_test_combined
y_train = y_Train_Augmented
y_test = y_test

In [None]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

### Model Training

**1. Model-Based Methods for Addressing Class Imbalance**

In [51]:
# Folder that receives the pickled models and the classification reports
results_dir = "results_finetuned_bert_uncased_none"
os.makedirs(results_dir, exist_ok=True)

# Number of target classes, taken from the training labels so that the XGBoost
# configuration follows the priority mapping instead of a hard-coded 3.
n_classes = len(np.unique(y_train))

# Define classifiers and their parameter grids
classifiers = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'param_grid': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10],
            'classifier__min_samples_split': [5, 10],
            'classifier__min_samples_leaf': [1, 4],
            'classifier__max_features': ['sqrt'],
            'classifier__bootstrap': [True],
            'classifier__criterion': ['gini', 'entropy']
        }
    },
    'XGBoost': {
        'model': XGBClassifier(eval_metric="mlogloss", random_state=42,
                               objective='multi:softmax', num_class=n_classes),
        'param_grid': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [3, 6, 15],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__subsample': [0.6, 0.8],
            'classifier__colsample_bytree': [0.8, 1.0],
            'classifier__gamma': [0, 0.1, 0.2],
        }
    },
    'Logistic Regression': {
        # `multi_class` is not passed: the parameter is deprecated in
        # scikit-learn, and with the lbfgs solver a problem with three or more
        # classes is fitted as multinomial by default.
        'model': LogisticRegression(random_state=42, class_weight='balanced',
                                    max_iter=1000),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs'],
            'classifier__max_iter': [500, 1000],
        }
    }
}

# Loop through classifiers and perform GridSearchCV
for name, config in classifiers.items():
    print(f"Training and tuning {name}...")
    # Single-step pipeline so the grid keys can use the 'classifier__' prefix
    pipeline = Pipeline([
        ('classifier', config['model'])
    ])

    # 3-fold grid search selecting by macro-averaged F1
    grid_search = GridSearchCV(
        pipeline,
        param_grid=config['param_grid'],
        scoring='f1_macro',
        cv=3,
        verbose=2,
        n_jobs=1
    )

    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)

    # Best parameters and evaluation on the held-out test set
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Generate classification report
    report = classification_report(y_test, y_pred, digits=3)
    print(f"\nResults for {name} :\n")
    print(report)
    print("=" * 50)

    # Save the model
    model_filename = f"{results_dir}/_{name}_model.pkl"
    with open(model_filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print(f"Model saved to {model_filename}")

    # Save the classification report (no resampling is used in this section,
    # so the technique line is left empty)
    report_filename = f"{results_dir}/_{name}_report.txt"
    with open(report_filename, 'w') as report_file:
        report_file.write("Resampling Technique: \n")
        report_file.write(f"Classifier: {name}\n")
        report_file.write(f"Best Parameters: {grid_search.best_params_}\n\n")
        report_file.write("Classification Report:\n")
        report_file.write(report)
    print(f"Classification report saved to {report_filename}")




**2. Resampling Techniques Applied for Addressing Class Imbalance**

In [None]:

# Folder that receives the pickled models and the classification reports
results_dir = "results_finedtuned_bert_uncased_paramgrid"
os.makedirs(results_dir, exist_ok=True)

# Number of target classes, taken from the training labels so that the XGBoost
# configuration follows the priority mapping instead of a hard-coded 3.
n_classes = len(np.unique(y_train))

# Target number of samples per class after SMOTE oversampling.
# NOTE(review): these counts look tuned to one specific training set. SMOTE
# only adds samples, so each target must be at least the current size of that
# class - confirm against the class distribution printed below.
smote_targets = {0: 5000, 1: 5800, 2: 7072}

resamplers = {
    'SMOTE': SMOTE(sampling_strategy=smote_targets, k_neighbors=5, random_state=42),

    # For SMOTETomek, pass the SMOTE instance as an argument (Tomek Links is applied automatically)
    'SMOTETomek': SMOTETomek(
        smote=SMOTE(sampling_strategy=smote_targets, k_neighbors=5, random_state=42),
        random_state=42,
    ),

    # Tomek Links - no explicit per-class targets; 'auto' lets imblearn decide
    'Tomek Links': TomekLinks(sampling_strategy='auto'),
}


# Classifiers without class weights: imbalance is handled by the resampler
classifiers = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10],
            'classifier__min_samples_split': [5, 10],
            'classifier__min_samples_leaf': [1, 4],
            'classifier__max_features': ['sqrt'],
            'classifier__bootstrap': [True],
            'classifier__criterion': ['gini', 'entropy']
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42, objective='multi:softmax', num_class=n_classes),
        'param_grid': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [3, 6, 15],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__subsample': [0.6, 0.8],
            'classifier__colsample_bytree': [0.8, 1.0],
            'classifier__gamma': [0, 0.1, 0.2],
        }
    },
    'Logistic Regression': {
        # `multi_class` is not passed: the parameter is deprecated in
        # scikit-learn, and with the lbfgs solver a problem with three or more
        # classes is fitted as multinomial by default.
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs'],
            'classifier__max_iter': [500, 1000],
        }
    }
}

# Loop through resampling techniques
for resampler_name, resampler in resamplers.items():
    print(f"\nUsing Resampling Technique: {resampler_name}")

    # Display class distribution before resampling
    print(f"Class distribution before resampling: {Counter(y_train)}")

    # Loop through classifiers
    for name, config in classifiers.items():
        print(f"\nTraining and tuning {name} with {resampler_name}...")

        # The resampler is a pipeline step, so during cross-validation it is
        # fitted on the training folds only and never on the validation fold
        pipeline = ImbPipeline([
            ('resampler', resampler),
            ('classifier', config['model']),
        ])

        # 3-fold grid search selecting by macro-averaged F1
        grid_search = GridSearchCV(
            pipeline,
            param_grid=config['param_grid'],
            scoring='f1_macro',
            cv=3,
            verbose=2,
            n_jobs=1
        )

        grid_search.fit(X_train, y_train)

        # Evaluate the best model on the held-out test set
        print(f"Best parameters for {name} with {resampler_name}: {grid_search.best_params_}")
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        # Classification report
        report = classification_report(y_test, y_pred, digits=3)
        print(f"\nResults for {name} with {resampler_name}:\n")
        print(report)

        # Save the model
        model_filename = f"{results_dir}/{resampler_name}_{name}_model.pkl"
        with open(model_filename, 'wb') as model_file:
            pickle.dump(best_model, model_file)
        print(f"Model saved to {model_filename}")

        # Save the classification report
        report_filename = f"{results_dir}/{resampler_name}_{name}_report.txt"
        with open(report_filename, 'w') as report_file:
            report_file.write(f"Resampling Technique: {resampler_name}\n")
            report_file.write(f"Classifier: {name}\n")
            report_file.write(f"Best Parameters: {grid_search.best_params_}\n\n")
            report_file.write("Classification Report:\n")
            report_file.write(report)
        print(f"Classification report saved to {report_filename}")
