Confidence-Based Filtering with Adaptive Sampling (CBFAS) Method

### Importing Basic Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as ml
import pandas as pd

### Importing Machine Learning Modules

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

### Loading Dataset

In [None]:
# Read the raw spreadsheet into a DataFrame; the path is an absolute
# location in the notebook's runtime filesystem.
dataset0 = pd.read_excel("/content/RCOMSLW+.xlsx")

### Dividing Dataset into Independent and Dependent Features

In [None]:
from typing import ValuesView
# Split by column position: columns 1..18 are the candidate features and
# column 21 is the target.
# NOTE(review): positional selection silently breaks if the spreadsheet's
# column order changes — confirm against the file layout.
X = dataset0.iloc[:, 1:19]
y = dataset0.iloc[:, 21]

### Encoding Dependent Variable

In [None]:
# Turn the raw target labels into integer codes. The rest of the notebook
# treats code 1 as the minority class and code 0 as the majority class.
# NOTE(review): which raw label ends up as 1 is decided by the encoder —
# confirm that it really is the minority class.
encoder = LabelEncoder().fit(y)
y = encoder.transform(y)

### Dropping Unnecessary Features

In [None]:
# Remove the feature columns that are not used by the method.
X = X.drop(
    columns=[
        'FLOW/MAXFLOW',
        'LTYPE',
        'GTYPE',
        'LO',
        'DIAGONAL',
        'WIDTH',
        'LA',
        'PHONE',
        'MIDDLE',
    ]
)

### Defining Minority Point Zone Classification Function

In [None]:
from scipy.spatial.distance import euclidean

def classify_minority_point_zone(minority_point, X, y, k,
                                 minority_class=1, majority_class=0,
                                 zone_threshold=0.8):
    """
    Classifies a minority point into 'safe', 'dangerous', or 'noise' zone
    based on the class make-up of its k-nearest neighbors.

    The closest row of X is assumed to be the point itself and is skipped.
    If the point is not a row of X, its true nearest neighbor is skipped
    instead; if X contains an exact duplicate of the point, one of the two
    identical rows is skipped (the one with the lower row index).

    Args:
        minority_point (array-like): A single data point from the minority class.
        X (array-like): The complete dataset, one row per sample.
        y (array-like): The labels, positionally aligned with the rows of X.
        k (int): The number of nearest neighbors to consider.
        minority_class: Label value of the minority class (default 1).
        majority_class: Label value of the majority class (default 0).
        zone_threshold (float): Fraction of k that one class must reach among
            the neighbors for the point to be 'noise' or 'safe' (default 0.8).

    Returns:
        str: "noise" if at least zone_threshold * k neighbors are majority,
        "safe" if at least zone_threshold * k neighbors are minority,
        otherwise "dangerous".
    """
    # Work on plain arrays so indexing is positional even when the caller
    # passes lists or pandas objects.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    point = np.asarray(minority_point, dtype=float)

    # Euclidean distance from the point to every row, in a single call.
    distances = np.linalg.norm(X - point, axis=1)

    # A stable sort keeps equal distances in row order, so the neighbor set
    # is deterministic. Position 0 is treated as the point itself.
    nearest_indices = np.argsort(distances, kind="stable")[1:k + 1]
    nearest_labels = y[nearest_indices]

    minority_neighbors_count = np.sum(nearest_labels == minority_class)
    majority_neighbors_count = np.sum(nearest_labels == majority_class)

    required = k * zone_threshold
    # Noise: the minority point sits among (mostly) majority neighbors.
    if majority_neighbors_count >= required:
        return "noise"
    # Safe: the minority point sits among (mostly) minority neighbors.
    if minority_neighbors_count >= required:
        return "safe"
    # Dangerous: a mixed neighborhood, i.e. close to the class boundary.
    return "dangerous"

### Defining Majority Point Zone Classification Function

In [None]:
def classify_majority_point_zone(majority_point, X, y, k,
                                 minority_class=1, majority_class=0,
                                 zone_threshold=0.8):
    """
    Classifies a majority point into 'safe', 'dangerous', or 'noise' zone
    based on the class make-up of its k-nearest neighbors.

    The closest row of X is assumed to be the point itself and is skipped.
    If the point is not a row of X, its true nearest neighbor is skipped
    instead; if X contains an exact duplicate of the point, one of the two
    identical rows is skipped (the one with the lower row index).

    Args:
        majority_point (array-like): A single data point from the majority class.
        X (array-like): The complete dataset, one row per sample.
        y (array-like): The labels, positionally aligned with the rows of X.
        k (int): The number of nearest neighbors to consider.
        minority_class: Label value of the minority class (default 1).
        majority_class: Label value of the majority class (default 0).
        zone_threshold (float): Fraction of k that one class must reach among
            the neighbors for the point to be 'noise' or 'safe' (default 0.8).

    Returns:
        str: "noise" if at least zone_threshold * k neighbors are minority,
        "safe" if at least zone_threshold * k neighbors are majority,
        otherwise "dangerous".
    """
    # Work on plain arrays so indexing is positional even when the caller
    # passes lists or pandas objects.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    point = np.asarray(majority_point, dtype=float)

    # Euclidean distance from the point to every row, in a single call.
    distances = np.linalg.norm(X - point, axis=1)

    # A stable sort keeps equal distances in row order, so the neighbor set
    # is deterministic. Position 0 is treated as the point itself.
    nearest_indices = np.argsort(distances, kind="stable")[1:k + 1]
    nearest_labels = y[nearest_indices]

    minority_neighbors_count = np.sum(nearest_labels == minority_class)
    majority_neighbors_count = np.sum(nearest_labels == majority_class)

    required = k * zone_threshold
    # Noise: the majority point sits among (mostly) minority neighbors.
    if minority_neighbors_count >= required:
        return "noise"
    # Safe: the majority point sits among (mostly) majority neighbors.
    if majority_neighbors_count >= required:
        return "safe"
    # Dangerous: a mixed neighborhood, i.e. close to the class boundary.
    return "dangerous"

### Calculating and Plotting Minority Zone Classifications

In [None]:
# Classify every minority sample of the full (unsplit, unscaled) dataset by
# the class make-up of its k nearest neighbours, then draw the zones on the
# first two feature columns.
minority_indices = np.where(y == 1)[0]
zone_classifications = {}
k = 4

# Positional (NumPy) view of the feature table.
X_np = X.values

zones_for_k = [
    classify_minority_point_zone(X_np[i], X_np, y, k) for i in minority_indices
]
zone_classifications[k] = zones_for_k

# (zone name, marker colour, legend label) in drawing order.
zone_styles = (
    ("safe", "green", "Minority - Safe"),
    ("dangerous", "orange", "Minority - Dangerous"),
    ("noise", "red", "Minority - Noise"),
)

# A separate loop variable is used so the global `k` is not rebound.
for k_value, zones in zone_classifications.items():
    # The plot needs two feature columns; check before creating a figure so
    # no empty figure is left behind.
    if X_np.shape[1] < 2:
        print("Warning: X_np has less than 2 columns. Cannot plot features X1 and X2.")
        continue

    plt.figure(figsize=(10, 6))  # wide enough for the legend outside the axes
    axes = plt.gca()

    # Majority class as a faint background.
    plt.scatter(X_np[y == 0, 0], X_np[y == 0, 1], color='blue', label='Class 0', alpha=0.3)

    # One scatter call per zone (instead of one per point): each legend
    # entry appears exactly once and far fewer artists are created.
    minority_points = X_np[minority_indices]
    zone_array = np.asarray(zones, dtype=str)
    for zone_name, zone_color, zone_label in zone_styles:
        in_zone = zone_array == zone_name
        if not in_zone.any():
            continue  # keep empty zones out of the legend
        plt.scatter(
            minority_points[in_zone, 0],
            minority_points[in_zone, 1],
            color=zone_color,
            label=zone_label,
            alpha=0.8,
            edgecolors='black',
        )

    # Per-zone counts, written to the right of the axes (axes coordinates).
    safe_count = zones.count("safe")
    dangerous_count = zones.count("dangerous")
    noise_count = zones.count("noise")

    plt.text(1.15, 0.08, f'Safe: {safe_count}', transform=axes.transAxes, fontsize=10, verticalalignment='bottom')
    plt.text(1.15, 0.04, f'Dangerous: {dangerous_count}', transform=axes.transAxes, fontsize=10, verticalalignment='bottom')
    plt.text(1.15, 0.00, f'Noise: {noise_count}', transform=axes.transAxes, fontsize=10, verticalalignment='bottom')

    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the axes

    plt.title(f"Scatter Plot with Minority Zones for k = {k_value}")
    plt.xlabel("Feature X1")
    plt.ylabel("Feature X2")
    plt.tight_layout(rect=[0, 0, 0.85, 1])  # leave room on the right for legend and counts
    plt.show()

### Splitting Data into Training and Testing Sets

In [None]:
# 70/30 split with a fixed seed; stratifying on y keeps the class ratio the
# same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

### Applying Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only and apply that
# same transformation to the test split. Fitting a second scaler on X_test
# would put the two splits on different scales and leak test-set statistics
# into the evaluation.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training MLP Model and Getting Minority Confidence Scores

In [None]:
from sklearn.neural_network import MLPClassifier

# Fit the MLP that supplies the per-sample confidence scores.
mlp_model = MLPClassifier(max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Class probabilities for the training samples themselves. These are
# in-sample predictions: the model is scoring the data it was fitted on.
confidence_scores = mlp_model.predict_proba(X_train)

### Extracting Minority Confidence Scores

In [None]:
# Second probability column, taken to be the minority class (label 1).
minority_confidence = confidence_scores[..., 1]

# Quick look at the first few values.
print("First 10 minority confidence scores:", minority_confidence[:10])

### Extracting Majority Confidence Scores

In [None]:
# First probability column, taken to be the majority class (label 0).
majority_confidence = confidence_scores[..., 0]

# Quick look at the first few values.
print("First 10 majority confidence scores:", majority_confidence[:10])

### Classifying Minority Training Points into Zones

In [None]:
# Zone of every minority training sample, judged on its 4 nearest neighbours
# within the scaled training set.
k_for_zones = 4

# After scaling, X_train is already a NumPy array, so it is used directly.
X_train_np = X_train

# Row positions of the minority samples inside the training set.
minority_train_indices = np.where(y_train == 1)[0]

# One zone label per minority sample, in the order of minority_train_indices.
zones_train_k4 = [
    classify_minority_point_zone(X_train_np[row], X_train_np, y_train, k_for_zones)
    for row in minority_train_indices
]

# Quick look at the first few labels.
print("First 10 minority training points zone classifications:", zones_train_k4[:10])

### Classifying Majority Training Points into Zones

In [None]:
# Zone of every majority training sample, judged on its 4 nearest neighbours
# within the scaled training set.
k_for_zones = 4

# After scaling, X_train is already a NumPy array, so it is used directly.
X_train_np = X_train

# Row positions of the majority samples inside the training set.
majority_train_indices = np.where(y_train == 0)[0]

# One zone label per majority sample, in the order of majority_train_indices.
zones_majority_train_k4 = [
    classify_majority_point_zone(X_train_np[row], X_train_np, y_train, k_for_zones)
    for row in majority_train_indices
]

# Quick look at the first few labels.
print("First 10 majority training points zone classifications:", zones_majority_train_k4[:10])

### Creating DataFrame for Minority Points Info

In [None]:
# Summary table for the minority training samples: position in the training
# set, zone, the model's minority-class confidence and the first two
# (scaled) features.
minority_train_indices_in_train = np.where(y_train == 1)[0]  # positions within X_train
minority_train_data = X_train[y_train == 1]

# zones_train_k4 comes from the minority zone-classification cell; the table
# is only built when it has exactly one entry per minority sample.
if len(zones_train_k4) == len(minority_train_data):
    minority_info_train = pd.DataFrame({
        'Original_Train_Index': minority_train_indices_in_train,
        'Zone': zones_train_k4,
        # Confidence values are stored per training row, so pick the minority rows.
        'Confidence': minority_confidence[minority_train_indices_in_train],
        'Feature_X1': minority_train_data[:, 0],
        'Feature_X2': minority_train_data[:, 1]
    })

    # Show the first rows of the table.
    display(minority_info_train.head())
else:
    print("Mismatch between number of minority training points and zone classifications. Rerun the previous cell.")

### Creating DataFrame for Majority Points Info

In [None]:
# Summary table for the majority training samples: position in the training
# set, zone, the model's majority-class confidence and the first two
# (scaled) features.
majority_train_indices_in_train = np.where(y_train == 0)[0]  # positions within X_train
majority_train_data = X_train[y_train == 0]

# zones_majority_train_k4 comes from the majority zone-classification cell;
# the table is only built when it has exactly one entry per majority sample.
if len(zones_majority_train_k4) == len(majority_train_data):
    majority_info_train = pd.DataFrame({
        'Original_Train_Index': majority_train_indices_in_train,
        'Zone': zones_majority_train_k4,
        # Confidence values are stored per training row, so pick the majority rows.
        'Confidence': majority_confidence[majority_train_indices_in_train],
        'Feature_X1': majority_train_data[:, 0],
        'Feature_X2': majority_train_data[:, 1]
    })

    # Show the first rows of the table.
    display(majority_info_train.head())
else:
    print("Mismatch between number of majority training points and zone classifications. Rerun the previous cell.")

### Defining Probability Threshold

In [None]:
# Probability cut-off intended for the confidence-based filtering rules.
# NOTE(review): the filtering cell further down reassigns
# probability_threshold to 0.55 before applying any rule, so this 0.7 is not
# the value the rules actually use — confirm which one is intended.
probability_threshold = 0.7

### Calculating Weighted Average Confidence Ratios and Combining Information

In [None]:
from scipy.spatial.distance import euclidean
import numpy as np # Added import
import pandas as pd # Added import

def calculate_weighted_average_confidence_ratio(point_index, X, y, k, confidence_scores):
    """
    Calculates the inverse-distance-weighted average confidence of the
    majority and of the minority neighbors of one point, and their ratio.

    The k nearest rows of X other than the point itself are used. The point
    is excluded by its index, not by its position in the sorted distances,
    so an exact duplicate of the point is still counted as a neighbor and
    the point never counts as its own neighbor.

    Args:
        point_index (int): The row index of the point in the dataset X.
        X (np.ndarray): The complete dataset, one row per sample.
        y (np.ndarray): The corresponding labels (0 = majority, anything
            else is treated as minority).
        k (int): The number of nearest neighbors to consider.
        confidence_scores (np.ndarray): Two-column array of class
            probabilities per sample; column 1 is read as the minority-class
            probability and 1 minus it as the majority-class probability.

    Returns:
        tuple: A tuple containing:
            - weighted_average_majority_confidence (float): Weighted average
              majority-class confidence of the majority neighbors (0.0 if
              there are none).
            - weighted_average_minority_confidence (float): Weighted average
              minority-class confidence of the minority neighbors (0.0 if
              there are none).
            - ratio (float): Majority average divided by minority average,
              or np.inf if the minority average is zero.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    confidence_scores = np.asarray(confidence_scores, dtype=float)

    # Euclidean distance from the point to every row, in a single call.
    distances = np.linalg.norm(X - X[point_index], axis=1)

    # Normalise a negative index so it can be compared with sorted positions.
    self_index = point_index if point_index >= 0 else point_index + len(X)

    # Stable sort for deterministic tie-breaking; drop the point by index.
    order = np.argsort(distances, kind="stable")
    nearest_indices = order[order != self_index][:k]

    epsilon = 1e-8  # keeps the inverse-distance weight finite at distance 0
    weights = 1.0 / (distances[nearest_indices] + epsilon)
    minority_confidence_of_neighbors = confidence_scores[nearest_indices, 1]
    neighbor_is_majority = y[nearest_indices] == 0

    majority_weights = weights[neighbor_is_majority]
    minority_weights = weights[~neighbor_is_majority]
    sum_majority_weights = majority_weights.sum()
    sum_minority_weights = minority_weights.sum()

    # Majority neighbors contribute their confidence in the MAJORITY class.
    if sum_majority_weights > 0:
        weighted_average_majority_confidence = float(
            np.sum(majority_weights * (1 - minority_confidence_of_neighbors[neighbor_is_majority]))
            / sum_majority_weights
        )
    else:
        weighted_average_majority_confidence = 0.0

    # Minority neighbors contribute their confidence in the MINORITY class.
    if sum_minority_weights > 0:
        weighted_average_minority_confidence = float(
            np.sum(minority_weights * minority_confidence_of_neighbors[~neighbor_is_majority])
            / sum_minority_weights
        )
    else:
        weighted_average_minority_confidence = 0.0

    # Ratio of the two averages; an absent or zero minority side yields inf.
    if weighted_average_minority_confidence == 0:
        ratio = np.inf
    else:
        ratio = weighted_average_majority_confidence / weighted_average_minority_confidence

    return weighted_average_majority_confidence, weighted_average_minority_confidence, ratio

# Build one table with, for every training sample, its class, its zone, its
# own minority-class confidence and the neighbour-weighted confidences.

# Class probabilities of all training samples from the fitted MLP.
all_train_confidence_scores = mlp_model.predict_proba(X_train)

# X_train is already a NumPy array after scaling.
X_train_np = X_train
all_train_indices = np.arange(len(X_train))

# Neighbour-weighted confidences and their ratio, one entry per sample.
weighted_avg_majority_confidences = []
weighted_avg_minority_confidences = []
weighted_avg_ratios_train = []
for sample_position in range(len(X_train)):
    neighbor_stats = calculate_weighted_average_confidence_ratio(
        sample_position, X_train_np, y_train, k_for_zones, all_train_confidence_scores
    )
    weighted_avg_majority_confidences.append(neighbor_stats[0])
    weighted_avg_minority_confidences.append(neighbor_stats[1])
    weighted_avg_ratios_train.append(neighbor_stats[2])

# The zones were computed separately per class; merge them into a single
# lookup keyed by training-set position. zones_train_k4 is aligned with the
# minority positions and zones_majority_train_k4 with the majority positions.
minority_train_indices = np.where(y_train == 1)[0]
majority_train_indices = np.where(y_train == 0)[0]
zone_mapping = dict(zip(minority_train_indices, zones_train_k4))
zone_mapping.update(zip(majority_train_indices, zones_majority_train_k4))

# Zones re-ordered to follow the training-set order.
all_zones_train = [zone_mapping[position] for position in all_train_indices]

# Each sample's own minority-class probability.
all_individual_confidence = all_train_confidence_scores[:, 1]

# Human-readable class name per sample.
original_classes = np.where(
    y_train[all_train_indices] == 1, 'Minority', 'Majority'
).tolist()

train_points_info_updated = pd.DataFrame({
    'Original_Train_Index': all_train_indices,
    'Original_Class': original_classes,
    'Zone': all_zones_train,
    'Individual_Confidence_Minority': all_individual_confidence,
    'Weighted_Average_Majority_Neighbor_Confidence': weighted_avg_majority_confidences,
    'Weighted_Average_Minority_Neighbor_Confidence': weighted_avg_minority_confidences,
    'Weighted_Average_Confidence_Ratio': weighted_avg_ratios_train
})

# Show the first rows of the combined table.
display(train_points_info_updated.head())

### Applying Filtering Rules and Creating Filtered Dataset

In [None]:
from scipy.spatial.distance import euclidean # Added import
import numpy as np # Added import
import pandas as pd # Added import

# Apply the four active filtering rules (2, 3, 5 and 6; rules 1 and 4 are
# retired) to train_points_info_updated and build the filtered training set
# X_filtered / y_filtered.

# The probability threshold is (re)defined here; the ratio threshold is the
# corresponding odds p / (1 - p).
probability_threshold = 0.55
ratio_threshold = probability_threshold / (1 - probability_threshold)
print(f"Using probability threshold: {probability_threshold}")
print(f"Calculated ratio threshold: {ratio_threshold}")

# Neighbourhood size used for the zones.
k_for_zones = 4

# Training-set positions selected for removal, per rule.
elimination_indices_rule2 = set()
elimination_indices_rule3 = set()
elimination_indices_rule5 = set()
elimination_indices_rule6 = set()

# Confidence cut-offs derived from the probability threshold.
low_confidence_cutoff = 1 - probability_threshold
very_low_confidence_cutoff = (1 - probability_threshold) / 2
very_high_confidence_cutoff = probability_threshold + ((1 - probability_threshold) / 2)

# itertuples keeps each column's own dtype and is cheaper than iterrows.
for point in train_points_info_updated.itertuples(index=False):
    original_train_index = point.Original_Train_Index
    original_class = point.Original_Class
    zone = point.Zone
    individual_confidence_minority = point.Individual_Confidence_Minority
    weighted_avg_confidence_ratio = point.Weighted_Average_Confidence_Ratio

    # The model's confidence that the point belongs to the majority class.
    individual_confidence_majority = 1 - individual_confidence_minority

    if original_class == 'Majority':
        if zone == 'noise':
            # Rule 2: noise majority point with low majority confidence.
            if individual_confidence_majority < low_confidence_cutoff:
                elimination_indices_rule2.add(original_train_index)
        elif zone == 'dangerous':
            # Rule 3: dangerous majority point with very low majority
            # confidence AND a neighbourhood ratio below the threshold.
            # Only the point itself is removed, never its neighbours.
            if (individual_confidence_majority < very_low_confidence_cutoff
                    and weighted_avg_confidence_ratio < ratio_threshold):
                elimination_indices_rule3.add(original_train_index)
    elif original_class == 'Minority':
        if zone == 'dangerous':
            # Rule 5: dangerous minority point with very low minority
            # confidence AND a neighbourhood ratio above the threshold.
            if (individual_confidence_minority < very_low_confidence_cutoff
                    and weighted_avg_confidence_ratio > ratio_threshold):
                elimination_indices_rule5.add(original_train_index)
        elif zone == 'noise':
            # Rule 6: noise minority point with very high majority confidence.
            if individual_confidence_majority > very_high_confidence_cutoff:
                elimination_indices_rule6.add(original_train_index)

# Per-rule counters. Every training index occurs once in the table, so the
# size of each set equals the number of points the rule selected. Rules 2
# and 3 only ever select majority points; rules 5 and 6 only minority points.
eliminated_majority_rule2_count = len(elimination_indices_rule2)
eliminated_minority_rule2_count = 0

eliminated_majority_rule3_count = len(elimination_indices_rule3)
eliminated_minority_rule3_count = 0

eliminated_majority_rule5_count = 0
eliminated_minority_rule5_count = len(elimination_indices_rule5)

eliminated_majority_rule6_count = 0
eliminated_minority_rule6_count = len(elimination_indices_rule6)

# Union of all four rules (Rule 3 included): each point is removed once.
elimination_indices_total = (
    elimination_indices_rule2
    | elimination_indices_rule3
    | elimination_indices_rule5
    | elimination_indices_rule6
)

# Sorted list of the positions to remove.
elimination_indices_list = sorted(elimination_indices_total)

print("\n--- Elimination Summary by Rule ---")
print(f"Rule 2 (Noise Majority, Low Majority Conf): Eliminated {eliminated_majority_rule2_count} Majority points")
print(f"Rule 3 (Dangerous Majority, Very Low Majority Conf & Low Ratio): Eliminated {eliminated_majority_rule3_count} Majority points")
print(f"Rule 5 (Dangerous Minority, Very Low Minority Conf & High Ratio): Eliminated {eliminated_minority_rule5_count} Minority points")
print(f"Rule 6 (Noise Minority, Very High Majority Conf): Eliminated {eliminated_minority_rule6_count} Minority points")


print(f"\nTotal unique points identified for elimination: {len(elimination_indices_list)}")
print(f"Indices of total unique points to be eliminated (first 20): {elimination_indices_list[:20]}...")


# Filtered dataset: keep every training position that is NOT in the
# elimination list.
all_train_indices = np.arange(len(X_train))
keep_mask = np.isin(all_train_indices, elimination_indices_list, invert=True)

X_filtered = X_train[keep_mask]
y_filtered = y_train[keep_mask]

print(f"\nOriginal training data shape: {X_train.shape}")
print(f"Filtered training data shape: {X_filtered.shape}")

# Class distribution after filtering.
unique, counts = np.unique(y_filtered, return_counts=True)
filtered_class_distribution = dict(zip(unique, counts))
print(f"Filtered training data class distribution: {filtered_class_distribution}")

# Class distribution before filtering, to report how many points of each
# class were removed overall. Labels are the integers 0 (majority) and
# 1 (minority), so the lookups use integer keys.
unique_original, counts_original = np.unique(y_train, return_counts=True)
original_class_distribution = dict(zip(unique_original, counts_original))

eliminated_majority_total = original_class_distribution.get(0, 0) - filtered_class_distribution.get(0, 0)
eliminated_minority_total = original_class_distribution.get(1, 0) - filtered_class_distribution.get(1, 0)

print(f"\nTotal Majority points eliminated (unique): {eliminated_majority_total}")
print(f"Total Minority points eliminated (unique): {eliminated_minority_total}")

### Plotting Original Training Data with Eliminated Points

In [None]:
# Scatter of the scaled training set on its first two features, with the
# points selected for elimination overlaid as 'X' markers per class.

# X_train is already a NumPy array after scaling.
X_train_np = X_train

plt.figure(figsize=(10, 6))

# Background layers: every original training point, by class.
majority_rows = X_train_np[y_train == 0]
minority_rows = X_train_np[y_train == 1]
plt.scatter(majority_rows[:, 0], majority_rows[:, 1], color='blue', alpha=0.3, label='Original Majority')
plt.scatter(minority_rows[:, 0], minority_rows[:, 1], color='red', alpha=0.8, label='Original Minority')

# Eliminated majority points: black 'X' with a white outline.
eliminated_majority_indices = [
    position for position in elimination_indices_list if y_train[position] == 0
]
eliminated_majority_X = X_train_np[eliminated_majority_indices]
eliminated_majority_y = y_train[eliminated_majority_indices]
plt.scatter(
    eliminated_majority_X[:, 0],
    eliminated_majority_X[:, 1],
    color='black',
    marker='X',
    s=100,
    label='Eliminated Majority',
    edgecolors='white',
    linewidth=1.5,
)

# Eliminated minority points: green 'X' with a black outline.
eliminated_minority_indices = [
    position for position in elimination_indices_list if y_train[position] == 1
]
eliminated_minority_X = X_train_np[eliminated_minority_indices]
eliminated_minority_y = y_train[eliminated_minority_indices]
plt.scatter(
    eliminated_minority_X[:, 0],
    eliminated_minority_X[:, 1],
    color='green',
    marker='X',
    s=100,
    label='Eliminated Minority',
    edgecolors='black',
    linewidth=1.5,
)

plt.title("Original Training Data with Eliminated Points Highlighted by Class")
plt.xlabel("Feature X1")
plt.ylabel("Feature X2")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the axes
plt.tight_layout(rect=[0, 0, 0.85, 1])  # leave room on the right for the legend
plt.show()

### Plotting Filtered Training Data

In [None]:
plt.figure(figsize=(10, 6))

# One scatter layer per class, drawn in the order Majority (label 0) then Minority (label 1).
for _label_value, _colour, _alpha, _legend in (
    (0, 'blue', 0.6, 'Filtered Majority'),
    (1, 'red', 0.8, 'Filtered Minority'),
):
    _class_rows = y_filtered == _label_value
    plt.scatter(
        X_filtered[_class_rows, 0],
        X_filtered[_class_rows, 1],
        color=_colour,
        alpha=_alpha,
        label=_legend,
    )

plt.title("Filtered Training Data Points")
plt.xlabel("Feature X1")
plt.ylabel("Feature X2")
# The legend sits outside the axes; the layout rect reserves room for it on the right.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

### Recalculating Zone Classifications and Confidence for Filtered Data

In [None]:
# Recompute, on the filtered training data (X_filtered, y_filtered), the zone of
# every point, the MLP confidence scores, and the weighted-average neighbour
# confidence ratio, and collect them in one DataFrame with one row per point.

# Map each position in the filtered arrays back to its index in X_train.
# The mask is rebuilt here so this cell does not depend on the filtering cell's
# variables still being in memory.
all_train_indices = np.arange(len(X_train))
keep_mask = np.isin(all_train_indices, elimination_indices_list, invert=True)
original_indices_filtered = all_train_indices[keep_mask]

zones_filtered = []
k_for_zones_filtered = 4  # Use the same k for zone classification as before

# X_filtered is already a NumPy array, so no conversion is needed.
X_filtered_np = X_filtered

# Classify minority points (label 1); neighbours are searched in the *filtered* data.
minority_filtered_indices_in_filtered = np.where(y_filtered == 1)[0]
for i in minority_filtered_indices_in_filtered:
    zone = classify_minority_point_zone(X_filtered_np[i], X_filtered_np, y_filtered, k_for_zones_filtered)
    zones_filtered.append({
        'Filtered_Index': i,
        'Original_Train_Index': original_indices_filtered[i],
        'Original_Class': 'Minority',
        'Zone': zone,
    })

# Classify majority points (label 0); neighbours are searched in the *filtered* data.
majority_filtered_indices_in_filtered = np.where(y_filtered == 0)[0]
for i in majority_filtered_indices_in_filtered:
    zone = classify_majority_point_zone(X_filtered_np[i], X_filtered_np, y_filtered, k_for_zones_filtered)
    zones_filtered.append({
        'Filtered_Index': i,
        'Original_Train_Index': original_indices_filtered[i],
        'Original_Class': 'Majority',
        'Zone': zone,
    })

# The records above are appended minority-first, then majority, so their order
# does NOT match the row order of X_filtered. Sort by Filtered_Index so rows
# follow X_filtered order, and keep the positions for explicit lookups below.
# Assigning per-point arrays positionally without this step would attach each
# confidence value to the wrong point.
zones_filtered_df = (
    pd.DataFrame(zones_filtered)
    .sort_values('Filtered_Index')
    .reset_index(drop=True)
)
row_filtered_positions = zones_filtered_df['Filtered_Index'].to_numpy()

# --- Recalculate confidence scores using a model trained on the filtered data ---

# Train a new MLP model on the filtered training data
mlp_model_filtered = MLPClassifier(random_state=42, max_iter=1000)
mlp_model_filtered.fit(X_filtered, y_filtered)

# Class probabilities for every filtered point, in X_filtered row order
filtered_confidence_scores = mlp_model_filtered.predict_proba(X_filtered)

# Column 1 is read as the minority-class probability and column 0 as the
# majority-class probability (assumes the fitted classes are ordered [0, 1]).
filtered_individual_confidence_minority = filtered_confidence_scores[:, 1]
filtered_individual_confidence_majority = filtered_confidence_scores[:, 0]

# Attach the scores to the row of the point they belong to.
zones_filtered_df['Individual_Confidence_Minority'] = filtered_individual_confidence_minority[row_filtered_positions]
zones_filtered_df['Individual_Confidence_Majority'] = filtered_individual_confidence_majority[row_filtered_positions]


# --- Add Minority Subgroup column to zones_filtered_df ---
# This information is needed later for adaptive alpha calculation
def categorize_minority_group_filtered(row):
    """Return the subgroup label for one row of zones_filtered_df.

    Majority rows are labelled 'Majority'. Minority rows are labelled by zone;
    'safe' rows are further split on whether 'Individual_Confidence_Minority'
    is below the global ``probability_threshold``.
    """
    if row['Original_Class'] == 'Majority':
        return 'Majority'
    elif row['Zone'] == 'dangerous':
        return 'Dangerous Minority'
    elif row['Zone'] == 'safe' and row['Individual_Confidence_Minority'] < probability_threshold:
        return 'Safe Minority Low Conf'
    elif row['Zone'] == 'safe' and row['Individual_Confidence_Minority'] >= probability_threshold:
        return 'Safe Minority High Conf'
    elif row['Zone'] == 'noise':
        return 'Noise Minority'
    else:
        return 'Other Minority'  # Should not happen


zones_filtered_df['Minority_Subgroup'] = zones_filtered_df.apply(categorize_minority_group_filtered, axis=1)


# --- Recalculate Weighted Average Confidence Ratio using the filtered confidence scores ---

# These three lists are indexed by position in X_filtered.
weighted_avg_ratios_filtered = []
weighted_avg_majority_confidences_filtered = []
weighted_avg_minority_confidences_filtered = []

for i in range(len(X_filtered)):
    maj_avg_conf, min_avg_conf, ratio = calculate_weighted_average_confidence_ratio(
        i, X_filtered_np, y_filtered, k_for_zones_filtered, filtered_confidence_scores
    )
    weighted_avg_majority_confidences_filtered.append(maj_avg_conf)
    weighted_avg_minority_confidences_filtered.append(min_avg_conf)
    weighted_avg_ratios_filtered.append(ratio)

# Look each value up through Filtered_Index so it lands on the matching row.
zones_filtered_df['Weighted_Average_Majority_Neighbor_Confidence'] = [
    weighted_avg_majority_confidences_filtered[pos] for pos in row_filtered_positions
]
zones_filtered_df['Weighted_Average_Minority_Neighbor_Confidence'] = [
    weighted_avg_minority_confidences_filtered[pos] for pos in row_filtered_positions
]
zones_filtered_df['Weighted_Average_Confidence_Ratio'] = [
    weighted_avg_ratios_filtered[pos] for pos in row_filtered_positions
]


# Alias under the name the subsequent steps use
final_filtered_info_df = zones_filtered_df

# Reorder columns for clarity
final_filtered_info_df = final_filtered_info_df[[
    'Filtered_Index',
    'Original_Train_Index',
    'Original_Class',
    'Zone',
    'Minority_Subgroup',
    'Individual_Confidence_Minority',
    'Individual_Confidence_Majority',
    'Weighted_Average_Majority_Neighbor_Confidence',
    'Weighted_Average_Minority_Neighbor_Confidence',
    'Weighted_Average_Confidence_Ratio'
]]


# Display the first few rows and the count of points in each zone
display(final_filtered_info_df.head())

print("\nFiltered data zone distribution:")
print(final_filtered_info_df['Zone'].value_counts())

print("\nFiltered data minority subgroup distribution:")
print(final_filtered_info_df['Minority_Subgroup'].value_counts())


print("\nColumns in final_filtered_info_df:")
print(final_filtered_info_df.columns.tolist())

### Analyzing Minority Zone Counts in Filtered Data

In [None]:
# NOT REQUIRED (this step is intentionally skipped)

### Counting Safe Minority Points with Low Confidence

In [None]:
# NOT REQUIRED (this step is intentionally skipped)

### Allocating Synthetic Samples Based on Inverse Ratio

In [None]:
# NOT REQUIRED (this step is intentionally skipped)

### Allocating Synthetic Samples Based on Inverse Density and Confidence

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean # Import euclidean for Kth_Neighbor_Distance calculation

# Inputs expected from earlier cells:
#   final_filtered_info_df              - per-point info with recalculated confidence
#   X_filtered                          - filtered feature matrix (NumPy array)
#   probability_threshold               - confidence cut-off for "low confidence"
#   oversample_from_safe_low_conf_count - synthetic samples for the safe low-confidence group
#   oversample_from_dangerous_count     - synthetic samples for the dangerous group
# The samples allocated to each group are distributed across that group's points
# in proportion to the inverse of each point's k-th nearest-neighbour distance.


def _kth_neighbor_distance(point, data, k):
    """Return the distance from ``point`` to its k-th nearest neighbour in ``data``.

    ``point`` is expected to be a row of ``data``, so its zero self-distance is
    skipped by taking position ``k`` of the sorted distances. If no zero distance
    is present, position ``k - 1`` is used instead. When ``k`` is not smaller than
    the number of rows, the largest non-zero distance is returned (0.0 if none).
    """
    distances = np.array([euclidean(point, other) for other in data])
    if k < len(data):
        sorted_distances = np.sort(distances)
        if np.any(sorted_distances == 0):
            return sorted_distances[k]
        return sorted_distances[k - 1]
    distances_excluding_self = distances[distances > 0]
    return np.max(distances_excluding_self) if len(distances_excluding_self) > 0 else 0.0


def _allocate_by_inverse_distance(distances, total_samples, eps):
    """Split ``total_samples`` across points with weights proportional to 1 / (distance + eps).

    Returns ``(normalized_weights, samples_per_point)``. Rounded counts are adjusted
    one unit at a time, cycling through the points from highest to lowest weight,
    until the rounding difference is used up; a count is never pushed below zero.
    """
    inverse_distances = 1 / (distances + eps)
    weight_sum = np.sum(inverse_distances)
    if weight_sum > 0:
        normalized = inverse_distances / weight_sum
    else:
        normalized = np.zeros_like(inverse_distances)  # No weights if no points

    samples_per_point = np.round(normalized * total_samples).astype(int)
    difference = total_samples - np.sum(samples_per_point)

    if difference != 0 and len(samples_per_point) > 0:
        by_weight_desc = np.argsort(normalized)[::-1]
        for step in range(abs(difference)):
            target = by_weight_desc[step % len(by_weight_desc)]
            if difference > 0:
                samples_per_point[target] += 1
            elif samples_per_point[target] > 0:
                samples_per_point[target] -= 1

    return normalized, samples_per_point


k_for_density = 4 # Use the same k as for zone classification or adjust as needed
epsilon = 1e-6 # Small value to avoid division by zero

# --- Safe Minority (< Threshold): k-th neighbour distance and allocation ---
safe_minority_low_conf_df = final_filtered_info_df[
    (final_filtered_info_df['Original_Class'] == 'Minority') &
    (final_filtered_info_df['Zone'] == 'safe') &
    (final_filtered_info_df['Individual_Confidence_Minority'] < probability_threshold)
].copy() # Use .copy() to avoid SettingWithCopyWarning

kth_neighbor_distances_safe = [
    _kth_neighbor_distance(X_filtered[int(filtered_idx)], X_filtered, k_for_density)
    for filtered_idx in safe_minority_low_conf_df['Filtered_Index']
]
safe_minority_low_conf_df['Kth_Neighbor_Distance'] = kth_neighbor_distances_safe

distances_safe = safe_minority_low_conf_df['Kth_Neighbor_Distance'].values
total_synthetic_from_safe_low_conf_group = oversample_from_safe_low_conf_count
normalized_weights_safe, num_samples_to_generate_per_point_safe = _allocate_by_inverse_distance(
    distances_safe, total_synthetic_from_safe_low_conf_group, epsilon
)
safe_minority_low_conf_df['Samples_to_Generate'] = num_samples_to_generate_per_point_safe

print(f"Total synthetic samples to generate from Safe Minority (< Threshold) group: {total_synthetic_from_safe_low_conf_group}")
print(f"Total samples allocated based on density for Safe Minority (< Threshold): {np.sum(safe_minority_low_conf_df['Samples_to_Generate'])}")


# --- Dangerous Minority: k-th neighbour distance and allocation ---
dangerous_minority_df = final_filtered_info_df[
    (final_filtered_info_df['Original_Class'] == 'Minority') &
    (final_filtered_info_df['Zone'] == 'dangerous')
].copy() # Use .copy() to avoid SettingWithCopyWarning

kth_neighbor_distances_dangerous = [
    _kth_neighbor_distance(X_filtered[int(filtered_idx)], X_filtered, k_for_density)
    for filtered_idx in dangerous_minority_df['Filtered_Index']
]
dangerous_minority_df['Kth_Neighbor_Distance'] = kth_neighbor_distances_dangerous

distances_dangerous = dangerous_minority_df['Kth_Neighbor_Distance'].values
total_synthetic_from_dangerous_group = oversample_from_dangerous_count
normalized_weights_dangerous, num_samples_to_generate_per_point_dangerous = _allocate_by_inverse_distance(
    distances_dangerous, total_synthetic_from_dangerous_group, epsilon
)
dangerous_minority_df['Samples_to_Generate'] = num_samples_to_generate_per_point_dangerous

print(f"Total synthetic samples to generate from Dangerous Minority group: {total_synthetic_from_dangerous_group}")
print(f"Total samples allocated based on density for Dangerous Minority: {np.sum(dangerous_minority_df['Samples_to_Generate'])}")


# --- Reporting ---
print("\nFirst 5 safe minority points (< Threshold) with their allocated samples to generate:")
display(safe_minority_low_conf_df[['Filtered_Index', 'Original_Train_Index', 'Zone', 'Kth_Neighbor_Distance', 'Samples_to_Generate']].head())

print("\nDistribution of samples to generate across safe minority points (< Threshold):")
allocation_counts_safe = safe_minority_low_conf_df['Samples_to_Generate'].value_counts()
print(allocation_counts_safe[allocation_counts_safe > 0])

print("\nFirst 5 dangerous minority points with their allocated samples to generate:")
display(dangerous_minority_df[['Filtered_Index', 'Original_Train_Index', 'Zone', 'Kth_Neighbor_Distance', 'Samples_to_Generate']].head())

print("\nDistribution of samples to generate across dangerous minority points:")
allocation_counts_dangerous = dangerous_minority_df['Samples_to_Generate'].value_counts()
print(allocation_counts_dangerous[allocation_counts_dangerous > 0])

### Identifying Target Minority Groups for Clustering

In [None]:
# Requires final_filtered_info_df and probability_threshold from previous cells.

# Row filters shared by the three subgroup selections below. Confidence values
# are the ones recalculated on the filtered data.
_is_minority_row = final_filtered_info_df['Original_Class'] == 'Minority'
_in_dangerous_zone = final_filtered_info_df['Zone'] == 'dangerous'
_in_safe_zone = final_filtered_info_df['Zone'] == 'safe'
_minority_confidence = final_filtered_info_df['Individual_Confidence_Minority']

# Dangerous Minority points
dangerous_minority_points_info = final_filtered_info_df[
    _is_minority_row & _in_dangerous_zone
].copy()

# Safe Minority points with confidence below the threshold
safe_minority_low_conf_points_info = final_filtered_info_df[
    _is_minority_row & _in_safe_zone & (_minority_confidence < probability_threshold)
].copy()

# Safe Minority points with confidence at or above the threshold
safe_minority_high_conf_points_info = final_filtered_info_df[
    _is_minority_row & _in_safe_zone & (_minority_confidence >= probability_threshold)
].copy()


# Stack the three groups (dangerous, safe/low, safe/high) with a fresh 0..n-1
# index. The Filtered_Index column is what links each row back to X_filtered.
target_minority_points_info_three_groups = pd.concat(
    [
        dangerous_minority_points_info,
        safe_minority_low_conf_points_info,
        safe_minority_high_conf_points_info,
    ],
    ignore_index=True,
)


def categorize_minority_group(row):
    """Return which of the three target subgroups a minority row belongs to.

    The label depends on the row's 'Zone' and, for safe points, on how
    'Individual_Confidence_Minority' compares with the global
    ``probability_threshold``. Anything else is labelled 'Other Minority'.
    """
    zone = row['Zone']
    if zone == 'dangerous':
        return 'Dangerous Minority'
    if zone == 'safe':
        confidence = row['Individual_Confidence_Minority']
        if confidence < probability_threshold:
            return 'Safe Minority Low Conf'
        if confidence >= probability_threshold:
            return 'Safe Minority High Conf'
    return 'Other Minority'  # Should not happen for filtered minority points


target_minority_points_info_three_groups['Minority_Subgroup'] = (
    target_minority_points_info_three_groups.apply(categorize_minority_group, axis=1)
)


print(f"Number of Dangerous Minority points: {len(dangerous_minority_points_info)}")
print(f"Number of Safe Minority with Low Confidence points: {len(safe_minority_low_conf_points_info)}")
print(f"Number of Safe Minority with High Confidence points: {len(safe_minority_high_conf_points_info)}")
print(f"Total target minority points for clustering: {len(target_minority_points_info_three_groups)}")

# How the target points split across the three subgroups
print("\nDistribution of minority points across the three target subgroups:")
print(target_minority_points_info_three_groups['Minority_Subgroup'].value_counts())


# Preview of the combined table
print("\nCombined Target Minority Points Info (first 5 rows):")
display(target_minority_points_info_three_groups.head())

### Preparing Data for Clustering

In [None]:
# Requires target_minority_points_info_three_groups and X_filtered from previous steps.

# Positions of the target minority points inside X_filtered
target_minority_filtered_indices_three_groups = (
    target_minority_points_info_three_groups['Filtered_Index'].to_numpy()
)

# Feature rows for those points, in the same order as the info DataFrame
X_target_minority_three_groups = X_filtered[target_minority_filtered_indices_three_groups]

print(f"Shape of X_target_minority_three_groups (features for clustering): {X_target_minority_three_groups.shape}")

### Applying Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt # Import matplotlib for plotting the silhouette scores

# Requires X_target_minority_three_groups and target_minority_points_info_three_groups
# from previous steps.

# --- Determine Optimal Number of Clusters using Silhouette Score ---
# Candidate cluster counts run from 2 up to min(10, n_samples - 1): the silhouette
# score needs at least 2 clusters and fewer clusters than samples.
n_target_samples = X_target_minority_three_groups.shape[0]
max_clusters = min(10, n_target_samples - 1)

silhouette_scores = []
silhouette_avg = -1  # -1 marks "no meaningful score"

if max_clusters < 2:
    print("Not enough samples to perform clustering with at least 2 clusters.")
    optimal_n_clusters_three_groups = max_clusters  # Clamped to at least 1 below
    range_n_clusters = []
else:
    range_n_clusters = list(range(2, max_clusters + 1))

    print(f"Evaluating silhouette scores for n_clusters from 2 to {max_clusters}...")

    for n_clusters in range_n_clusters:
        agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = agg_clustering.fit_predict(X_target_minority_three_groups)

        # The silhouette score is only defined with more than one cluster and more than one sample.
        if len(np.unique(cluster_labels)) > 1 and len(X_target_minority_three_groups) > 1:
            score = silhouette_score(X_target_minority_three_groups, cluster_labels)
            silhouette_scores.append(score)
            print(f"  n_clusters = {n_clusters}, Silhouette Score = {score:.4f}")
        else:
            silhouette_scores.append(-1) # Sentinel so the list stays aligned with range_n_clusters
            print(f"  n_clusters = {n_clusters}, Silhouette Score not applicable (not enough clusters or samples)")

    # Pick the candidate with the highest score, ignoring the -1 sentinels.
    valid_scores = [score for score in silhouette_scores if score > -1]
    if valid_scores:
        silhouette_avg = max(valid_scores)
        optimal_cluster_index = silhouette_scores.index(silhouette_avg)
        optimal_n_clusters_three_groups = range_n_clusters[optimal_cluster_index]
        print(f"\nOptimal number of clusters based on silhouette score: {optimal_n_clusters_three_groups} (Silhouette Score: {silhouette_avg:.4f})")
    else:
        # No candidate produced a usable score: fall back to a default cluster count.
        optimal_n_clusters_three_groups = min(5, n_target_samples)
        print(f"\nCould not calculate valid silhouette scores, using default n_clusters = {optimal_n_clusters_three_groups}")

    # Plot the silhouette scores
    if valid_scores:
        plt.figure(figsize=(8, 5))
        plt.plot(range_n_clusters, silhouette_scores, marker='o')
        plt.title("Silhouette Scores for Various Numbers of Clusters")
        plt.xlabel("Number of Clusters (n_clusters)")
        plt.ylabel("Silhouette Score")
        plt.xticks(range_n_clusters)
        plt.grid(True)
        plt.show()


# --- Apply Agglomerative Clustering with the Optimal Number of Clusters ---

# Cannot have 0 (or a negative number of) clusters
if optimal_n_clusters_three_groups < 1:
    optimal_n_clusters_three_groups = 1

agg_clustering_three_groups = AgglomerativeClustering(n_clusters=optimal_n_clusters_three_groups, linkage='ward')

if n_target_samples >= 2:
    cluster_labels_three_groups = agg_clustering_three_groups.fit_predict(X_target_minority_three_groups)
    print(f"\nApplied Agglomerative Clustering with n_clusters = {optimal_n_clusters_three_groups} (Optimal)")
else:
    # AgglomerativeClustering rejects inputs with fewer than two samples, so the
    # estimator is left unfitted and every point (if any) goes into cluster 0.
    cluster_labels_three_groups = np.zeros(n_target_samples, dtype=int)
    print("\nFewer than 2 target minority points: skipped clustering and assigned all points to cluster 0")

# Store the cluster assignment next to the information about each point
target_minority_points_info_three_groups['Cluster_Label'] = cluster_labels_three_groups

print("\nFirst 5 target minority points with their assigned cluster labels:")
display(target_minority_points_info_three_groups.head())

print("\nDistribution of points across clusters:")
print(target_minority_points_info_three_groups['Cluster_Label'].value_counts().sort_index())

# A dendrogram (scipy.cluster.hierarchy) could help choose the number of clusters,
# but it becomes very large for many points, so it is not drawn here.

### Analyzing Cluster Composition and Density (density is currently computed but not used)

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean # Needed for distance calculation

# Inputs expected from earlier cells:
#   - target_minority_points_info_three_groups: DataFrame with the columns
#     'Filtered_Index', 'Cluster_Label' and 'Minority_Subgroup'
#   - X_filtered: feature array indexed by 'Filtered_Index'
#     (assumed to be a 2-D numeric NumPy array -- TODO confirm)


def _mean_nonzero_distance(point, pool):
    """Return the mean Euclidean distance from `point` to the rows of `pool`.

    Zero distances (the point itself and any exact duplicates of it) are left
    out of the mean. Returns 0.0 when no positive distance exists, rather than
    the NaN that averaging an empty selection would produce.
    """
    distances = np.linalg.norm(pool - point, axis=1)
    positive = distances[distances > 0]
    return positive.mean() if positive.size else 0.0


# Per-cluster summary, keyed by cluster label
cluster_density_info = {}
k_for_density_calculation = 4  # Not used by the calculations in this cell; kept so the name stays defined

# Small constants that keep the inverse-distance divisions finite
epsilon_density_centroid = 1e-8
epsilon_density = 1e-8

# Float view of the filtered data used for all distance computations
X_density_pool = np.asarray(X_filtered, dtype=float)

# Feature rows of all clustered minority points
X_target_minority_three_groups = X_filtered[target_minority_points_info_three_groups['Filtered_Index']]

for cluster_label in sorted(target_minority_points_info_three_groups['Cluster_Label'].unique()):
    points_in_cluster_info = target_minority_points_info_three_groups[
        target_minority_points_info_three_groups['Cluster_Label'] == cluster_label
    ]
    total_points_in_cluster = len(points_in_cluster_info)

    if total_points_in_cluster == 0:
        # Defensive only: labels are taken from the same DataFrame, so a
        # cluster should never be empty here.
        cluster_density_info[cluster_label] = {
            'Total_Points': 0,
            'Dangerous_Count': 0,
            'Safe_Low_Conf_Count': 0,
            'Safe_High_Conf_Count': 0,
            'Dangerous_Proportion': 0.0,
            'Safe_Low_Conf_Proportion': 0.0,
            'Safe_High_Conf_Proportion': 0.0,
            'Inverse_Density_All_Filtered': 0.0,
            'Average_Distance_to_Centroid': 0.0,
            'Inverse_Average_Distance_to_Centroid': 0.0,
        }
        continue

    X_cluster = X_density_pool[points_in_cluster_info['Filtered_Index'].to_numpy()]

    # --- Compactness: mean distance of the members to the cluster centroid ---
    cluster_centroid = X_cluster.mean(axis=0)
    distances_to_centroid = np.linalg.norm(X_cluster - cluster_centroid, axis=1)
    average_distance_to_centroid = distances_to_centroid.mean()
    inverse_average_distance_to_centroid = 1 / (average_distance_to_centroid + epsilon_density_centroid)

    # --- Density relative to the whole filtered dataset ---
    # For every member: mean positive distance to all filtered points; the
    # cluster value is the mean of those per-member means.
    avg_distances_to_all_filtered = [
        _mean_nonzero_distance(point, X_density_pool) for point in X_cluster
    ]
    overall_avg_distance_for_cluster = np.mean(avg_distances_to_all_filtered)
    if overall_avg_distance_for_cluster > 0:
        inverse_density_all_filtered = 1 / (overall_avg_distance_for_cluster + epsilon_density)
    else:
        inverse_density_all_filtered = 0.0

    # --- Composition: how many members fall in each minority subgroup ---
    subgroups = points_in_cluster_info['Minority_Subgroup']
    dangerous_count = int((subgroups == 'Dangerous Minority').sum())
    safe_low_conf_count = int((subgroups == 'Safe Minority Low Conf').sum())
    safe_high_conf_count = int((subgroups == 'Safe Minority High Conf').sum())

    cluster_density_info[cluster_label] = {
        'Total_Points': total_points_in_cluster,
        'Dangerous_Count': dangerous_count,
        'Safe_Low_Conf_Count': safe_low_conf_count,
        'Safe_High_Conf_Count': safe_high_conf_count,
        'Dangerous_Proportion': dangerous_count / total_points_in_cluster,
        'Safe_Low_Conf_Proportion': safe_low_conf_count / total_points_in_cluster,
        'Safe_High_Conf_Proportion': safe_high_conf_count / total_points_in_cluster,
        'Inverse_Density_All_Filtered': inverse_density_all_filtered,
        'Average_Distance_to_Centroid': average_distance_to_centroid,
        'Inverse_Average_Distance_to_Centroid': inverse_average_distance_to_centroid,
    }


# One row per cluster, with the cluster label as an ordinary column
cluster_analysis_df = pd.DataFrame.from_dict(cluster_density_info, orient='index').reset_index().rename(columns={'index': 'Cluster_Label'})

print("\nCluster Composition and Density Analysis (using average distance to all filtered points as density):")
display(cluster_analysis_df)

### Calculating Cluster Allocation Weights

In [None]:
# Decide how many synthetic samples each cluster should contribute.
#
# Inputs expected from earlier cells:
#   - cluster_analysis_df: one row per cluster (composition and density columns)
#   - target_minority_points_info_three_groups: per-point 'Minority_Subgroup'
#   - synthetic_samples_needed: total number of synthetic samples to create
#
# The allocation weight of a cluster is its share of the "harder" minority
# points (Dangerous + Safe Low Conf). The normalised inverse density is stored
# for inspection only and does not influence the allocation.

# Keeps the share-of-harder-points division finite when no harder points exist
epsilon_norm = 1e-8

# --- Normalised centroid-based inverse density (diagnostic column) ---
total_inverse_density = cluster_analysis_df['Inverse_Average_Distance_to_Centroid'].sum()
if total_inverse_density > 0:
    cluster_analysis_df['Normalized_Inverse_Density'] = cluster_analysis_df['Inverse_Average_Distance_to_Centroid'] / total_inverse_density
else:
    cluster_analysis_df['Normalized_Inverse_Density'] = 0.0

# Fraction of each cluster's own members that are "harder" points
cluster_analysis_df['Harder_Proportion_in_Cluster'] = cluster_analysis_df['Dangerous_Proportion'] + cluster_analysis_df['Safe_Low_Conf_Proportion']

# Total number of "harder" points over all target minority points
total_dangerous_points = target_minority_points_info_three_groups[target_minority_points_info_three_groups['Minority_Subgroup'] == 'Dangerous Minority'].shape[0]
total_safe_low_conf_points = target_minority_points_info_three_groups[target_minority_points_info_three_groups['Minority_Subgroup'] == 'Safe Minority Low Conf'].shape[0]
total_harder_minority_points = total_dangerous_points + total_safe_low_conf_points

# Share of all "harder" points that lies in each cluster
cluster_analysis_df['Proportion_of_Total_Harder'] = (cluster_analysis_df['Dangerous_Count'] + cluster_analysis_df['Safe_Low_Conf_Count']) / (total_harder_minority_points + epsilon_norm)

# Allocation weight = share of the "harder" points, nothing else
allocation_weights = cluster_analysis_df['Proportion_of_Total_Harder'].tolist()
cluster_analysis_df['Combined_Allocation_Weight'] = allocation_weights

# Rescale the weights so they sum to 1
total_combined_weight = cluster_analysis_df['Combined_Allocation_Weight'].sum()
if total_combined_weight > 0:
    cluster_analysis_df['Normalized_Combined_Allocation_Weight'] = cluster_analysis_df['Combined_Allocation_Weight'] / total_combined_weight
else:
    cluster_analysis_df['Normalized_Combined_Allocation_Weight'] = 0.0

# Fall back to "nothing to generate" when the sample budget was never computed
if 'synthetic_samples_needed' not in locals():
    print("Warning: 'synthetic_samples_needed' not found. Please run the cell calculating synthetic sample needs.")
    synthetic_samples_needed = 0

cluster_analysis_df['Samples_to_Generate_in_Cluster'] = np.round(cluster_analysis_df['Normalized_Combined_Allocation_Weight'] * synthetic_samples_needed).astype(int)

# --- Repair rounding drift so the per-cluster counts add up to the budget ---
total_generated_in_clusters = cluster_analysis_df['Samples_to_Generate_in_Cluster'].sum()
difference_in_allocation = synthetic_samples_needed - total_generated_in_clusters

if difference_in_allocation != 0 and len(cluster_analysis_df) > 0:
    # Row POSITIONS ordered from highest to lowest weight. The sort is done on
    # a plain NumPy array: indexing the result of Series.argsort() with an
    # integer is a label lookup, which would hand the correction to the
    # lowest-weight clusters first.
    positions_by_descending_weight = np.argsort(
        cluster_analysis_df['Normalized_Combined_Allocation_Weight'].to_numpy()
    )[::-1]
    samples_column = cluster_analysis_df.columns.get_loc('Samples_to_Generate_in_Cluster')

    for i in range(abs(difference_in_allocation)):
        # Cycle through the clusters when the drift exceeds their number
        row_position = positions_by_descending_weight[i % len(positions_by_descending_weight)]
        if difference_in_allocation > 0:
            cluster_analysis_df.iloc[row_position, samples_column] += 1
        elif cluster_analysis_df.iloc[row_position, samples_column] > 0:
            # Never push a cluster's count below zero
            cluster_analysis_df.iloc[row_position, samples_column] -= 1


print("\nCluster Analysis with Allocation Weights and Samples to Generate:")
display(cluster_analysis_df)

print(f"\nTotal synthetic samples to generate: {synthetic_samples_needed}")
print(f"Total samples allocated to clusters: {cluster_analysis_df['Samples_to_Generate_in_Cluster'].sum()}")

### Synthetic Sample Generation and Construction of the Balanced Dataset

In [None]:
from scipy.spatial.distance import euclidean
import numpy as np
import pandas as pd # Import pandas to work with DataFrames if needed
# from imblearn.under_sampling import RandomUnderSampler # Consider if a more robust undersampling method is needed later

# Ensure X_filtered, y_filtered are available (filtered training data)
# Ensure synthetic_samples_needed is available (total number of minority samples to generate)
# Ensure final_filtered_info_df is available (contains zone and NOW updated confidence info for all filtered points)
# Ensure probability_threshold and ratio_threshold are available
# Ensure target_minority_points_info_three_groups is available (from Plan Step 1, contains Cluster_Label and Minority_Subgroup)
# Ensure cluster_analysis_df is available (from Plan Step 5, contains Samples_to_Generate_in_Cluster)


# Number of nearest neighbours considered when choosing an interpolation partner
k_for_synthesis_neighbors = 3

# Small constant added to the denominators of the alpha formulas
epsilon = 1e-6

# Accumulators filled only on the oversampling path; every label appended is 1 (minority)
synthetic_samples, synthetic_labels = [], []

print(f"Processing data balancing...")

# --- Conditional Balancing: Undersampling or Oversampling ---

# --- Helpers for the balancing logic below ---
# Label convention used throughout this cell: 1 = minority, 0 = majority.

def _distances_to_all(point, pool):
    """Return the Euclidean distance from `point` to every row of `pool`."""
    return np.linalg.norm(pool - point, axis=1)


def _kth_neighbor_distance(distances, k):
    """Return the distance to the k-th nearest other point.

    `distances` must contain the point's own zero distance, which sits at
    position 0 once sorted, so position k is the k-th neighbour. When the pool
    holds k points or fewer, the largest positive distance is returned
    (0.0 if there is none).
    """
    if k < len(distances):
        return np.sort(distances)[k]
    positive = distances[distances > 0]
    return np.max(positive) if len(positive) > 0 else 0.0


def _nearest_neighbor_indices(distances, self_index, k):
    """Return the indices of the k nearest points, never including `self_index`.

    The source is removed by index rather than by assuming it is first in the
    sorted order, so it cannot be picked as its own neighbour when exact
    duplicates (distance 0) exist.
    """
    order = np.argsort(distances)
    return order[order != self_index][:k]


def _normalize_to_unit_sum(values):
    """Scale `values` to sum to 1; return zeros when the sum is not positive."""
    total = np.sum(values)
    if total > 0:
        return values / total
    return np.zeros_like(values)


def _round_counts_to_total(weights, total):
    """Turn weights (summing to 1) into integer counts that add up to `total`.

    Counts are rounded first; the rounding drift is then handed out one unit
    at a time, starting from the highest weight and cycling if necessary. A
    count is never decremented below zero.
    """
    counts = np.round(weights * total).astype(int)
    drift = total - np.sum(counts)
    if drift != 0 and len(counts) > 0:
        ranked = np.argsort(weights)[::-1]
        for i in range(abs(drift)):
            target = ranked[i % len(ranked)]
            if drift > 0:
                counts[target] += 1
            elif counts[target] > 0:
                counts[target] -= 1
    return counts


def _neighbor_minority_subgroup(original_class, zone, confidence_minority, threshold):
    """Return the minority subgroup name of a neighbour, or None.

    Only neighbours whose original class is 'Minority' receive a subgroup.
    """
    if original_class != 'Minority':
        return None
    if zone == 'dangerous':
        return 'Dangerous Minority'
    if zone == 'safe' and confidence_minority < threshold:
        return 'Safe Minority Low Conf'
    if zone == 'safe' and confidence_minority >= threshold:
        return 'Safe Minority High Conf'
    if zone == 'noise':
        return 'Noise Minority'
    return None


def _adaptive_alpha(source_subgroup, neighbor_class, neighbor_subgroup,
                    uncertainty_source, uncertainty_neighbor, eps):
    """Return the interpolation factor for a source/neighbour pair, or None.

    Rule 1 (halved step): alpha = u_s / (2 * (u_s + u_n + eps))
        - Dangerous source with a Majority or Noise Minority neighbour
        - Safe Low Conf source with a Majority, Noise Minority or
          Safe High Conf neighbour
    Rule 2 (full step): alpha = u_s / (u_s + u_n + eps)
        - Dangerous source with a Dangerous / Safe Low / Safe High neighbour
        - Safe Low Conf source with a Dangerous / Safe Low neighbour
        - Safe High Conf source with any Majority or classified Minority neighbour
    None is returned when neither rule applies.
    """
    neighbor_is_majority = neighbor_class == 'Majority'

    if source_subgroup == 'Dangerous Minority':
        rule1 = neighbor_is_majority or neighbor_subgroup == 'Noise Minority'
        rule2 = neighbor_subgroup in ('Safe Minority Low Conf', 'Safe Minority High Conf', 'Dangerous Minority')
    elif source_subgroup == 'Safe Minority Low Conf':
        rule1 = neighbor_is_majority or neighbor_subgroup in ('Safe Minority High Conf', 'Noise Minority')
        rule2 = neighbor_subgroup in ('Dangerous Minority', 'Safe Minority Low Conf')
    elif source_subgroup == 'Safe Minority High Conf':
        rule1 = False
        rule2 = neighbor_is_majority or neighbor_subgroup in (
            'Dangerous Minority', 'Noise Minority', 'Safe Minority Low Conf', 'Safe Minority High Conf'
        )
    else:
        rule1 = rule2 = False

    if rule1:
        return uncertainty_source / (2 * (uncertainty_source + uncertainty_neighbor + eps))
    if rule2:
        return uncertainty_source / (uncertainty_source + uncertainty_neighbor + eps)
    return None


# --- Conditional balancing: undersample, oversample, or leave untouched ---

if synthetic_samples_needed < 0:
    # --- Undersampling: the minority class outnumbers the majority class ---
    print(f"Minority class ({len(y_filtered[y_filtered == 1])}) is larger than Majority class ({len(y_filtered[y_filtered == 0])}) in filtered data.")
    print(f"Undersampling minority class to match majority count: {target_minority_count}")  # target_minority_count comes from an earlier cell

    minority_filtered_indices = np.where(y_filtered == 1)[0]

    # Never try to remove more minority samples than exist
    num_minority_to_remove = abs(synthetic_samples_needed)
    if num_minority_to_remove > len(minority_filtered_indices):
        print(f"Warning: Attempted to remove {num_minority_to_remove} minority samples, but only {len(minority_filtered_indices)} exist. Removing all minority samples.")
        num_minority_to_remove = len(minority_filtered_indices)

    # Drop a random subset of the minority rows (without replacement)
    indices_to_remove = np.random.choice(minority_filtered_indices, size=num_minority_to_remove, replace=False)
    keep_mask_undersampling = np.isin(np.arange(len(X_filtered)), indices_to_remove, invert=True)

    X_balanced_custom = X_filtered[keep_mask_undersampling]
    y_balanced_custom = y_filtered[keep_mask_undersampling]

    print(f"Undersampling complete. New data shape: {X_balanced_custom.shape}")


elif synthetic_samples_needed > 0:
    # --- Oversampling: cluster-based allocation, then per-point interpolation ---
    print(f"Minority class ({len(y_filtered[y_filtered == 1])}) is smaller than Majority class ({len(y_filtered[y_filtered == 0])}) in filtered data.")
    print(f"Generating {synthetic_samples_needed} synthetic samples to balance classes using cluster-based allocation.")

    if 'cluster_analysis_df' not in locals() or 'Samples_to_Generate_in_Cluster' not in cluster_analysis_df.columns:
        print("Error: Cluster analysis results with sample allocation not found. Please run Plan Steps 1-5 first.")
    else:
        k_for_density_calculation = 4  # Neighbour rank used for the per-point density estimate
        epsilon_confidence = 1e-6  # Keeps the inverse-confidence division finite
        epsilon_density_allocation = 1e-6  # Keeps the inverse-distance division finite

        # Float view of the filtered data used for distance computations
        # (assumes X_filtered is a 2-D numeric array -- TODO confirm)
        X_distance_pool = np.asarray(X_filtered, dtype=float)

        # Per-point metadata addressable by filtered index; if an index occurs
        # more than once, the first row wins.
        info_by_filtered_index = final_filtered_info_df.drop_duplicates(subset='Filtered_Index').set_index('Filtered_Index')

        for index, cluster_row in cluster_analysis_df.iterrows():
            cluster_label = cluster_row['Cluster_Label']
            num_samples_to_generate_from_cluster = int(cluster_row['Samples_to_Generate_in_Cluster'])
            if num_samples_to_generate_from_cluster <= 0:
                continue

            print(f"Generating {num_samples_to_generate_from_cluster} samples from Cluster {cluster_label}...")

            # Copy so the extra columns added below do not touch the source frame
            points_in_cluster_info = target_minority_points_info_three_groups[
                target_minority_points_info_three_groups['Cluster_Label'] == cluster_label
            ].copy()

            if 'Individual_Confidence_Minority' not in points_in_cluster_info.columns:
                print("Error: 'Individual_Confidence_Minority' not found in points_in_cluster_info. Rerun previous steps.")
                continue

            member_indices = points_in_cluster_info['Filtered_Index'].astype(int).to_numpy()

            # One distance computation per member gives both its density
            # estimate and its candidate interpolation partners.
            kth_neighbor_distances_cluster = []
            neighbors_by_source = {}
            for filtered_idx in member_indices:
                distances = _distances_to_all(X_distance_pool[filtered_idx], X_distance_pool)
                kth_neighbor_distances_cluster.append(
                    _kth_neighbor_distance(distances, k_for_density_calculation)
                )
                neighbors_by_source[int(filtered_idx)] = _nearest_neighbor_indices(
                    distances, filtered_idx, k_for_synthesis_neighbors
                )

            points_in_cluster_info['Kth_Neighbor_Distance'] = kth_neighbor_distances_cluster
            points_in_cluster_info['Inverse_Minority_Confidence'] = 1 / (points_in_cluster_info['Individual_Confidence_Minority'] + epsilon_confidence)

            # --- Per-point allocation: mean of normalised inverse distance and inverse confidence ---
            inverse_distances_cluster = 1 / (points_in_cluster_info['Kth_Neighbor_Distance'].values + epsilon_density_allocation)
            inverse_confidences_cluster = points_in_cluster_info['Inverse_Minority_Confidence'].values

            allocation_weights_cluster = (
                _normalize_to_unit_sum(inverse_distances_cluster)
                + _normalize_to_unit_sum(inverse_confidences_cluster)
            ) / 2
            normalized_combined_weights_cluster = _normalize_to_unit_sum(allocation_weights_cluster)

            num_samples_to_generate_per_point_cluster = _round_counts_to_total(
                normalized_combined_weights_cluster, num_samples_to_generate_from_cluster
            )
            points_in_cluster_info['Samples_to_Generate_From_Point'] = num_samples_to_generate_per_point_cluster

            # Each source index appears once per sample it has to produce
            source_point_indices_in_filtered_pool = []
            for filtered_idx, sample_count in zip(member_indices, num_samples_to_generate_per_point_cluster):
                source_point_indices_in_filtered_pool.extend([int(filtered_idx)] * int(sample_count))

            # Randomise the generation order
            np.random.shuffle(source_point_indices_in_filtered_pool)

            print(f"  Generating {len(source_point_indices_in_filtered_pool)} samples from {len(points_in_cluster_info)} source points in Cluster {cluster_label}...")

            for source_filtered_idx in source_point_indices_in_filtered_pool:
                available_neighbors_indices = neighbors_by_source[source_filtered_idx]
                if len(available_neighbors_indices) == 0:
                    # No other point exists to interpolate towards
                    continue

                neighbor_filtered_idx = np.random.choice(available_neighbors_indices)
                source_point = X_filtered[source_filtered_idx]
                neighbor_point = X_filtered[neighbor_filtered_idx]

                if source_filtered_idx not in info_by_filtered_index.index:
                    print(f"Warning: Could not find info for source point index {source_filtered_idx}. Skipping.")
                    continue
                source_info = info_by_filtered_index.loc[source_filtered_idx]
                source_minority_subgroup = source_info['Minority_Subgroup']
                source_individual_confidence_minority = source_info['Individual_Confidence_Minority']

                if neighbor_filtered_idx not in info_by_filtered_index.index:
                    print(f"Warning: Could not find info for neighbor point index {neighbor_filtered_idx}. Skipping.")
                    continue
                neighbor_info = info_by_filtered_index.loc[neighbor_filtered_idx]
                neighbor_original_class = neighbor_info['Original_Class']
                neighbor_individual_confidence_minority = neighbor_info['Individual_Confidence_Minority']
                neighbor_minority_subgroup = _neighbor_minority_subgroup(
                    neighbor_original_class,
                    neighbor_info['Zone'],
                    neighbor_individual_confidence_minority,
                    probability_threshold,
                )

                # Uncertainty is always measured against the minority-class
                # confidence, for the source and for the neighbour alike.
                uncertainty_source = 1 - source_individual_confidence_minority
                uncertainty_neighbor = 1 - neighbor_individual_confidence_minority

                alpha = _adaptive_alpha(
                    source_minority_subgroup,
                    neighbor_original_class,
                    neighbor_minority_subgroup,
                    uncertainty_source,
                    uncertainty_neighbor,
                    epsilon,
                )
                if alpha is None:
                    # Neutral midpoint when the pair is not covered by any rule
                    alpha = 0.5
                    print(f"  Warning: No specific rule matched for Source: {source_minority_subgroup}, Neighbor: {neighbor_original_class}/{neighbor_minority_subgroup}. Using alpha = {alpha:.4f}")

                alpha = np.clip(alpha, 0, 1)

                # Linear interpolation from the source towards the neighbour
                synthetic_point = source_point + alpha * (neighbor_point - source_point)
                synthetic_samples.append(synthetic_point)
                synthetic_labels.append(1)

    print(f"Finished generating {len(synthetic_samples)} synthetic samples.")
    if len(synthetic_samples) != synthetic_samples_needed:
        # Skipped sources or a missing cluster analysis leave the classes unbalanced
        print(f"Warning: {synthetic_samples_needed} synthetic samples were requested but {len(synthetic_samples)} were generated; classes may remain imbalanced.")

    if synthetic_samples:
        X_synthetic_custom = np.array(synthetic_samples)
        y_synthetic_custom = np.array(synthetic_labels)

        # Append the synthetic minority rows to the filtered data
        X_balanced_custom = np.vstack((X_filtered, X_synthetic_custom))
        y_balanced_custom = np.hstack((y_filtered, y_synthetic_custom))

        print(f"\nCombined oversampled data shape: {X_balanced_custom.shape}")

    else:
        print("\nNo synthetic samples generated. X_balanced_custom and y_balanced_custom are the same as X_filtered and y_filtered.")
        X_balanced_custom = X_filtered
        y_balanced_custom = y_filtered


else:  # synthetic_samples_needed == 0
    # --- Classes already have equal counts: pass the filtered data through ---
    print(f"Filtered data is already balanced or has equal class counts ({len(y_filtered[y_filtered == 0])} Majority, {len(y_filtered[y_filtered == 1])} Minority). No balancing applied.")
    X_balanced_custom = X_filtered
    y_balanced_custom = y_filtered
    print(f"Balanced data shape (same as filtered): {X_balanced_custom.shape}")


# Report how many samples each class has after balancing
unique_balanced, counts_balanced = np.unique(y_balanced_custom, return_counts=True)
balanced_class_distribution = {
    class_label: class_count
    for class_label, class_count in zip(unique_balanced, counts_balanced)
}
print(f"\nBalanced data class distribution: {balanced_class_distribution}")

# Aliases under which the balanced dataset is also made available
X_oversampled_custom, y_oversampled_custom = X_balanced_custom, y_balanced_custom

### Evaluating Classifiers on Original, Filtered, and Balanced Data

In [None]:
from sklearn.metrics import recall_score, roc_auc_score, f1_score, matthews_corrcoef, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold # Import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier # Import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier # Import DecisionTreeClassifier
import numpy as np
from imblearn.metrics import geometric_mean_score
import pandas as pd

# Classifiers under comparison, keyed by the display name used in the result tables.
# Only RBF SVM, MLP and Decision Tree are evaluated; Random Forest, Linear SVM,
# Naive Bayes and KNN were removed from this comparison.
classifiers = {}
# probability=True lets the SVC supply class probabilities, which the ROC AUC
# computation in evaluate_classifier_cv uses when predict_proba is available.
classifiers["RBF SVM"] = SVC(probability=True, gamma='auto', random_state=42)
classifiers["MLP"] = MLPClassifier(max_iter=1000, random_state=42)
classifiers["Decision Tree"] = DecisionTreeClassifier(random_state=42)

# Metric names reported for every classifier (these are the keys of the result dicts)
metrics_to_report = [
    "Recall",
    "ROC AUC",
    "G-mean",
    "F1-score",
    "Balanced Accuracy",
]

# Stratified K-Fold splitter shared by all cross-validation runs;
# change n_splits to use a different number of folds.
n_splits = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)


# Function to evaluate a classifier using Stratified K-Fold
def evaluate_classifier_cv(clf, X_train_data, y_train_data, X_test_data, y_test_data, metrics_list, dataset_name, cv=None):
    """
    Evaluates a classifier using Stratified K-Fold Cross-Validation on the training data
    and then evaluates the model trained on the full training data on the test data.

    The passed-in classifier is never fitted; an unfitted copy is made for every
    fold and for the final full-data fit. Errors in a fold (or in the final fit)
    are printed and recorded as NaN instead of being raised.

    Args:
        clf: The classifier to evaluate (any scikit-learn compatible estimator).
        X_train_data (np.ndarray): The training data features. pandas objects are
            converted to arrays so that positional fold indices work.
        y_train_data (np.ndarray): The training data labels.
        X_test_data (np.ndarray): The test data features.
        y_test_data (np.ndarray): The test data labels.
        metrics_list (list): A list of metric names to calculate. Recognised names:
            "Recall", "ROC AUC", "G-mean", "F1-score", "Balanced Accuracy".
        dataset_name (str): Name of the dataset (e.g., 'Original', 'Filtered').
        cv: Optional cross-validation splitter exposing ``split`` and
            ``get_n_splits``. Defaults to the module-level ``skf``.

    Returns:
        tuple: A tuple containing:
            - cv_results (dict): Dictionary of average CV scores for each metric
              (NaN when no fold produced a score for that metric).
            - test_results (dict): Dictionary of scores on the separate test set.
    """
    # clone() is the supported way to obtain an unfitted copy of an estimator.
    # Re-instantiating via type(clf)(**clf.get_params()) passes nested "step__param"
    # keys to the constructor, which fails for pipelines and meta-estimators.
    from sklearn.base import clone

    splitter = skf if cv is None else cv

    # Fold indices are positional; pandas objects would interpret them as labels.
    X_train_data = _to_positional_array(X_train_data)
    y_train_data = _to_positional_array(y_train_data)
    # Keep the test features in the same container type as the training features.
    X_test_data = _to_positional_array(X_test_data)

    cv_scores = {metric: [] for metric in metrics_list}
    test_results = {}

    # Report the fold count of the splitter actually in use
    fold_count = splitter.get_n_splits(X_train_data, y_train_data)

    print(f"\n--- {clf.__class__.__name__} (Trained on {dataset_name}) ---")
    print(f"Performing {fold_count}-Fold Stratified Cross-Validation on {dataset_name} training data...")

    # Perform Stratified K-Fold Cross-Validation on the training data
    for fold, (train_index, val_index) in enumerate(splitter.split(X_train_data, y_train_data)):
        X_train_fold, X_val_fold = X_train_data[train_index], X_train_data[val_index]
        y_train_fold, y_val_fold = y_train_data[train_index], y_train_data[val_index]

        # Fresh, unfitted copy for each fold to avoid state leakage
        fold_clf = clone(clf)

        try:
            fold_clf.fit(X_train_fold, y_train_fold)
            y_pred_val = fold_clf.predict(X_val_fold)

            fold_metrics = _compute_metrics(
                fold_clf, X_val_fold, y_val_fold, y_pred_val, metrics_list,
                error_prefix=f"    Fold {fold+1}: ", error_suffix="",
            )

            # Scores are only recorded once the whole fold has been evaluated
            for metric, score in fold_metrics.items():
                cv_scores[metric].append(score)

            print(f"    Fold {fold+1} completed.")

        except Exception as e:
            # Best-effort evaluation: a failed fold contributes NaN to every metric
            print(f"    Error during training or evaluation of Fold {fold+1}: {e}")
            for metric in metrics_list:
                cv_scores[metric].append(np.nan)

    # Average the per-fold scores, ignoring NaN entries from failed folds/metrics
    avg_cv_results = {metric: _nan_safe_mean(scores) for metric, scores in cv_scores.items()}
    print(f"\nAverage CV Scores ({dataset_name} Training Data):")
    for metric, score in avg_cv_results.items():
        print(f"  {metric}: {score:.4f}")

    # Train the classifier on the *full* training data for evaluation on the test set
    print(f"\nTraining on full {dataset_name} training data for test set evaluation...")
    test_clf = clone(clf)
    try:
        test_clf.fit(X_train_data, y_train_data)
        y_pred_test = test_clf.predict(X_test_data)

        test_results = _compute_metrics(
            test_clf, X_test_data, y_test_data, y_pred_test, metrics_list,
            error_prefix="  ", error_suffix=" on test set",
        )

        print(f"\nTest Set Evaluation Results (Trained on full {dataset_name} Training Data):")
        for metric, score in test_results.items():
            print(f"  {metric}: {score:.4f}")

    except Exception as e:
        print(f"Error training on full {dataset_name} data or evaluating on test set: {e}")
        test_results = {metric: np.nan for metric in metrics_list} # Set all test metrics to NaN on error

    return avg_cv_results, test_results


def _to_positional_array(data):
    """Return pandas DataFrames/Series as NumPy arrays; pass anything else through unchanged."""
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.to_numpy()
    return data


def _nan_safe_mean(scores):
    """Mean of the non-NaN scores, or NaN (without a RuntimeWarning) if there are none."""
    values = np.asarray(scores, dtype=float)
    if values.size == 0 or np.isnan(values).all():
        return np.nan
    return np.nanmean(values)


def _compute_metrics(model, X_eval, y_true, y_pred, metrics_list, error_prefix, error_suffix):
    """Compute the requested metrics for one fitted model on one evaluation set.

    Metrics are computed in a fixed order (Recall, ROC AUC, G-mean, F1-score,
    Balanced Accuracy) so the returned dict has a stable key order. Failures in
    ROC AUC or G-mean are printed (framed by error_prefix / error_suffix) and
    stored as NaN; failures in the other metrics propagate to the caller.
    """
    scores = {}
    if "Recall" in metrics_list:
        scores["Recall"] = recall_score(y_true, y_pred)
    if "ROC AUC" in metrics_list:
        try:
            # Prefer class probabilities; fall back to decision scores
            if hasattr(model, "predict_proba"):
                scores["ROC AUC"] = roc_auc_score(y_true, model.predict_proba(X_eval)[:, 1])
            elif hasattr(model, "decision_function"):
                scores["ROC AUC"] = roc_auc_score(y_true, model.decision_function(X_eval))
            else:
                scores["ROC AUC"] = np.nan
        except Exception as metric_e:
            print(f"{error_prefix}Error calculating ROC AUC{error_suffix}: {metric_e}")
            scores["ROC AUC"] = np.nan
    if "G-mean" in metrics_list:
        try:
            scores["G-mean"] = geometric_mean_score(y_true, y_pred)
        except Exception as gmean_e:
            print(f"{error_prefix}Error calculating G-mean{error_suffix}: {gmean_e}")
            scores["G-mean"] = np.nan
    if "F1-score" in metrics_list:
        scores["F1-score"] = f1_score(y_true, y_pred)
    if "Balanced Accuracy" in metrics_list:
        scores["Balanced Accuracy"] = balanced_accuracy_score(y_true, y_pred)
    return scores


# --- Evaluate every classifier on each variant of the training data ---
# Each variant is cross-validated on itself and then scored on the same
# held-out X_test / y_test from the original train/test split.

results_original_cv, results_original_test = {}, {}
results_filtered_cv, results_filtered_test = {}, {}
# CV results on the oversampled data itself / test results after training on it
results_custom_oversampled_cv, results_custom_oversampled_test = {}, {}

# The filtered training set is simply X_filtered / y_filtered
X_train_filtered, y_train_filtered = X_filtered, y_filtered

# (headline, dataset label, training features, training labels, CV store, test store)
evaluation_runs = [
    ("Evaluating Classifiers Trained on ORIGINAL Data", 'Original',
     X_train, y_train, results_original_cv, results_original_test),
    ("Evaluating Classifiers Trained on FILTERED Data", 'Filtered',
     X_train_filtered, y_train_filtered, results_filtered_cv, results_filtered_test),
    ("Evaluating Classifiers Trained on Custom Oversampled Data", 'Custom Oversampled',
     X_oversampled_custom, y_oversampled_custom,
     results_custom_oversampled_cv, results_custom_oversampled_test),
]

rule = "=" * 50
for run_number, (headline, dataset_label, X_fit, y_fit, cv_store, test_store) in enumerate(evaluation_runs):
    # A spaced separator goes between consecutive runs (not before the first)
    if run_number > 0:
        print("\n" + rule + "\n")

    print(rule)
    print(headline)

    for name, clf in classifiers.items():
        cv_res, test_res = evaluate_classifier_cv(
            clf, X_fit, y_fit, X_test, y_test, metrics_to_report, dataset_label
        )
        cv_store[name] = cv_res
        test_store[name] = test_res


# Cross-validation results as tables: one row per classifier, one column per metric
original_cv_results_df = pd.DataFrame(results_original_cv).transpose()
filtered_cv_results_df = pd.DataFrame(results_filtered_cv).transpose()
custom_oversampled_cv_results_df = pd.DataFrame(results_custom_oversampled_cv).transpose()

# Test-set results (models trained on the full training data), same layout
original_test_results_df = pd.DataFrame(results_original_test).transpose()
filtered_test_results_df = pd.DataFrame(results_filtered_test).transpose()
custom_oversampled_test_results_df = pd.DataFrame(results_custom_oversampled_test).transpose()

# Each report section: a title followed by (caption, table) pairs in display order
report_sections = [
    (
        "\nAverage Cross-Validation Results (Trained and Evaluated on Training Data Folds):",
        [
            ("\nOriginal Training Data CV Results:", original_cv_results_df),
            ("\nFiltered Training Data CV Results:", filtered_cv_results_df),
            ("\nCustom Oversampled Training Data CV Results:", custom_oversampled_cv_results_df),
        ],
    ),
    (
        "\nTest Results (Trained on Full Training Data, Evaluated on Separate Test Set):",
        [
            ("\nOriginal Training Data Test Results:", original_test_results_df),
            ("\nFiltered Training Data Test Results:", filtered_test_results_df),
            ("\nCustom Oversampled Training Data Test Results:", custom_oversampled_test_results_df),
        ],
    ),
]

for section_title, captioned_tables in report_sections:
    print("\n" + "="*50 + "\n")
    print(section_title)
    for caption, table in captioned_tables:
        print(caption)
        display(table)


# --- Export all result tables to a single Excel workbook ---

# Output workbook (created, or overwritten if it already exists)
excel_file_path = 'evaluation_results_cv.xlsx'

# Result tables in the order their sheets are written
result_sheets = [
    ('Original_CV_Results', original_cv_results_df),
    ('Original_Test_Results', original_test_results_df),
    ('Filtered_CV_Results', filtered_cv_results_df),
    ('Filtered_Test_Results', filtered_test_results_df),
    ('Custom_Oversampled_CV_Results', custom_oversampled_cv_results_df),
    ('Custom_Oversampled_Test_Results', custom_oversampled_test_results_df),
]

# Diagnostic tables that are exported only if an earlier cell defined them
optional_sheets = [
    ('train_points_info_updated', 'Train_Points_Info_Updated'),
    ('final_filtered_info_df', 'Final_Filtered_Info'),
]

with pd.ExcelWriter(excel_file_path, mode='w') as writer:
    # Empty tables are skipped rather than written as blank sheets
    for sheet_title, table in result_sheets:
        if not table.empty:
            table.to_excel(writer, sheet_name=sheet_title)

    for variable_name, sheet_title in optional_sheets:
        if variable_name in locals():
            optional_table = locals()[variable_name]
            if not optional_table.empty:
                optional_table.to_excel(writer, sheet_name=sheet_title)


print(f"\nEvaluation results (including CV) saved to '{excel_file_path}'")

CBFAS FINISH