In [None]:
!pip install ucimlrepo
!pip install pygad

In [None]:
import pandas as pd
import numpy as np
import pygad
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest


In [None]:
# Numeric ids of the datasets of interest (presumably meant for the `id` argument
# of fetch_ucirepo).
# NOTE(review): neither constant is referenced in the visible cells -- the loading
# cell passes a literal id instead. Confirm which dataset is intended.
ISOLET_DB_INDEX: int = 54
SPAMBASE_DB_INDEX: int = 94

Set the following flag to `True` to print more detailed diagnostic output (for example, model accuracy and the classification report).

In [None]:
VERBOSE: bool = False # When True, the helpers below print extra diagnostics (model accuracy, classification report, per-algorithm weights)

Importing and handling the dataset:

In [None]:
# Read the dataset from the UCI ML Repository by its numeric id.
# NOTE(review): id=52 matches neither ISOLET_DB_INDEX (54) nor SPAMBASE_DB_INDEX (94)
# -- confirm this is the dataset that is meant to be used.
dataset_currently_used = fetch_ucirepo(id=52)

# Feature columns and target column(s) as exposed by the fetched dataset object.
features = dataset_currently_used.data.features
target_variables = dataset_currently_used.data.targets

In [None]:
# Wrap features and targets in DataFrames, then report basic data-quality facts:
# the total number of missing cells and the global value range of the features.
X_df = pd.DataFrame(features)
y_df = pd.DataFrame(target_variables)

missing_in_X = X_df.isna().sum().sum()
missing_in_y = y_df.isna().sum().sum()
print("Missing values in X:", missing_in_X)
print("Missing values in y:", missing_in_y)

smallest_feature_value = X_df.min().min()
largest_feature_value = X_df.max().max()
print("Minimum value across all features:", smallest_feature_value)
print("Maximum value across all features:", largest_feature_value)

In [None]:
# Normalize the dataset with min-max scaling (each feature rescaled independently).
# NOTE(review): the scaler is fitted on the full dataset, before any train/test
# split, so test rows influence the scaling parameters -- confirm this is acceptable.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_df)

# Rebuild a DataFrame from the scaled values so the original column names are kept.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

# Sanity check: global minimum and maximum of the scaled features.
print(X_scaled_df.min().min())
print(X_scaled_df.max().max())

In [None]:
# Inputs used by all later cells: the scaled feature frame and the labels.
data_after_scaling = X_scaled_df
# Rebinds target_variables from the targets frame to a flattened 1-D array of its values.
target_variables = y_df.values.ravel()

Feature selection functions

In [None]:
def get_top_k_features(selector,
                       feature_names: list,
                       top_features_to_select: int,
                       algorithm: str,
                       verbose: bool = False,
                       normalize_score: bool = True):
  """
  Rank features by the scores of a fitted selector and return the best ones.

  Parameters:
  selector: Fitted selector exposing a ``scores_`` attribute (e.g. SelectKBest),
      holding one score per feature.
  feature_names (list): Feature names, in the same order as ``selector.scores_``.
  top_features_to_select (int): Number of top-ranked features to return.
  algorithm (str): Name of the feature selection algorithm (used only in the
      verbose printout).
  verbose (bool): When True, print the full ranking of all features.
  normalize_score (bool): When True, divide every score by the NaN-ignoring sum
      of all scores so scores from different algorithms are comparable. If that
      sum is zero or not finite the scores are left unnormalized, because the
      division would only produce NaN/inf values.

  Returns:
  A DataFrame with the columns "Feature" and "Score", sorted by score in
  descending order and truncated to the first ``top_features_to_select`` rows.
  """
  # Work on a float array so list-like scores and NaN handling behave uniformly.
  scores = np.asarray(selector.scores_, dtype=float)

  if normalize_score:
    total_score = np.nansum(scores)
    # Guard the division: a zero or non-finite total would turn every score
    # into NaN/inf and destroy the ranking.
    if np.isfinite(total_score) and total_score != 0:
      scores = scores / total_score

  feature_ranking = pd.DataFrame({
    'Feature': feature_names,
    'Score': scores
  }).sort_values(by='Score', ascending=False)

  if verbose:
    # Display the ranking of all features, not only the returned top ones.
    print(f"Feature Rankings using {algorithm}:")
    print(feature_ranking)

  # Return the selected top features
  return feature_ranking.head(top_features_to_select)

In [None]:
# Passed as random_state to RandomForestClassifier so model training is repeatable.
RANDOM_FOREST_SEED: int = 42

In [None]:
def train_and_fit_random_forest(X_train, X_test, y_train, y_test, verbose: bool = VERBOSE):
  """
  Fit a Random Forest classifier on the training split and score it on the test split.

  Parameters:
  ----------
  X_train : pd.DataFrame or np.ndarray
      Training feature matrix.
  X_test : pd.DataFrame or np.ndarray
      Test feature matrix.
  y_train : pd.Series or np.ndarray
      Training labels.
  y_test : pd.Series or np.ndarray
      Ground-truth labels of the test split.
  verbose : bool
      When True, print the accuracy and a classification report.

  Returns:
  float: Accuracy of the fitted model on the test split.
  """
  # Seeded forest, trained on the training split
  forest = RandomForestClassifier(random_state=RANDOM_FOREST_SEED).fit(X_train, y_train)

  # Score the held-out predictions
  predictions = forest.predict(X_test)
  accuracy = accuracy_score(y_test, predictions)

  if not verbose:
    return accuracy

  print("Model Accuracy:", accuracy)
  # Detailed per-class metrics
  print("\nClassification Report:")
  print(classification_report(y_test, predictions))
  return accuracy


In [None]:
# Presumably bounds on how many features to select, and the test-set fraction.
# NOTE(review): none of these three constants is referenced in the visible cells
# (the split calls use TEST_TRAIN_SPLIT and the selection uses FEATURES_TO_SELECT)
# -- confirm whether they are still needed.
MIN_FEATURES: int = 5
MAX_FEATURES: int = 10
TRAIN_TEST_SPLIT_RATIO: float = 0.2


Import all the feature selection algorithms.

In [None]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression

def mutual_info_classif_with_random_state(X, y):
    """Score features with mutual_info_classif using a fixed random_state of 42."""
    return mutual_info_classif(X, y, random_state=42)


def mutual_info_regression_with_random_state(X, y):
    """Score features with mutual_info_regression using a fixed random_state of 42."""
    return mutual_info_regression(X, y, random_state=42)


# One SelectKBest per score function. k='all' keeps every feature, so each
# selector is only used to obtain a score for all features.
classifier_chi2: SelectKBest = SelectKBest(chi2, k='all')
classifier_mutual_info_classif: SelectKBest = SelectKBest(mutual_info_classif_with_random_state, k='all')
classifier_mutual_info_regression: SelectKBest = SelectKBest(mutual_info_regression_with_random_state, k='all')
classifier_f_classif: SelectKBest = SelectKBest(f_classif, k='all')
classifier_f_regression: SelectKBest = SelectKBest(f_regression, k='all')

Scoring all the features with each feature selection algorithm to get a per-algorithm feature ranking.

In [None]:
# Number of top-ranked features kept per selection algorithm and fed to the model.
FEATURES_TO_SELECT = 5

In [None]:
# Fit every feature selection algorithm on the scaled data and store, per
# algorithm name, its top FEATURES_TO_SELECT features together with their scores.

selector_list = [
    (classifier_chi2, "classifier_chi2"),
    (classifier_mutual_info_classif, "classifier_mutual_info_classif"),
    (classifier_mutual_info_regression, "classifier_mutual_info_regression"),
    (classifier_f_classif, "classifier_f_classif"),
    (classifier_f_regression, "classifier_f_regression")
]
features_selected_by_each_algorithm: dict = {}
for current_selector, current_selector_name in selector_list:
    current_selector.fit(data_after_scaling, target_variables)
    # Rank the features with the algorithm that was just fitted
    features_selected_by_each_algorithm[current_selector_name] = get_top_k_features(
        selector=current_selector,
        feature_names=features.columns,
        top_features_to_select=FEATURES_TO_SELECT,
        algorithm=current_selector_name,
    )


In [None]:
def weighted_combine_feature_scores(features_selected_by_each_algorithm_in_func: dict[str, pd.DataFrame],
                                    weights: list[float],
                                    verbose: bool = VERBOSE) -> pd.DataFrame:
    """
    Combine the per-algorithm feature scores into one weighted score per feature.

    Each algorithm's scores are multiplied by the weight at the same position
    (dictionary order), and the weighted scores of every feature are summed over
    all algorithms. A feature missing from an algorithm contributes 0 for it.

    The input DataFrames are never modified, so the function can be called
    repeatedly (e.g. once per fitness evaluation) on the same rankings.

    :param features_selected_by_each_algorithm_in_func: Maps an algorithm name to a
        DataFrame with the columns 'Feature' and 'Score'.
    :param weights: One weight per algorithm, in the dictionary's iteration order.
    :param verbose: Printing relevant messages.
    :return: DataFrame with the columns 'Feature' and 'Score', one row per distinct
        feature, ordered by feature name.
    :raises ValueError: If the number of weights differs from the number of algorithms.
    """
    if len(features_selected_by_each_algorithm_in_func) != len(weights):
        raise ValueError("Number of weights does not match number of features selectors")

    weighted_frames = []
    for (algorithm_name, features_selected_by_algorithm), weight in zip(
            features_selected_by_each_algorithm_in_func.items(), weights):
        if verbose:
            print(f'Combining scores for {algorithm_name}, its weight is: {weight}')

        # assign() builds a new frame; writing into the caller's 'Score' column
        # would compound the weights across successive calls.
        weighted_frames.append(
            features_selected_by_algorithm[['Feature', 'Score']].assign(
                Score=features_selected_by_algorithm['Score'] * weight))

    if not weighted_frames:
        # Nothing to combine: keep the same two-column layout for the caller.
        return pd.DataFrame({'Feature': [], 'Score': []})

    # Stack all weighted rankings once, then add up the scores per feature.
    # sum() skips NaN, which matches treating a missing score as 0.
    return (pd.concat(weighted_frames, ignore_index=True)
            .groupby('Feature', as_index=False, sort=True)['Score']
            .sum())

Genetic algorithm part. It uses the functions and variables defined in the cells above.

In [None]:
# Passed to train_test_split as test_size and random_state in the cells below,
# so every evaluation uses the same train/test partition.
TEST_TRAIN_SPLIT = 0.2
RANDOM_STATE_OF_DATA_SPLIT = 42

In [None]:
# Genetic Algorithm parameters
NUMBER_OF_GENERATIONS = 1
NUMBER_OF_PARENTS_MATING = 4

SOLUTIONS_PER_POPULATION = 12
NUMBER_OF_GENES = len(features_selected_by_each_algorithm) # Use this to control the number of feature selection potential solutions is used.

INIT_RANGE_LOW = 0
INIT_RANGE_HIGH = 1

PARENT_SELECTION_TYPE = "sss" #steady-state selection, meaning it selects the parents with the highest fitness.
KEEP_PARENTS = 1

CROSSOVER_TYPE = "single_point" # Swaps the chromosomes from a certain index onwards between the parents.

MUTATION_TYPE = "random"
MUTATION_PERCENT_GENES = 20

Running the genetic algorithm, using the other feature selection methods

In [None]:
def fitness_func_as_weights_to_use_from_each_algorithm(ga_instance, solution, solution_idx):
    """
    Fitness of a chromosome whose genes are the weights of the feature selection algorithms.

    The per-algorithm feature scores are combined using `solution` as weights, the
    FEATURES_TO_SELECT best-scoring features are kept, and a Random Forest is
    trained and evaluated on a fixed train/test split of those features.

    :param ga_instance: The pygad.GA instance calling this function (unused).
    :param solution: One weight per feature selection algorithm.
    :param solution_idx: Index of the solution in the population (unused).
    :return: Test accuracy of the model, used as the fitness value.
    """
    ranked_features = weighted_combine_feature_scores(
        features_selected_by_each_algorithm_in_func=features_selected_by_each_algorithm,
        weights=solution,
    ).sort_values(by=['Score'], ascending=False)
    top_feature_names = ranked_features['Feature'].head(FEATURES_TO_SELECT).tolist()

    X_train, X_test, y_train, y_test = train_test_split(
        data_after_scaling[top_feature_names],
        target_variables,
        test_size=TEST_TRAIN_SPLIT,
        random_state=RANDOM_STATE_OF_DATA_SPLIT,
    )
    # Fitness is the accuracy of a random forest trained on the chosen features
    accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)
    if VERBOSE:
        print(f'model_accuracy: {accuracy}')
    return accuracy

In [None]:

# Genetic algorithm whose chromosomes hold one weight per feature selection
# algorithm; fitness is the model accuracy obtained with the weighted ranking.
ga_instance_feature_selection_algorithms = (
    pygad.GA(num_generations=NUMBER_OF_GENERATIONS,
             num_parents_mating=NUMBER_OF_PARENTS_MATING,  # Num of parents to select each generation.
             fitness_func=fitness_func_as_weights_to_use_from_each_algorithm,
             sol_per_pop=SOLUTIONS_PER_POPULATION,  # Number of solutions per population.
             num_genes=NUMBER_OF_GENES,  # One gene per feature selection algorithm (its weight).
             # gene_type=list[float], # The type of gene, meaning of each value inside a chromosome. Supports list.
             init_range_low=INIT_RANGE_LOW,  # Lower bound of the randomly generated initial gene values.
             init_range_high=INIT_RANGE_HIGH,  # Upper bound of the randomly generated initial gene values.
             parent_selection_type=PARENT_SELECTION_TYPE,
             keep_parents=KEEP_PARENTS,  # Number of parents to keep from current population. NOTE(review): may be ignored when keep_elitism is non-zero -- confirm against the PyGAD docs.
             # keep_elitism = 1, # The number of the solutions with the best fitness that will be kept for next generation.
             crossover_type=CROSSOVER_TYPE,
             mutation_type=MUTATION_TYPE,
             mutation_by_replacement=True,  # Replace the mutated gene with the random value instead of adding to it.
             mutation_percent_genes=MUTATION_PERCENT_GENES,  # Percentage of genes to mutate, not a per-gene probability -- TODO confirm against the PyGAD docs.
             # crossover_type=crossover_func, Can be used to customize a crossover func.
             # mutation_type=mutation_func, Can be used to customize a mutation func.
             )
)

ga_instance_feature_selection_algorithms.run()
print('Running feature selection based on other feature selection algorithms')
print('--------------------------------------------------')
# Prints the configured number of generations, not a live generation counter.
print(f'Generation: {NUMBER_OF_GENERATIONS}')
# Best chromosome (the algorithm weights), its fitness (accuracy) and its index.
solution, solution_fitness, solution_idx = ga_instance_feature_selection_algorithms.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))


Running the genetic algorithm again (with the same parameters as the previous genetic algorithm), but this time each gene selects a feature directly.



In [None]:
def fitness_func_on_features_themselves(ga_instance, solution, solution_idx):
    """
    Fitness of a chromosome that selects features directly.

    Each gene corresponds to one column of `data_after_scaling` (same order);
    a gene value of 1 keeps the feature and 0 drops it. A Random Forest is
    trained and evaluated on a fixed train/test split of the kept features.

    :param ga_instance: The pygad.GA instance calling this function (unused).
    :param solution: One 0/1 gene per feature column.
    :param solution_idx: Index of the solution in the population (unused).
    :return: Test accuracy of the model, or 0.0 when no feature is selected.
    :raises ValueError: If the number of genes differs from the number of features.
    """
    # Genes are expected to be 0 or 1 but may arrive as floats, so threshold
    # instead of comparing for equality.
    selected_mask = np.asarray(solution, dtype=float) >= 0.5

    number_of_features = data_after_scaling.shape[1]
    if selected_mask.size != number_of_features:
        raise ValueError(
            f"Solution has {selected_mask.size} genes but the dataset has "
            f"{number_of_features} features; the two must match."
        )

    if not selected_mask.any():
        # A model cannot be trained on zero features; accuracy is never below 0,
        # so this is the worst possible fitness.
        return 0.0

    data_with_specific_features = data_after_scaling.loc[:, selected_mask]
    X_train, X_test, y_train, y_test = train_test_split(data_with_specific_features, target_variables, test_size=TEST_TRAIN_SPLIT, random_state=RANDOM_STATE_OF_DATA_SPLIT)
    # Train and fit random forest classification model based on feature selected
    accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)
    if VERBOSE:
        print(f'model_accuracy: {accuracy}')
    return accuracy

In [None]:
# One gene per feature column. Derived from the data so the chromosome length
# always matches the dataset that is actually loaded.
NUMBER_OF_POSSIBLE_FEATURES: int = int(data_after_scaling.shape[1])
# Allowed values of every gene; intended as 1 = keep the feature, 0 = drop it.
GENE_VALUES: list = [0, 1]

In [None]:
# Same allowed values for every gene: one entry per possible feature.
gene_space = [GENE_VALUES] * NUMBER_OF_POSSIBLE_FEATURES

In [None]:
# Genetic algorithm whose chromosomes hold one gene per feature, restricted by
# gene_space to the values in GENE_VALUES.
ga_instance_selecting_features_directly = (
    pygad.GA(num_generations=NUMBER_OF_GENERATIONS,
             num_parents_mating=NUMBER_OF_PARENTS_MATING,  # Num of parents to select each generation.
             fitness_func=fitness_func_on_features_themselves,
             sol_per_pop=SOLUTIONS_PER_POPULATION,  # Number of solutions per population.
             num_genes=NUMBER_OF_POSSIBLE_FEATURES,  # One gene per possible feature.
             # gene_type=list[float], # The type of gene, meaning of each value inside a chromosome. Supports list.
             init_range_low=INIT_RANGE_LOW,  # NOTE(review): presumably has no effect once gene_space is given -- confirm against the PyGAD docs.
             init_range_high=INIT_RANGE_HIGH,
             parent_selection_type=PARENT_SELECTION_TYPE,
             keep_parents=KEEP_PARENTS,  # Number of parents to keep from current population. NOTE(review): may be ignored when keep_elitism is non-zero -- confirm against the PyGAD docs.
             # keep_elitism = 1, # The number of the solutions with the best fitness that will be kept for next generation.
             crossover_type=CROSSOVER_TYPE,
             mutation_type=MUTATION_TYPE,
             mutation_by_replacement=True,  # Replace the mutated gene with the random value instead of adding to it.
             mutation_percent_genes=MUTATION_PERCENT_GENES,  # Percentage of genes to mutate, not a per-gene probability -- TODO confirm against the PyGAD docs.
             # crossover_type=crossover_func, Can be used to customize a crossover func.
             # mutation_type=mutation_func, Can be used to customize a mutation func.
             gene_space=gene_space
             )
)

ga_instance_selecting_features_directly.run()
print('Running feature selection based on simple genetic algorithm')
print('--------------------------------------------------')
# Prints the configured number of generations, not a live generation counter.
print(f'Generation: {NUMBER_OF_GENERATIONS}')
# Best chromosome found, its fitness value and its index.
solution, solution_fitness, solution_idx = ga_instance_selecting_features_directly.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))

Training the model with the features selected by each algorithm individually, as an ablation study.

In [None]:
# Ablation study: train and evaluate the model on the top features of each
# feature selection algorithm on its own.

for feature_selection_method_name, features_selected_with_score in features_selected_by_each_algorithm.items():
    features_selected_by_algorithm = features_selected_with_score.head(FEATURES_TO_SELECT)['Feature'].tolist()
    data_after_selecting_features = data_after_scaling[features_selected_by_algorithm]
    X_train, X_test, y_train, y_test = train_test_split(
        data_after_selecting_features,
        target_variables,
        test_size=TEST_TRAIN_SPLIT,
        # Same split seed as the genetic-algorithm fitness functions, so the
        # accuracies are measured on the same train/test partition.
        random_state=RANDOM_STATE_OF_DATA_SPLIT,
    )
    accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)
    print('------------------------------------------------')
    print(f'Feature selection method: {feature_selection_method_name}. Accuracy: {accuracy}')