In [None]:
!pip install ucimlrepo
!pip install pygad

In [None]:
import pandas as pd
import numpy as np
import pygad
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest


In [None]:
# Dataset ids, presumably for fetch_ucirepo(id=...) — TODO confirm.
# NOTE(review): neither constant is referenced in the visible cells; the dataset
# load passes the literal id 52 instead.
ISOLET_DB_INDEX: int = 54
SPAMBASE_DB_INDEX: int = 94

In [None]:
# Read first dataset
# NOTE(review): the variable is named `isolet`, but the id passed here (52) is not
# ISOLET_DB_INDEX (54) — confirm which dataset is actually being analysed.
isolet = fetch_ucirepo(id=52) # Using this one cause it works.

# Feature table and target column(s) exposed by the fetched dataset object.
features = isolet.data.features
target_variables = isolet.data.targets

In [None]:
# Print missing values and maximum and minimum values in the features of the first dataset
# Wrap features/targets in DataFrames so the checks below can use the pandas API.
X_df = pd.DataFrame(features)
y_df = pd.DataFrame(target_variables)

# Total number of missing cells (summed over every column).
print("Missing values in X:", X_df.isnull().sum().sum())
print("Missing values in y:", y_df.isnull().sum().sum())

# Global value range over all feature columns; shows whether scaling is needed.
print("Minimum value across all features:", X_df.min().min())
print("Maximum value across all features:", X_df.max().max())

In [None]:
# Normalize the first dataset
# NOTE(review): the scaler is fitted on the complete feature table, i.e. before any
# train/test split, so test rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_df)

# fit_transform returns a plain array; restore the original column names.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

# Sanity check: global minimum and maximum of the scaled features.
print(X_scaled_df.min().min())
print(X_scaled_df.max().max())

In [None]:
# Alias for the scaled feature table used by the feature-selection cells.
data_after_scaling = X_scaled_df
# Flatten the target table into a 1-D array of labels.
# NOTE(review): this rebinds `target_variables`, which earlier held the raw targets
# returned by the dataset object; the name now refers to a different kind of object.
target_variables = y_df.values.ravel()

In [60]:
def get_top_k_features(selector,
                       feature_names: list,
                       top_features_to_select: int,
                       algorithm: str,
                       verbose: bool = False,
                       normalize_score: bool = True):
  """
  Rank features by the scores of a fitted selector and return the best ones.

  Parameters:
  selector (SelectKBest): Fitted selector exposing a ``scores_`` array.
  feature_names (list): Feature names, in the same order as ``selector.scores_``.
  top_features_to_select (int): Number of top-ranked features to return.
  algorithm (str): Name of the scoring algorithm; only used in the verbose printout.
  verbose (bool): If True, print the complete ranking table.
  normalize_score (bool): If True, divide every score by the NaN-ignoring sum of
      all scores so they add up to 1. The division is skipped when that sum is
      zero or not finite, because it would turn every score into NaN/inf.

  Returns:
  list: Names of the top features, best score first. Features whose score is NaN
  are ranked last.
  """
  # Retrieve feature scores
  scores = selector.scores_

  if normalize_score:
    total_score = np.nansum(scores)
    # Guard against 0/0 (e.g. every score is zero or NaN): keep the raw scores.
    if np.isfinite(total_score) and total_score != 0:
      scores = scores / total_score

  # A stable sort keeps tied features in their original column order, so the
  # selection is deterministic; NaN scores go to the bottom of the ranking.
  feature_ranking = pd.DataFrame({
    'Feature': feature_names,
    'Score': scores
  }).sort_values(by='Score', ascending=False, kind='stable', na_position='last')

  if verbose:
    # Display the full ranking, best feature first
    print(f"Feature Rankings using {algorithm}:")
    print(feature_ranking)

  # Return selected top k features
  return feature_ranking.head(top_features_to_select)['Feature'].tolist()

In [44]:
# Seed passed as random_state to RandomForestClassifier so model training is repeatable.
RANDOM_FOREST_SEED: int = 42

In [45]:
def train_and_fit_random_forest(X_train, X_test, y_train, y_test):
  """
  Fit a Random Forest classifier on the training split and score it on the test split.

  Parameters:
  ----------
  X_train : pd.DataFrame or np.ndarray
      Training feature matrix.
  X_test : pd.DataFrame or np.ndarray
      Test feature matrix.
  y_train : pd.Series or np.ndarray
      Labels matching X_train.
  y_test : pd.Series or np.ndarray
      Ground-truth labels matching X_test.

  Returns:
  float: Accuracy of the fitted model on the test split (also printed).
  """
  # Seeded forest so repeated runs give the same model
  forest = RandomForestClassifier(random_state=RANDOM_FOREST_SEED)
  forest.fit(X_train, y_train)

  # Score the predictions for the held-out rows
  test_accuracy = accuracy_score(y_test, forest.predict(X_test))
  print("Model Accuracy:", test_accuracy)

  return test_accuracy


In [46]:
def get_top_features_with_selector(selector: SelectKBest,
                                    num_of_features_to_select: int,
                                    data_with_features,
                                    target_variables,
                                    algorithm: str = "",
                                    verbose: bool = False
                                    ):
      """
      Fit ``selector`` on the given data and return only its top-ranked feature columns.

      :param selector: SelectKBest object; it is fitted in place by this call.
      :param num_of_features_to_select: How many of the best-scoring features to keep.
      :param data_with_features: DataFrame the features are selected from. Its column
          names are used as the feature names for the ranking.
      :param target_variables: The variable the feature selection is used on.
      :param algorithm: The algorithm used, as a str. Used for debug printouts.
      :param verbose: Enable debug printouts.
      :return: ``data_with_features`` restricted to the selected columns, ordered
          from best to worst score.
      """
      selector.fit(data_with_features, target_variables)

      # Rank the features with whatever scoring function the selector wraps. The
      # names come from the frame that was just fitted, so the ranking stays
      # aligned with selector.scores_ for any input table.
      top_features = get_top_k_features(selector=selector, feature_names=data_with_features.columns,
                                        top_features_to_select=num_of_features_to_select, algorithm=algorithm,
                                        verbose=verbose)

      return data_with_features[top_features]

In [47]:
# Presumably the lower/upper bound on how many features to select — TODO confirm.
# NOTE(review): MIN_FEATURES and MAX_FEATURES are not referenced in the visible cells.
MIN_FEATURES: int = 5
MAX_FEATURES: int = 10
# Presumably the fraction of samples held out as the test set — TODO confirm.
TRAIN_TEST_SPLIT_RATIO: float = 0.2


Import the feature-selection scoring functions and build one `SelectKBest` selector per scoring function.

In [48]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression

def mutual_info_classif_with_random_state(X, y):
    """Score features with mutual_info_classif using a fixed random_state of 42."""
    return mutual_info_classif(X, y, random_state=42)


def mutual_info_regression_with_random_state(X, y):
    """Score features with mutual_info_regression using a fixed random_state of 42."""
    return mutual_info_regression(X, y, random_state=42)


# One SelectKBest per scoring function. k='all' keeps every feature, so each
# selector is only used to compute a score for all features.
classifier_chi2: SelectKBest = SelectKBest(k='all', score_func=chi2)
classifier_mutual_info_classif: SelectKBest = SelectKBest(
    k='all', score_func=mutual_info_classif_with_random_state)
classifier_mutual_info_regression: SelectKBest = SelectKBest(
    k='all', score_func=mutual_info_regression_with_random_state)
classifier_f_classif: SelectKBest = SelectKBest(k='all', score_func=f_classif)
classifier_f_regression: SelectKBest = SelectKBest(k='all', score_func=f_regression)

Fit every selector on all features (`k='all'`) to obtain one feature ranking per scoring algorithm.

In [49]:
# Number of top-ranked features kept per selection algorithm in the ranking loop.
FEATURES_TO_SELECT = 5

In [61]:
# (selector, display name) pairs to rank features with.
selector_list = [
    (classifier_chi2, "classifier_chi2"),
    (classifier_mutual_info_classif, "classifier_mutual_info_classif"),
    # Disabled selectors; uncomment to include them in the run.
    # (classifier_mutual_info_regression, "classifier_mutual_info_regression"),
    # (classifier_f_classif, "classifier_f_classif"),
    # (classifier_f_regression, "classifier_f_regression")
]
# Maps each selector's display name to the list of its top-ranked feature names.
features_selected_by_each_algorithm: dict = dict()
for selector_in_list in selector_list:
    selector_in_list: tuple[SelectKBest, str]
    current_selector, algorithm_name = selector_in_list
    current_selector.fit(data_after_scaling, target_variables)
    # Rank the features with this selector's scoring function. The names are
    # taken from the frame the selector was fitted on, so they stay aligned
    # with its scores_ array.
    top_features = get_top_k_features(
        selector=current_selector,
        feature_names=data_after_scaling.columns,
        top_features_to_select=FEATURES_TO_SELECT,
        algorithm=algorithm_name,
        verbose=True
    )
    features_selected_by_each_algorithm[algorithm_name] = top_features


SCORES SUM TO: 60.8102230477591
SCORES AFTER NORMALIZATION TO: 0.9999999999999999
Feature Rankings using classifier_chi2:
        Feature     Score
0    Attribute1  0.135475
4    Attribute5  0.129577
2    Attribute3  0.117055
6    Attribute7  0.091419
30  Attribute31  0.060236
28  Attribute29  0.043286
8    Attribute9  0.042549
32  Attribute33  0.039736
14  Attribute15  0.039173
20  Attribute21  0.038605
22  Attribute23  0.032156
7    Attribute8  0.030031
12  Attribute13  0.026251
13  Attribute14  0.025025
24  Attribute25  0.024425
10  Attribute11  0.017452
11  Attribute12  0.015605
15  Attribute16  0.012494
5    Attribute6  0.012174
18  Attribute19  0.011451
21  Attribute22  0.010380
17  Attribute18  0.010152
3    Attribute4  0.008509
9   Attribute10  0.008299
26  Attribute27  0.006141
16  Attribute17  0.006029
33  Attribute34  0.002562
27  Attribute28  0.001711
19  Attribute20  0.001008
31  Attribute32  0.000988
23  Attribute24  0.000033
29  Attribute30  0.000012
25  Attribute26  0.0

In [51]:
# Toy problem for trying out the GA: find gene values whose weighted sum hits desired_output.
function_weights = np.array([-4, 5, 7, 2.3, -2, 9, 9, -2, 5, 12, 57, 42, 42, 1, -3])
desired_output = 100
epsilon = 0.00001 # To prevent division by 0 errors.

# Seeded generator so the matrix is identical after every Restart & Run All.
POTENTIAL_SOLUTIONS_SEED: int = 42
# Square matrix of integers drawn from [-10, 10) — the upper bound is exclusive.
potential_solutions = np.random.default_rng(POTENTIAL_SOLUTIONS_SEED).integers(
    -10, 10, size=(len(function_weights), len(function_weights)))
potential_solutions

array([[  8,  -1,  -9,  -9,  -9,  -7,  -4,  -8,  -7,   6,   3,   9,  -1,
          5,  -7],
       [ -9,  -9,   8,  -7,  -9,   8,   2,  -6,  -2,  -2,   8,  -2,   9,
          8,   4],
       [  7,   5,   2,   3,   5,   3,   4,  -7,   1,   1,   1,   0,   3,
          0,   4],
       [ -8,  -2,  -1,   6,   8,   3, -10,   1,  -4,   7,   1, -10,  -9,
          4,  -6],
       [ -1,  -6,   3,  -5,  -5,   2,  -7,  -5,  -2,  -9,   7,   7,  -8,
         -1,   7],
       [ -5,   7,  -2,   1,   1,   9,  -8,   8,   3,  -1,   0,   6,  -7,
          7,   0],
       [  3,   4,   5,   3,   8,  -4,  -2,   7,  -5,  -6,   0,   0,   5,
          1,  -5],
       [  9,   8,  -6,   3,   8,   4,  -6,  -9,   0,   2, -10,   1,   0,
          9,   6],
       [  1,   0,   6,   8,  -5,   0,   5,  -1,  -5,  -1,  -3,   1,  -6,
         -1,   1],
       [ -7,  -1,  -8,   8,   6,   7, -10,  -1,   3,  -2,  -2,   1,  -9,
         -4,   5],
       [  7,   3,   3,  -6,  -7,   1,  -2,  -6,   4,  -5,  -6,  -5,  -5,
       

In [52]:
def fitness_func_arr_of_weights(ga_instance, solution, solution_idx):
    """
    Toy fitness function where the chromosome is an array of weights.

    Each gene scales the row sum of the matching row of ``potential_solutions``;
    the scaled values are weighted by ``function_weights`` and added up. The
    closer that total is to ``desired_output``, the higher the fitness.

    ``ga_instance`` and ``solution_idx`` are accepted but not used.

    Returns:
    float: 1 / (|total - desired_output| + epsilon).
    """
    # TODO Here we can insert the feature selection and train the model, then classify, to measure fitness.
    # TODO The higher fitness, the better.
    row_totals = np.sum(potential_solutions, axis=1)
    weighted_terms = np.multiply(row_totals, solution) * function_weights
    distance_to_target = np.abs(np.sum(weighted_terms) - desired_output)
    # epsilon keeps the division finite when the target is hit exactly.
    return 1.0 / (distance_to_target + epsilon)


In [53]:
#TODO Remove this afterwards.


In [54]:
def fitness_func_as_weights_to_use_from_each_algorithm(ga_instance, solution, solution_idx):
  """
  Fitness function: test accuracy of a Random Forest trained on the selected features.

  NOTE(review): work in progress. ``solution`` is not used yet, and ``X_top`` is
  not defined in the visible notebook cells; it has to be bound to the selected
  feature table before this function is called.

  Parameters:
  ga_instance: GA object passed in by the caller (unused).
  solution: Chromosome being evaluated (unused for now).
  solution_idx: Index of the chromosome in the population (unused).

  Returns:
  float: Accuracy returned by train_and_fit_random_forest; higher is better.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X_top, target_variables, test_size=TRAIN_TEST_SPLIT_RATIO, random_state=42)
  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)
  # The accuracy is the fitness value, so it must be handed back to the caller.
  return accuracy

In [55]:
# Configure and run the GA on the toy weighted-sum problem, then report the best solution.
num_generations = 1
num_parents_mating = 4

sol_per_pop = 12
num_genes = function_weights.size # One gene per entry of function_weights (the fitness function multiplies them element-wise).

init_range_low = 0
init_range_high = 10

parent_selection_type = "sss" # steady-state selection; presumably picks the fittest solutions as parents — confirm against the PyGAD docs.
keep_parents = 1

crossover_type = "single_point" # Swaps the chromosomes from a certain index onwards between the parents.

mutation_type = "random"
mutation_percent_genes = 20
ga_instance = pygad.GA(num_generations=num_generations,
                       num_parents_mating=num_parents_mating, # Num of parents to select each generation.
                       fitness_func=fitness_func_arr_of_weights,
                       sol_per_pop=sol_per_pop, # Number of solutions per population.
                       num_genes=num_genes, # Effectively, the thing that is tweaked for each generation.
                       # gene_type=list[float], # The type of gene, meaning of each value inside a chromosome. Supports list.
                       init_range_low=init_range_low, # dependent on the gene type, the range of values to be generated.
                       init_range_high=init_range_high,
                       parent_selection_type=parent_selection_type,
                       keep_parents=keep_parents, # Number of parents to keep from current population.
                       # keep_elitism = 1, # The number of the solutions with the best fitness that will be kept for next generation.
                       crossover_type=crossover_type,
                       mutation_type=mutation_type,
                       mutation_by_replacement=True, # If the previous gene is replaced or not.
                       mutation_percent_genes=mutation_percent_genes, # NOTE(review): PyGAD documents this as the percentage of genes to mutate, not a per-gene probability — confirm.
                       # crossover_type=crossover_func, Can be used to customize a crossover func.
                       # mutation_type=mutation_func, Can be used to customize a mutation func.
                       )

ga_instance.run()
print('--------------------------------------------------')
# Prints the configured number of generations, not a counter read back from the GA object.
print(f'Generation: {num_generations}')
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))
# Recompute the weighted sum for the best solution, using the same formula as
# fitness_func_arr_of_weights.
# NOTE(review): in the output recorded with this cell the fitness (~0.00334, i.e.
# |output - 100| ~ 299) does not match the printed prediction (~ -1146, i.e.
# |output - 100| ~ 1246). Possibly stale kernel state — re-check after Restart & Run All.
summed_rows_out = np.sum(potential_solutions, axis=1)
solution_to_check_out = np.multiply(summed_rows_out, solution)
output_out = np.sum(solution_to_check_out * function_weights)
print("Predicted output based on the best solution : {prediction}".format(prediction=output_out))


--------------------------------------------------
Generation: 1
Parameters of the best solution : [5.36901268 9.69791802 9.74513585 3.4208652  9.02175631 1.56694692
 5.06362429 9.60525948 4.20334316 1.48336083 2.48495632 8.45101528
 2.19854153 6.65345884 2.29372961]
Fitness value of the best solution = 0.0033418051313543356
Predicted output based on the best solution : -1146.303393893933
