In [None]:
!pip install ucimlrepo


In [None]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest

In [None]:
# Fetch the first dataset (UCI ML repository id=54, kept in `isolet`) and pull
# out its raw feature table and raw target table.
isolet = fetch_ucirepo(id=54)

features, target_variables = isolet.data.features, isolet.data.targets

In [None]:
# Wrap the raw features/targets of the first dataset in DataFrames, then report
# data quality: the number of missing cells and the overall feature value range.
X_df, y_df = pd.DataFrame(features), pd.DataFrame(target_variables)

# Missing cells: counted per column first, then summed over all columns
print("Missing values in X:", X_df.isna().sum().sum())
print("Missing values in y:", y_df.isna().sum().sum())

# Global extremes: per-column min/max first, then the min/max of those values
print("Minimum value across all features:", X_df.min().min())
print("Maximum value across all features:", X_df.max().max())

In [None]:
# Normalize the first dataset: min-max scale every feature column.
# NOTE(review): the scaler is fitted on ALL rows. If the data is split into
# train/test later on, the test rows influence the scaling statistics (data
# leakage) — consider fitting the scaler on the training rows only.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_df)

# Rebuild a DataFrame from the scaled values, restoring the column names and the
# original row index so the scaled frame stays row-aligned with X_df / y_df even
# when the index is not the default 0..n-1 range.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns, index=X_df.index)

# Sanity check: global minimum and maximum after scaling
# (0.0 and 1.0 are expected as long as the data is not constant)
print(X_scaled_df.min().min())
print(X_scaled_df.max().max())

In [None]:
# From this cell on, `features` refers to the scaled feature table and
# `target_variables` to the labels flattened into a 1-D array.
features = X_scaled_df
target_variables = np.ravel(y_df.values)



In [None]:
def get_top_k_features(selector, feature_names: list, top_features_to_select: int, algorithm: str, verbose: bool = False):
  """
  Rank features by the scores of a fitted selector and return the best ones.

  Parameters:
  selector (SelectKBest): Fitted selector; only its `scores_` attribute is read.
  feature_names (list): Feature names, in the same order as `selector.scores_`.
  top_features_to_select (int): How many of the highest-scoring features to return.
  algorithm (str): Name of the feature selection algorithm; only used in the verbose printout.
  verbose (bool): When True, print the complete ranking table.

  Returns:
  list: Names of the `top_features_to_select` highest-scoring features, best first.
  """
  # Pair every feature name with the score the selector gave it
  score_table = pd.DataFrame({'Feature': feature_names, 'Score': selector.scores_})

  # Best (highest) score first
  ranked = score_table.sort_values(by='Score', ascending=False)

  if verbose:
    # Show the full ranking, not only the selected features
    print(f"Feature Rankings using {algorithm}:")
    print(ranked)

  # Keep only the names of the leading rows
  best_names = ranked['Feature'].iloc[:top_features_to_select]
  return best_names.tolist()

In [None]:
# Seed passed as `random_state` to the Random Forest classifier built in
# train_and_fit_random_forest, so that repeated runs are reproducible.
RANDOM_FOREST_SEED: int = 42

In [None]:
def train_and_fit_random_forest(X_train, X_test, y_train, y_test):
  """
  Fit a Random Forest classifier on the training split and score it on the test split.

  The forest is created with library defaults except for `random_state`, which is
  taken from the module-level RANDOM_FOREST_SEED. The accuracy is printed as a
  side effect and also returned.

  Parameters:
  ----------
  X_train : pd.DataFrame or np.ndarray
      Training feature matrix.
  X_test : pd.DataFrame or np.ndarray
      Test feature matrix; must have the same columns as X_train.
  y_train : pd.Series or np.ndarray
      Labels belonging to the rows of X_train.
  y_test : pd.Series or np.ndarray
      True labels belonging to the rows of X_test.

  Returns:
  float: Accuracy of the fitted model on the test split.
  """
  # fit() returns the estimator itself, so construction and training can be chained
  forest = RandomForestClassifier(random_state=RANDOM_FOREST_SEED).fit(X_train, y_train)

  # Compare the predicted test labels with the true ones
  accuracy = accuracy_score(y_test, forest.predict(X_test))
  print("Model Accuracy:", accuracy)

  # For per-class metrics, classification_report(y_test, <predictions>) could be printed here.
  return accuracy


In [None]:
def get_top_features_with_selector(selector: SelectKBest,
                                    num_of_features_to_select: int,
                                    data_with_features,
                                    target_variables,
                                    algorithm: str = "",
                                    verbose: bool = False
                                    ):
  """
  Fit a selector on the given data and return only the top-scoring feature columns.

  :param selector: SelectKBest object. It is fitted in place by this call.
  :param num_of_features_to_select: Number of highest-scoring features to keep.
  :param data_with_features: DataFrame the features are selected from; its column
      names are used as the feature names.
  :param target_variables: The target values the features are scored against.
  :param algorithm: The algorithm used, as a str. Used for debug printouts.
  :param verbose: Enable debug printouts.
  :return: `data_with_features` restricted to the selected columns, ordered from
      the highest-scoring feature to the lowest.
  """
  selector.fit(data_with_features, target_variables)

  # Rank the features with whatever scoring function the selector was built with.
  # The names must come from the data that was actually passed in (not from a
  # notebook-level global), otherwise scores and names can belong to different
  # datasets.
  top_features = get_top_k_features(selector=selector,
                                    feature_names=list(data_with_features.columns),
                                    top_features_to_select=num_of_features_to_select,
                                    algorithm=algorithm,
                                    verbose=verbose)

  return data_with_features[top_features]

In [None]:
# Experiment configuration.
# NOTE(review): presumably the smallest and largest number of top-ranked features
# to evaluate — confirm against the cells that use these constants.
MIN_FEATURES: int = 5
MAX_FEATURES: int = 10
# NOTE(review): presumably the fraction of rows held out for testing (the
# `test_size` of train_test_split) — confirm against the cell that splits the data.
TRAIN_TEST_SPLIT_RATIO: float = 0.2

Import the feature-selection scoring functions and create one `SelectKBest` selector for each of them.

In [None]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression

# Seed for the mutual-information scorers, whose estimates depend on a random
# number generator; fixing it keeps the feature ranking reproducible.
MUTUAL_INFO_SEED: int = 42


def mutual_info_classif_with_random_state(X, y):
  """Score features with mutual_info_classif using the fixed MUTUAL_INFO_SEED."""
  return mutual_info_classif(X, y, random_state=MUTUAL_INFO_SEED)


def mutual_info_regression_with_random_state(X, y):
  """Score features with mutual_info_regression using the fixed MUTUAL_INFO_SEED."""
  return mutual_info_regression(X, y, random_state=MUTUAL_INFO_SEED)


# One SelectKBest selector per scoring function (despite the `classifier_` prefix
# these are feature selectors, not classifiers). k='all' means the selector itself
# drops nothing; the per-feature scores are used for ranking elsewhere.
# NOTE(review): mutual_info_regression and f_regression are meant for continuous
# targets — confirm that using them on this target is intentional.
classifier_chi2: SelectKBest = SelectKBest(score_func=chi2, k='all')
classifier_mutual_info_classif: SelectKBest = SelectKBest(score_func=mutual_info_classif_with_random_state, k='all')
classifier_mutual_info_regression: SelectKBest = SelectKBest(score_func=mutual_info_regression_with_random_state, k='all')
classifier_f_classif: SelectKBest = SelectKBest(score_func=f_classif, k='all')
classifier_f_regression: SelectKBest = SelectKBest(score_func=f_regression, k='all')
