In [1]:
!pip install ucimlrepo



In [2]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest

In [None]:
ISOLET_DB_INDEX: int = 54
SPAMBASE_DB_INDEX: int = 94

In [None]:
# Read first dataset
isolet = fetch_ucirepo(id=ISOLET_DB_INDEX)

features = isolet.data.features
target_variables = isolet.data.targets


In [13]:

# Print missing values and maximum and minimum values in the features of the first dataset
X_df = pd.DataFrame(features)
y_df = pd.DataFrame(target_variables)

print("Missing values in X:", X_df.isnull().sum().sum())
print("Missing values in y:", y_df.isnull().sum().sum())

print("Minimum value across all features:", X_df.min().min())
print("Maximum value across all features:", X_df.max().max())

Missing values in X: 0
Missing values in y: 0
Minimum value across all features: -1.0
Maximum value across all features: 1.0


In [14]:
# Normalize the first dataset
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_df)

X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

print(X_scaled_df.min().min())
print(X_scaled_df.max().max())

0.0
1.0


In [15]:
RANDOM_FOREST_SEED: int = 42

In [16]:
# # Read second dataset
# spambase = fetch_ucirepo(id=94)

# X = spambase.data.features
# y = spambase.data.targets

In [17]:
# # Print missing values and maximum and minimum values in the features of the second dataset

# X_df = pd.DataFrame(X)
# y_df = pd.DataFrame(y)

# print("Missing values in X:", X_df.isnull().sum().sum())
# print("Missing values in y:", y_df.isnull().sum().sum())


# print("Minimum value across all features:", X_df.min().min())
# print("Maximum value across all features:", X_df.max().max())

In [18]:
# # Normalize the second dataset
# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X_df)

# X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

# print(X_scaled_df.min().min())
# print(X_scaled_df.max().max())


In [19]:
data_after_scaling = X_scaled_df
target_variables = y_df.values.ravel()


In [20]:
def get_top_k_features(selector, feature_names: list, top_features_to_select: int, algorithm: str, verbose: bool = False):
  """
  Get the top k features based on their scores from a SelectKBest selector.

  Parameters:
  selector (SelectKBest): Fitted SelectKBest object.
  feature_names (list): List of feature names (columns of X).
  k (int): Number of top features to select.
  algorithm (str): The name of the feature selection algorithm.

  Returns:
  list: Names of the top k features.
  """
  # Retrieve feature scores
  scores = selector.scores_

  # Create a DataFrame for ranked features
  feature_ranking = pd.DataFrame({
    'Feature': feature_names,
    'Score': scores
  }).sort_values(by='Score', ascending=False)
  if verbose:
    # Display top-ranked features
    print(f"Feature Rankings using {algorithm}:")
    print(feature_ranking)


  # Return selected top k features
  return feature_ranking.head(top_features_to_select)['Feature'].tolist()

In [21]:
def train_and_fit_random_forest(X_train, X_test, y_train, y_test):
  """
  Builds, trains, and evaluates a Random Forest classification model.

  Parameters:
  ----------
  X_train : pd.DataFrame or np.ndarray
      Feature matrix for training the model.
  X_test : pd.DataFrame or np.ndarray
      Feature matrix for testing the model.
  y_train : pd.Series or np.ndarray
      Target labels for training the model.
  y_test : pd.Series or np.ndarray
      True target labels for testing the model.

  Returns:
  float: The accuracy of the model on the selected features
  """
  # Build a simple classification model
  model = RandomForestClassifier(random_state=RANDOM_FOREST_SEED)
  model.fit(X_train, y_train)

  # Make predictions
  y_pred = model.predict(X_test)

  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  print("Model Accuracy:", accuracy)

  # Detailed performance metrics
  # print("\nClassification Report:")
  # print(classification_report(y_test, y_pred))

  # Return the accuracy of the model
  return accuracy




In [22]:
def get_top_features_with_selector(selector: SelectKBest,
                                    num_of_features_to_select: int,
                                    data_with_features,
                                    target_variables,
                                    algorithm: str = "",
                                    verbose: bool = False
                                    ):
      """
      :param selector: SelectKBest object.
      :param num_of_features_to_select:
      :param data_with_features: The features are selected from this data.
      :param target_variables: The variable the feature selection is used on.
      :param algorithm: The algorithm used, as a str. Used for debug printouts.
      :param verbose: Enable debug printouts.
      :return:
      """
      selector.fit(data_with_features, target_variables)

      # Rank the features using Chi-Square algorithm
      top_features = get_top_k_features(selector=selector, feature_names=features.columns,
                                        top_features_to_select=num_of_features_to_select, algorithm=algorithm,
                                        verbose=verbose)

      return data_with_features[top_features]

In [23]:
MIN_FEATURES: int = 5
MAX_FEATURES: int = 10
TRAIN_TEST_SPLIT_RATIO: float = 0.2

In [24]:

# Apply Chi-Square
from sklearn.feature_selection import chi2

results = []
for k in range(MIN_FEATURES, MAX_FEATURES):
  # SelectKBest with chi2 evaluates all features
  X_top = get_top_features_with_selector(
      selector=SelectKBest(score_func=chi2, k='all'),
      num_of_features_to_select=k,
      data_with_features=data_after_scaling,
      target_variables=target_variables,
      algorithm="Chi-Square",
      verbose=True
  )

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, target_variables, test_size=TRAIN_TEST_SPLIT_RATIO, random_state=42)

  # Train and fit random forest classification model based on feature selected
  print("---------------------------------")
  print(f'Amount of features selected: {k}')
  accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")


Feature Rankings using Chi-Square:
        Feature     Score
0    Attribute1  8.238247
4    Attribute5  7.879616
2    Attribute3  7.118123
6    Attribute7  5.559238
30  Attribute31  3.662940
28  Attribute29  2.632248
8    Attribute9  2.587422
32  Attribute33  2.416334
14  Attribute15  2.382145
20  Attribute21  2.347565
22  Attribute23  1.955440
7    Attribute8  1.826200
12  Attribute13  1.596337
13  Attribute14  1.521784
24  Attribute25  1.485292
10  Attribute11  1.061258
11  Attribute12  0.948956
15  Attribute16  0.759780
5    Attribute6  0.740302
18  Attribute19  0.696323
21  Attribute22  0.631219
17  Attribute18  0.617339
3    Attribute4  0.517440
9   Attribute10  0.504685
26  Attribute27  0.373407
16  Attribute17  0.366595
33  Attribute34  0.155793
27  Attribute28  0.104016
19  Attribute20  0.061299
31  Attribute32  0.060061
23  Attribute24  0.001981
29  Attribute30  0.000722
25  Attribute26  0.000116
1    Attribute2       NaN
---------------------------------
Amount of features se

In [25]:
# Apply Mutual Information Classification (MIC)
from sklearn.feature_selection import mutual_info_classif

# Wrap mutual_info_classif with a fixed random_state
mutual_info_classif_with_random_state = lambda X, y: mutual_info_classif(X, y, random_state=42)

results = []
for k in range(MIN_FEATURES, MAX_FEATURES):
  # SelectKBest with mutual_info_classification evaluates all features
  X_top = get_top_features_with_selector(
      selector=SelectKBest(score_func=mutual_info_classif_with_random_state, k='all'),
      num_of_features_to_select=k,
      data_with_features=data_after_scaling,
      target_variables=target_variables,
      algorithm="Mutual Information Classification",
      verbose=True
  )

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, target_variables, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

Feature Rankings using Mutual Information Classification:
        Feature     Score
5    Attribute6  0.297178
4    Attribute5  0.292443
7    Attribute8  0.286131
20  Attribute21  0.279286
26  Attribute27  0.276162
32  Attribute33  0.259260
30  Attribute31  0.249639
28  Attribute29  0.248604
2    Attribute3  0.245162
13  Attribute14  0.243970
12  Attribute13  0.241300
15  Attribute16  0.238697
6    Attribute7  0.232773
23  Attribute24  0.221345
33  Attribute34  0.220364
14  Attribute15  0.217917
22  Attribute23  0.215758
31  Attribute32  0.207568
24  Attribute25  0.204661
21  Attribute22  0.201930
8    Attribute9  0.200808
3    Attribute4  0.196269
25  Attribute26  0.193824
17  Attribute18  0.189906
9   Attribute10  0.189552
11  Attribute12  0.185869
19  Attribute20  0.185076
27  Attribute28  0.183910
10  Attribute11  0.172547
18  Attribute19  0.160522
29  Attribute30  0.154623
16  Attribute17  0.152221
0    Attribute1  0.097375
1    Attribute2  0.000000
Model Accuracy: 0.92957746478873

In [26]:
# Apply Mutual Information Regression (MIR)
from sklearn.feature_selection import mutual_info_regression

results = []

# Wrapping mutual_info_regression with a fixed random_state
mutual_info_regression_with_random_state = lambda X, y: mutual_info_regression(X, y, random_state=42)

for k in range(MIN_FEATURES, MAX_FEATURES):
  # SelectKBest with mutual_info_regression evaluates all features
  X_top = get_top_features_with_selector(
      selector=SelectKBest(score_func=mutual_info_regression_with_random_state, k='all'),
      num_of_features_to_select=k,
      data_with_features=data_after_scaling,
      target_variables=target_variables,
      algorithm="Mutual Information Regression",
      verbose=True
  )
  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, target_variables, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

ValueError: could not convert string to float: 'g'

In [59]:
# Apply ANOVA F-value Classification
from sklearn.feature_selection import f_classif

results = []
for k in range(MIN_FEATURES, MAX_FEATURES):
  # SelectKBest with f_classif evaluates all features
  X_top = get_top_features_with_selector(
      selector=SelectKBest(score_func=f_classif, k='all'),
      num_of_features_to_select=k,
      data_with_features=data_after_scaling,
      target_variables=target_variables,
      algorithm="ANOVA F-value Classificaiton",
      verbose=True
  )
  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, target_variables, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

Model Accuracy: 0.23333333333333334
Model Accuracy: 0.2865384615384615
Model Accuracy: 0.30833333333333335
Model Accuracy: 0.46282051282051284
Model Accuracy: 0.4980769230769231
Model Accuracy: 0.5121794871794871
Model Accuracy: 0.5487179487179488
Model Accuracy: 0.5666666666666667
Model Accuracy: 0.5858974358974359
Model Accuracy: 0.6217948717948718
Model Accuracy: 0.6384615384615384
Model Accuracy: 0.6435897435897436
Model Accuracy: 0.6724358974358975
Model Accuracy: 0.6826923076923077
Model Accuracy: 0.683974358974359
Model Accuracy: 0.6948717948717948
Model Accuracy: 0.6993589743589743
Model Accuracy: 0.7153846153846154
Model Accuracy: 0.7352564102564103
Model Accuracy: 0.7448717948717949
Model Accuracy: 0.7416666666666667
Model Accuracy: 0.7487179487179487
Model Accuracy: 0.757051282051282
Model Accuracy: 0.7480769230769231
Model Accuracy: 0.757051282051282
Model Accuracy: 0.7583333333333333
Model Accuracy: 0.7628205128205128
Model Accuracy: 0.7705128205128206
Model Accuracy: 0.76

In [62]:
# Apply ANOVA F-value Regression
from sklearn.feature_selection import f_regression

results = []
for k in range(MIN_FEATURES, MAX_FEATURES):
  # SelectKBest with f_regression evaluates all features
  X_top = get_top_features_with_selector(
      selector=SelectKBest(score_func=f_regression, k='all'),
      num_of_features_to_select=k,
      data_with_features=data_after_scaling,
      target_variables=target_variables,
      algorithm="ANOVA F-value Regression",
      verbose=True
  )
  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, target_variables, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_random_forest(X_train, X_test, y_train, y_test)
  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

Model Accuracy: 0.258974358974359
Model Accuracy: 0.30512820512820515
Model Accuracy: 0.35512820512820514
Model Accuracy: 0.4025641025641026
Model Accuracy: 0.46025641025641023
Model Accuracy: 0.48012820512820514
Model Accuracy: 0.5012820512820513
Model Accuracy: 0.5032051282051282
Model Accuracy: 0.517948717948718
Model Accuracy: 0.5352564102564102
Model Accuracy: 0.5365384615384615
Model Accuracy: 0.5307692307692308
Model Accuracy: 0.5371794871794872
Model Accuracy: 0.5551282051282052
Model Accuracy: 0.5532051282051282
Model Accuracy: 0.5814102564102565
Model Accuracy: 0.5897435897435898
Model Accuracy: 0.6006410256410256
Model Accuracy: 0.6
Model Accuracy: 0.6057692307692307
Model Accuracy: 0.6006410256410256
Model Accuracy: 0.6044871794871794
Model Accuracy: 0.610897435897436
Model Accuracy: 0.6121794871794872
Model Accuracy: 0.610897435897436
Model Accuracy: 0.6102564102564103
Model Accuracy: 0.6147435897435898
Model Accuracy: 0.6141025641025641
Model Accuracy: 0.6185897435897436
