In [45]:
!pip install ucimlrepo



In [46]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest

In [47]:
# Read first dataset
isolet = fetch_ucirepo(id=54)

X = isolet.data.features
y = isolet.data.targets

In [48]:
# Print missing values and maximum and minimum values in the features of the first dataset
X_df = pd.DataFrame(X)
y_df = pd.DataFrame(y)

print("Missing values in X:", X_df.isnull().sum().sum())
print("Missing values in y:", y_df.isnull().sum().sum())

print("Minimum value across all features:", X_df.min().min())
print("Maximum value across all features:", X_df.max().max())

Missing values in X: 0
Missing values in y: 0
Minimum value across all features: -1.0
Maximum value across all features: 1.0


In [49]:
# Normalize the first dataset
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_df)

X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

print(X_scaled_df.min().min())
print(X_scaled_df.max().max())

0.0
1.0000000000000002


In [50]:
# # Read second dataset
# spambase = fetch_ucirepo(id=94)

# X = spambase.data.features
# y = spambase.data.targets

In [51]:
# # Print missing values and maximum and minimum values in the features of the second dataset

# X_df = pd.DataFrame(X)
# y_df = pd.DataFrame(y)

# print("Missing values in X:", X_df.isnull().sum().sum())
# print("Missing values in y:", y_df.isnull().sum().sum())


# print("Minimum value across all features:", X_df.min().min())
# print("Maximum value across all features:", X_df.max().max())

In [52]:
# # Normalize the second dataset
# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X_df)

# X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns)

# print(X_scaled_df.min().min())
# print(X_scaled_df.max().max())

In [53]:
X = X_scaled_df
y = y_df.values.ravel()

In [54]:
def get_top_k_features(selector, feature_names, k, algorithm):
  """
  Get the top k features based on their scores from a SelectKBest selector.

  Parameters:
  selector (SelectKBest): Fitted SelectKBest object.
  feature_names (list): List of feature names (columns of X).
  k (int): Number of top features to select.
  algorithm (str): The name of the feature selection algorithm.

  Returns:
  list: Names of the top k features.
  """
  # Retrieve feature scores
  scores = selector.scores_

  # Create a DataFrame for ranked features
  feature_ranking = pd.DataFrame({
    'Feature': feature_names,
    'Score': scores
  }).sort_values(by='Score', ascending=False)

  # Display top-ranked features
  # print(f"Feature Rankings using {algorithm}:")
  # print(feature_ranking)

  # Return selected top k features
  return feature_ranking.head(k)['Feature'].tolist()

In [55]:
def train_and_fit_model(X_train, X_test, y_train, y_test):
  """
  Builds, trains, and evaluates a Random Forest classification model.

  Parameters:
  ----------
  X_train : pd.DataFrame or np.ndarray
      Feature matrix for training the model.
  X_test : pd.DataFrame or np.ndarray
      Feature matrix for testing the model.
  y_train : pd.Series or np.ndarray
      Target labels for training the model.
  y_test : pd.Series or np.ndarray
      True target labels for testing the model.

  Returns:
  float: The accuracy of the model on the selected features
  """
  # Build a simple classification model
  model = RandomForestClassifier(random_state=42)
  model.fit(X_train, y_train)

  # Make predictions
  y_pred = model.predict(X_test)

  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  print("Model Accuracy:", accuracy)

  # Detailed performance metrics
  # print("\nClassification Report:")
  # print(classification_report(y_test, y_pred))

  # Return the accuracy of the model
  return accuracy

In [56]:
# Apply Chi-Square
from sklearn.feature_selection import chi2

results = []
for k in range(5, 51):
  # SelectKBest with chi2 evaluates all features
  selector = SelectKBest(score_func=chi2, k='all')
  selector.fit(X, y)

  # Rank the features using Chi-Square algorithm
  top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                    k=k, algorithm="Chi-Square")

  # Reduce the dataset to the top k features
  X_top = X[top_features]

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_model(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")


Model Accuracy: 0.20064102564102565
Model Accuracy: 0.22179487179487178
Model Accuracy: 0.23653846153846153
Model Accuracy: 0.24871794871794872
Model Accuracy: 0.26346153846153847
Model Accuracy: 0.2724358974358974
Model Accuracy: 0.2935897435897436
Model Accuracy: 0.3141025641025641
Model Accuracy: 0.32371794871794873
Model Accuracy: 0.325
Model Accuracy: 0.36666666666666664
Model Accuracy: 0.3628205128205128
Model Accuracy: 0.3647435897435897
Model Accuracy: 0.39166666666666666
Model Accuracy: 0.3935897435897436
Model Accuracy: 0.39294871794871794
Model Accuracy: 0.40576923076923077
Model Accuracy: 0.40576923076923077
Model Accuracy: 0.4083333333333333
Model Accuracy: 0.4032051282051282
Model Accuracy: 0.4
Model Accuracy: 0.4269230769230769
Model Accuracy: 0.4288461538461538
Model Accuracy: 0.433974358974359
Model Accuracy: 0.43653846153846154
Model Accuracy: 0.44294871794871793
Model Accuracy: 0.43846153846153846
Model Accuracy: 0.45
Model Accuracy: 0.4737179487179487
Model Accuracy

In [58]:
# Apply Mutual Information Classification (MIC)
from sklearn.feature_selection import mutual_info_classif

# Wrap mutual_info_classif with a fixed random_state
mutual_info_classif_with_random_state = lambda X, y: mutual_info_classif(X, y, random_state=42)

results = []
for k in range(5, 51):

  # SelectKBest with mutual_info_classif evaluates all features
  selector = SelectKBest(score_func=mutual_info_classif_with_random_state, k='all')
  selector.fit(X, y)

  # Rank the features using Mutual Information Classification algorithm
  top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                    k=k, algorithm="Mutual Information Classification")

  # Reduce the dataset to the top k features
  X_top = X[top_features]

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_model(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

Model Accuracy: 0.3269230769230769
Model Accuracy: 0.3217948717948718
Model Accuracy: 0.5012820512820513
Model Accuracy: 0.5108974358974359
Model Accuracy: 0.5378205128205128
Model Accuracy: 0.5326923076923077
Model Accuracy: 0.5391025641025641
Model Accuracy: 0.5217948717948718
Model Accuracy: 0.5570512820512821
Model Accuracy: 0.5596153846153846
Model Accuracy: 0.5717948717948718
Model Accuracy: 0.573076923076923
Model Accuracy: 0.5666666666666667
Model Accuracy: 0.6185897435897436
Model Accuracy: 0.6160256410256411
Model Accuracy: 0.642948717948718
Model Accuracy: 0.6608974358974359
Model Accuracy: 0.6576923076923077
Model Accuracy: 0.6705128205128205
Model Accuracy: 0.6762820512820513
Model Accuracy: 0.6923076923076923
Model Accuracy: 0.6833333333333333
Model Accuracy: 0.6884615384615385
Model Accuracy: 0.7070512820512821
Model Accuracy: 0.7121794871794872
Model Accuracy: 0.7294871794871794
Model Accuracy: 0.7198717948717949
Model Accuracy: 0.7262820512820513
Model Accuracy: 0.7294

In [None]:
# Apply Mutual Information Regression (MIR)
from sklearn.feature_selection import mutual_info_regression

results = []
for k in range(5, 51):
  # Wrapping mutual_info_regression with a fixed random_state
  mutual_info_regression_with_random_state = lambda X, y: mutual_info_regression(X, y, random_state=42)

  # SelectKBest with mutual_info_regression evaluates all features
  selector = SelectKBest(score_func=mutual_info_regression_with_random_state, k='all')
  selector.fit(X, y)

  # Rank the features using Mutual Information Regression algorithm
  top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                    k=k, algorithm="Mutual Information Regression")

  # Reduce the dataset to the top 20 features
  X_top = X[top_features]

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_model(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

In [59]:
# Apply ANOVA F-value Classificaiton
from sklearn.feature_selection import f_classif

results = []
for k in range(5, 51):
  # SelectKBest with f_classif evaluates all features
  selector = SelectKBest(score_func=f_classif, k='all')
  selector.fit(X, y)

  # Rank the features using ANOVA F-value Classificaiton algorithm
  top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                    k=k, algorithm="ANOVA F-value Classificaiton")

  # Reduce the dataset to the top k features
  X_top = X[top_features]

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_model(X_train, X_test, y_train, y_test)

  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

Model Accuracy: 0.23333333333333334
Model Accuracy: 0.2865384615384615
Model Accuracy: 0.30833333333333335
Model Accuracy: 0.46282051282051284
Model Accuracy: 0.4980769230769231
Model Accuracy: 0.5121794871794871
Model Accuracy: 0.5487179487179488
Model Accuracy: 0.5666666666666667
Model Accuracy: 0.5858974358974359
Model Accuracy: 0.6217948717948718
Model Accuracy: 0.6384615384615384
Model Accuracy: 0.6435897435897436
Model Accuracy: 0.6724358974358975
Model Accuracy: 0.6826923076923077
Model Accuracy: 0.683974358974359
Model Accuracy: 0.6948717948717948
Model Accuracy: 0.6993589743589743
Model Accuracy: 0.7153846153846154
Model Accuracy: 0.7352564102564103
Model Accuracy: 0.7448717948717949
Model Accuracy: 0.7416666666666667
Model Accuracy: 0.7487179487179487
Model Accuracy: 0.757051282051282
Model Accuracy: 0.7480769230769231
Model Accuracy: 0.757051282051282
Model Accuracy: 0.7583333333333333
Model Accuracy: 0.7628205128205128
Model Accuracy: 0.7705128205128206
Model Accuracy: 0.76

In [62]:
# Apply ANOVA F-value Regression
from sklearn.feature_selection import f_regression

results = []
for k in range(5, 51):
  # SelectKBest with f_regression evaluates all features
  selector = SelectKBest(score_func=f_regression, k='all')
  selector.fit(X, y)

  # Rank the features using ANOVA F-value Classificaiton algorithm
  top_features = get_top_k_features(selector=selector, feature_names=X.columns,
                                    k=k, algorithm="ANOVA F-value Regression")

  # Reduce the dataset to the top k features
  X_top = X[top_features]

  # Split the data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

  # Train and fit random forest classification model based on feature selected
  accuracy = train_and_fit_model(X_train, X_test, y_train, y_test)
  results.append((k, accuracy))

# Find the best k
best_k, best_accuracy = max(results, key=lambda x: x[1])

print(f"Best k: {best_k}, Best Accuracy: {best_accuracy}")

Model Accuracy: 0.258974358974359
Model Accuracy: 0.30512820512820515
Model Accuracy: 0.35512820512820514
Model Accuracy: 0.4025641025641026
Model Accuracy: 0.46025641025641023
Model Accuracy: 0.48012820512820514
Model Accuracy: 0.5012820512820513
Model Accuracy: 0.5032051282051282
Model Accuracy: 0.517948717948718
Model Accuracy: 0.5352564102564102
Model Accuracy: 0.5365384615384615
Model Accuracy: 0.5307692307692308
Model Accuracy: 0.5371794871794872
Model Accuracy: 0.5551282051282052
Model Accuracy: 0.5532051282051282
Model Accuracy: 0.5814102564102565
Model Accuracy: 0.5897435897435898
Model Accuracy: 0.6006410256410256
Model Accuracy: 0.6
Model Accuracy: 0.6057692307692307
Model Accuracy: 0.6006410256410256
Model Accuracy: 0.6044871794871794
Model Accuracy: 0.610897435897436
Model Accuracy: 0.6121794871794872
Model Accuracy: 0.610897435897436
Model Accuracy: 0.6102564102564103
Model Accuracy: 0.6147435897435898
Model Accuracy: 0.6141025641025641
Model Accuracy: 0.6185897435897436
