<a href="https://colab.research.google.com/github/alejandracuadros/quantum_ai_thesis/blob/main/Classical_feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Load dataset from UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data"
df = pd.read_csv(url, header=None)

# Assign column names
columns = [f'feature_{i}' for i in range(34)] + ['target']
df.columns = columns

# Convert target labels from 'g'/'b' to binary 1/0
df['target'] = df['target'].map({'g': 1, 'b': 0})

# Separate features and labels
X = df.drop(columns='target')
y = df['target']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print(f"Loaded Ionosphere dataset from UCI.")
print(f"Number of features: {X.shape[1]}, Samples: {X.shape[0]}")
print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")


Loaded Ionosphere dataset from UCI.
Number of features: 34, Samples: 351
Train size: 280, Test size: 71


In [6]:
def evaluate_features(X_train_sel, X_test_sel, method_name):
    """Train SVM on selected features and print accuracy."""
    model = SVC(kernel='linear')
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)
    acc = accuracy_score(y_test, y_pred)
    print(f"[{method_name}] Accuracy: {acc:.4f}")
    return acc


# ANOVA F-test
print(" Feature Selection using ANOVA F-test")
anova_selector = SelectKBest(score_func=f_classif, k=6)
X_train_anova = anova_selector.fit_transform(X_train, y_train)
X_test_anova = anova_selector.transform(X_test)
anova_acc = evaluate_features(X_train_anova, X_test_anova, "ANOVA F-test")

# Mutual Information
print("\n Feature Selection using Mutual Information")
mi_selector = SelectKBest(score_func=mutual_info_classif, k=6)
X_train_mi = mi_selector.fit_transform(X_train, y_train)
X_test_mi = mi_selector.transform(X_test)
mi_acc = evaluate_features(X_train_mi, X_test_mi, "Mutual Information")

# Recursive Feature Elimination (RFE)
print("\n Feature Selection using RFE (Logistic Regression)")
rfe_model = LogisticRegression(solver='liblinear')
rfe_selector = RFE(rfe_model, n_features_to_select=6)
rfe_selector.fit(X_train, y_train)
X_train_rfe = rfe_selector.transform(X_train)
X_test_rfe = rfe_selector.transform(X_test)
rfe_acc = evaluate_features(X_train_rfe, X_test_rfe, "RFE")

# 🔹 9. Summary Table
print("\n Summary of Feature Selection Results:")
summary = pd.DataFrame({
    'Method': ['ANOVA F-test', 'Mutual Information', 'RFE'],
    'Test Accuracy': [anova_acc, mi_acc, rfe_acc]
})
print(summary)

# 10. Save selected features to CSV
selected_feature_sets = {
    'anova': X.columns[anova_selector.get_support()],
    'mutual_info': X.columns[mi_selector.get_support()],
    'rfe': X.columns[rfe_selector.get_support()]
}
pd.DataFrame(selected_feature_sets).to_csv("../comparison/selected_features.csv", index=False)
print("\n✅ Selected feature names saved to ../comparison/selected_features.csv")


 Feature Selection using ANOVA F-test
[ANOVA F-test] Accuracy: 0.8732

 Feature Selection using Mutual Information
[Mutual Information] Accuracy: 0.8310

 Feature Selection using RFE (Logistic Regression)


  f = msb / msw


[RFE] Accuracy: 0.8592

 Summary of Feature Selection Results:
               Method  Test Accuracy
0        ANOVA F-test       0.873239
1  Mutual Information       0.830986
2                 RFE       0.859155


OSError: Cannot save file into a non-existent directory: '../comparison'

In [None]:




# Convert target from 'g'/'b' to binary 1/0
df['target'] = df['target'].map({'g': 1, 'b': 0})

# Separate features and labels
X = df.drop(columns='target')
y = df['target']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Original number of features: {X.shape[1]}")
print(f"Train set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")
print("="*50)


def evaluate_features(X_train_sel, X_test_sel, method_name):
    """Train SVM on selected features and print accuracy."""
    model = SVC(kernel='linear')
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)
    acc = accuracy_score(y_test, y_pred)
    print(f"[{method_name}] Accuracy: {acc:.4f}")
    return acc


# 1. ANOVA F-test
print("🔹 Feature Selection using ANOVA F-test")
anova_selector = SelectKBest(score_func=f_classif, k=6)
X_train_anova = anova_selector.fit_transform(X_train, y_train)
X_test_anova = anova_selector.transform(X_test)
anova_acc = evaluate_features(X_train_anova, X_test_anova, "ANOVA F-test")

# 2. Mutual Information
print("\n🔹 Feature Selection using Mutual Information")
mi_selector = SelectKBest(score_func=mutual_info_classif, k=6)
X_train_mi = mi_selector.fit_transform(X_train, y_train)
X_test_mi = mi_selector.transform(X_test)
mi_acc = evaluate_features(X_train_mi, X_test_mi, "Mutual Information")

# 3. Recursive Feature Elimination (RFE) with Logistic Regression
print("\n🔹 Feature Selection using RFE")
rfe_model = LogisticRegression(solver='liblinear')
rfe_selector = RFE(rfe_model, n_features_to_select=6)
rfe_selector.fit(X_train, y_train)
X_train_rfe = rfe_selector.transform(X_train)
X_test_rfe = rfe_selector.transform(X_test)
rfe_acc = evaluate_features(X_train_rfe, X_test_rfe, "RFE")

# Summary table
print("\n📊 Summary of Feature Selection Results:")
summary = pd.DataFrame({
    'Method': ['ANOVA F-test', 'Mutual Information', 'RFE'],
    'Test Accuracy': [anova_acc, mi_acc, rfe_acc]
})
print(summary)

# Save the selected features for later use (optional)
selected_feature_sets = {
    'anova': X.columns[anova_selector.get_support()],
    'mutual_info': X.columns[mi_selector.get_support()],
    'rfe': X.columns[rfe_selector.get_support()]
}
pd.DataFrame(selected_feature_sets).to_csv("../comparison/selected_features.csv", index=False)
