In [1]:
# Feature Selection and RFE Classification Example

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load dataset (change filename if needed)
df = pd.read_csv("prep.csv")

# Convert categorical variables
df = pd.get_dummies(df, drop_first=True)

# Define features and target
target_column = df.columns[-1]
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SelectKBest Feature Selection
selector = SelectKBest(score_func=chi2, k=5)
X_train_kbest = selector.fit_transform(abs(X_train_scaled), y_train)
X_test_kbest = selector.transform(abs(X_test_scaled))

print("Selected features (SelectKBest):", selector.get_support())

# RFE Feature Selection
model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=5)
X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe.transform(X_test_scaled)

print("Selected features (RFE):", rfe.get_support())

# Train models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}

for name, clf in models.items():
    clf.fit(X_train_rfe, y_train)
    y_pred = clf.predict(X_test_rfe)
    print(f"\n{name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Selected features (SelectKBest): [False False False False False False False False False False False False
 False  True False False False  True False  True False False False  True
 False False  True]
Selected features (RFE): [False False  True False False False False False False  True  True False
 False False  True False False False False False False False  True False
 False False False]

Logistic Regression
Accuracy: 0.9875
Confusion Matrix:
 [[38  1]
 [ 0 41]]
Classification Report:
               precision    recall  f1-score   support

       False       1.00      0.97      0.99        39
        True       0.98      1.00      0.99        41

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80


SVM
Accuracy: 0.9875
Confusion Matrix:
 [[38  1]
 [ 0 41]]
Classification Report:
               precision    recall  f1-score   support

       False       1.00      0.97      0.99 