In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Load the dataset
url = "https://raw.githubusercontent.com/akay6483/mmml-ecommerce/refs/heads/main/dataset/online_shoppers_intention.csv"
df = pd.read_csv(url)


# 2. Preprocessing
# Encode categorical variables
categorical_cols = ['Month', 'VisitorType', 'Weekend', 'Revenue']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Define X (Features) and y (Target)
X = df.drop('Revenue', axis=1)
y = df['Revenue']

# Standardize features (Important for many models used in selection like LogReg)
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 3. Define the Estimator for Selection
# We need a base model to evaluate feature subsets.
# Logistic Regression is a good standard, but Decision Trees work well too.
model = LogisticRegression(max_iter=1000)

# 4. Implement Sequential Forward Selection (SFS)
print("Running Sequential Forward Selection... (This may take a moment)")

# 'n_features_to_select' can be a number (e.g., 5) or 'auto'
# 'direction' can be 'forward' (start with 0, add 1) or 'backward' (start with all, remove 1)
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=5,
    direction='forward',
    scoring='accuracy',
    cv=3
)

sfs.fit(X_scaled, y)

# 5. Get Selected Features
selected_indices = sfs.get_support()
selected_features = X.columns[selected_indices]

print(f"\nTop 5 Features Selected: {list(selected_features)}")

# 6. Validation: Compare Performance
# Train on All Features
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
acc_all = accuracy_score(y_test, model.predict(X_test))

# Train on Selected Features
X_train_sfs = sfs.transform(X_train)
X_test_sfs = sfs.transform(X_test)
model.fit(X_train_sfs, y_train)
acc_sfs = accuracy_score(y_test, model.predict(X_test_sfs))

print(f"\nAccuracy (All 17 Features): {acc_all:.4f}")
print(f"Accuracy (Top 5 Features):  {acc_sfs:.4f}")
print(f"Difference: {acc_sfs - acc_all:.4f}")

Running Sequential Forward Selection... (This may take a moment)

Top 5 Features Selected: ['Informational_Duration', 'ExitRates', 'PageValues', 'Month', 'VisitorType']

Accuracy (All 17 Features): 0.8805
Accuracy (Top 5 Features):  0.8797
Difference: -0.0008
