In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SequentialFeatureSelector,RFE, f_classif, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [39]:
X,y = fetch_openml(name='madelon',version=1,as_frame=True,return_X_y=True)
y=y.astype(int)

In [40]:
X.shape

(2600, 500)

In [41]:
y.shape

(2600,)

In [42]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

In [43]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.fit_transform(x_test)

In [44]:
logreg = LogisticRegression(max_iter=1000)
dtree = DecisionTreeClassifier(random_state=42)                 

In [45]:
logreg.fit(X_train_scaled,y_train)
dtree.fit(X_train_scaled,y_train)

In [46]:
print(f"Logistic Regression: {logreg.score(X_test_scaled,y_test)*100:.2f}%")
print(f"Decision Tree Classifier: {dtree.score(X_test_scaled,y_test)*100:.2f}%")

Logistic Regression: 55.51%
Decision Tree Classifier: 73.33%


In [50]:
sfs_forward = SequentialFeatureSelector(logreg, n_features_to_select=25, direction='forward')

# Train data: fit + transform
X_train_fwd = sfs_forward.fit_transform(X_train_scaled, y_train)

# Test data: فقط transform (من غير fit)
X_test_fwd = sfs_forward.transform(X_test_scaled)


In [53]:
logreg_forward = LogisticRegression(max_iter=1000)
logreg_forward.fit(X_train_fwd,y_train)
print(f"Logistic Regression after Forward Selection: {logreg_forward.score(X_test_fwd,y_test)*100:.2f}%")

Logistic Regression after Forward Selection: 64.36%


In [56]:
dtree_forward = DecisionTreeClassifier(random_state=42)
dtree_forward.fit(X_train_fwd,y_train)
print(f"Decision Tree Classifier after Forward Selection: {dtree_forward.score(X_test_fwd,y_test)*100:.2f}%")

Decision Tree Classifier after Forward Selection: 58.72%


In [63]:
sfs_backward = SequentialFeatureSelector(LogisticRegression(max_iter=1000),
                                        n_features_to_select=25, 
                                        direction='backward')

In [None]:
sfs_backward.fit(X_train_scaled,y_train)
X_train_bwd = sfs_backward.transform(X_train_scaled)
X_test_bwd = sfs_backward.transform(X_test_scaled)

In [67]:
for k in [10, 20, 25, 30]:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = selector.fit_transform(X_train_scaled, y_train)
    X_test_kbest = selector.transform(X_test_scaled)


    logreg_kbest = LogisticRegression(max_iter=1000)
    logreg_kbest.fit(X_train_kbest, y_train)
    print(f"Logistic Regression with K={k} features: {logreg_kbest.score(X_test_kbest,y_test)*100:.2f}%")

Logistic Regression with K=10 features: 64.23%
Logistic Regression with K=20 features: 59.62%
Logistic Regression with K=25 features: 59.74%
Logistic Regression with K=30 features: 57.56%


In [68]:
for k in [10, 20, 25, 30]:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_kbest = selector.fit_transform(X_train_scaled, y_train)
    X_test_kbest = selector.transform(X_test_scaled)
    dtree_kbest = DecisionTreeClassifier(random_state=42)
    dtree_kbest.fit(X_train_kbest, y_train)
    print(f"Decision Tree Classifier with K={k} features: {dtree_kbest.score(X_test_kbest,y_test)*100:.2f}%")

Decision Tree Classifier with K=10 features: 76.92%
Decision Tree Classifier with K=20 features: 77.56%
Decision Tree Classifier with K=25 features: 76.03%
Decision Tree Classifier with K=30 features: 77.18%


In [71]:
rfe = RFE(
    estimator=LogisticRegression(max_iter=1000),
    n_features_to_select=15,
)

rfe.fit(X_train_scaled,y_train)
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)
logreg_rfe = LogisticRegression(max_iter=1000)
logreg_rfe.fit(X_train_rfe,y_train)
print(f"Logistic Regression after RFE: {logreg_rfe.score(X_test_rfe,y_test)*100:.2f}%")

Logistic Regression after RFE: 59.36%


In [73]:
rfe = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=15,
)

rfe.fit(X_train_scaled,y_train)
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)
dt_rfe = DecisionTreeClassifier(random_state=42)
dt_rfe.fit(X_train_rfe,y_train)
print(f"Desision tree after RFE: {dt_rfe.score(X_test_rfe,y_test)*100:.2f}%")

Desision tree after RFE: 81.28%


In [76]:
lasso = LogisticRegression(penalty='l2', max_iter=1000)
lasso.fit(X_train_scaled,y_train)
selected_features = np.where(lasso.coef_[0] != 0)[0]
X_train_lasso = X_train_scaled[:,selected_features]
X_test_lasso = X_test_scaled[:,selected_features]

In [78]:
logreg_lasso = LogisticRegression(max_iter=1000)
logreg_lasso.fit(X_train_lasso,y_train)
print(f"Logistic Regression after Lasso: {logreg_lasso.score(X_test_lasso,y_test)*100:.2f}%")

Logistic Regression after Lasso: 55.51%


In [80]:
pca = PCA(n_components=20, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
logreg_pca = LogisticRegression(max_iter=1000)
logreg_pca.fit(X_train_pca,y_train)
print(f"Logistic Regression after PCA: {logreg_pca.score(X_test_pca,y_test)*100:.2f}%")

Logistic Regression after PCA: 62.69%


In [82]:
dttree_pca = DecisionTreeClassifier(random_state=42)
dttree_pca.fit(X_train_pca,y_train)
print(f"Decision Tree Classifier after PCA: {dttree_pca.score(X_test_pca,y_test)*100:.2f}%")

Decision Tree Classifier after PCA: 67.69%
