# Recursive Feature Elimination and LDA

In [None]:
# Install the packages listed in requirements.txt; --quiet suppresses pip's output
!pip install -r requirements.txt --quiet

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import RFECV, SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

## Load Dataset

In [None]:
# Load the breast-cancer dataset and hold out 20% of the rows for testing;
# the fixed seed keeps the split reproducible.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Fit a baseline model before any transformations

In [None]:
# `results` maps a method name to [accuracy, F1]; it is filled in by the
# later cells and turned into a comparison table at the end.
results = {}

# Reference point: XGBoost trained on the raw, untransformed features
baseline_model = XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', random_state=42
)
baseline_model.fit(X_train, y_train)

y_pred_base = baseline_model.predict(X_test)
results['Baseline'] = [
    metric(y_test, y_pred_base) for metric in (accuracy_score, f1_score)
]

## Use LDA to reduce the complexity of the data

In [None]:
# Learn the scaling statistics from the training split only, then apply
# the same scaling to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit LDA on the scaled training data, then project both splits onto its
# single discriminant component.
lda = LDA(n_components=1).fit(X_train_scaled, y_train)
X_train_lda = lda.transform(X_train_scaled)
X_test_lda = lda.transform(X_test_scaled)

In [None]:
# Train XGBoost on the one-component LDA projection
lda_model = XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', random_state=42
)
lda_model.fit(X_train_lda, y_train)

In [None]:
# Score the LDA-only pipeline on the held-out split
y_pred_lda = lda_model.predict(X_test_lda)
results['LDA Only'] = [
    metric(y_test, y_pred_lda) for metric in (accuracy_score, f1_score)
]

## Recursive Feature Elimination

In [None]:
# Estimator handed to RFECV in the next cell
rfecv_model = XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', random_state=42
)

In [None]:
# Cross-validated recursive feature elimination: one feature is removed per
# step, and candidate subsets are scored with 5-fold F1.
rfecv = RFECV(estimator=rfecv_model, step=1, cv=5, scoring='f1')
rfecv.fit(X_train, y_train)

# Keep only the selected columns in both splits
X_train_rfecv = rfecv.transform(X_train)
X_test_rfecv = rfecv.transform(X_test)

In [None]:
# Rebind `rfecv_model` to a fresh classifier and train it on the
# RFECV-selected features only.
rfecv_model = XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', random_state=42
)
rfecv_model.fit(X_train_rfecv, y_train)

In [None]:
# Score the RFECV-only pipeline on the held-out split
y_pred_rfecv = rfecv_model.predict(X_test_rfecv)
results['RFECV Only'] = [
    metric(y_test, y_pred_rfecv) for metric in (accuracy_score, f1_score)
]

### RFECV and LDA

In [None]:
# Standardise the RFECV-selected features (statistics come from train only)
scaler_rfecv = StandardScaler()
X_train_rfecv_scaled = scaler_rfecv.fit_transform(X_train_rfecv)
X_test_rfecv_scaled = scaler_rfecv.transform(X_test_rfecv)

# Project the scaled, selected features onto a single LDA component
lda_rfecv = LDA(n_components=1)
X_train_rfecv_lda = lda_rfecv.fit_transform(X_train_rfecv_scaled, y_train)
X_test_rfecv_lda = lda_rfecv.transform(X_test_rfecv_scaled)

# Fixed: the keyword was misspelled `randpom_state`, so the seed was never
# applied and this model was configured differently from the others.
rfecv_lda_model = XGBClassifier(use_label_encoder=False, 
                                eval_metric='logloss', 
                                random_state=42)

rfecv_lda_model.fit(X_train_rfecv_lda, y_train)
y_pred_rfecv_lda = rfecv_lda_model.predict(X_test_rfecv_lda)
results['RFECV + LDA'] = [accuracy_score(y_test, y_pred_rfecv_lda), 
                          f1_score(y_test, y_pred_rfecv_lda)]


### RFECV + LDA + RandomSearch

In [None]:
# Candidate hyper-parameter values sampled by the randomized search
param_grid = {
    'n_estimators': np.arange(50, 100, 10),
    'max_depth': np.arange(3, 6),
    'learning_rate': np.linspace(0.05, 0.2, 5),
    'subsample': [0.6, 0.8, 1.0],
}

# 5 sampled configurations, each scored with 5-fold F1, using all cores
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', random_state=42
    ),
    param_distributions=param_grid,
    n_iter=5,
    cv=5,
    scoring='f1',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train_rfecv_lda, y_train)

# Evaluate the best configuration found on the held-out split
best_model = random_search.best_estimator_
y_pred_final = best_model.predict(X_test_rfecv_lda)
results['RFECV + LDA + RS'] = [
    accuracy_score(y_test, y_pred_final),
    f1_score(y_test, y_pred_final),
]

## Chi-Square

In [None]:
def feature_chi2_selection(X, y, k=5):
    """Select the ``k`` highest-scoring features using the chi-squared test.

    ``X`` is shifted by ``X.min()`` before scoring so the values handed to
    ``chi2`` are non-negative. For a NumPy array ``X.min()`` is one scalar
    (the minimum over the whole matrix), not a per-column minimum.

    Args:
        X: Feature matrix supporting ``.min()`` and subtraction.
        y: Target labels.
        k: Number of features to keep.

    Returns:
        A tuple ``(X_new, selector)``: the shifted matrix restricted to the
        selected columns, and the fitted ``SelectKBest`` instance.
    """
    shifted = X - X.min()
    selector = SelectKBest(score_func=chi2, k=k).fit(shifted, y)
    return selector.transform(shifted), selector

In [None]:
# Pick the 5 best features by chi-squared score on the training split
X_chi2_train, chi2_selector = feature_chi2_selection(X_train, 
                                                   y_train, 
                                                   k=5)
# Fixed: shift the test split by the TRAINING minimum, which is the offset
# feature_chi2_selection applied to X_train. Using X_test.min() would put the
# two splits on different scales whenever their minima differ.
X_chi2_test = chi2_selector.transform(X_test - X_train.min())

In [None]:
# Train XGBoost on the chi-squared-selected features and predict the test split
chi2_only_model = XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', random_state=42
)
chi2_only_model.fit(X_chi2_train, y_train)
y_pred_chi2_only = chi2_only_model.predict(X_chi2_test)

In [None]:
# Record [accuracy, F1] for the chi-squared-only pipeline
results['Chi2 Only'] = [
    metric(y_test, y_pred_chi2_only) for metric in (accuracy_score, f1_score)
]

## Chi-Square + RFECV

In [None]:
# Run cross-validated recursive elimination on top of the chi-squared
# selection: one feature removed per step, subsets scored with 5-fold F1.
rfecv_chi2_model = XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', random_state=42
)
rfecv_chi2 = RFECV(estimator=rfecv_chi2_model, step=1, cv=5, scoring='f1')
rfecv_chi2.fit(X_chi2_train, y_train)

# Keep only the surviving columns in both splits
X_train_chi2_rfecv = rfecv_chi2.transform(X_chi2_train)
X_test_chi2_rfecv = rfecv_chi2.transform(X_chi2_test)

In [None]:
# Train on the features kept by both selection stages, then predict the
# held-out split (fit returns the estimator, so the calls can be chained).
y_pred_chi2_rfecv = rfecv_chi2_model.fit(
    X_train_chi2_rfecv, y_train
).predict(X_test_chi2_rfecv)

In [None]:
# Record [accuracy, F1] for the chi-squared + RFECV pipeline
results['Chi2 + RFECV'] = [
    metric(y_test, y_pred_chi2_rfecv) for metric in (accuracy_score, f1_score)
]

## Evaluate and Compare Methods

In [None]:
# One row per method, with columns 'Accuracy' and 'F1 Score'
comparison_df = pd.DataFrame(results, 
                             index=['Accuracy', 'F1 Score']).T
# Fixed: print the whole table. `.head()` shows only the first 5 rows, which
# hid the last two of the seven methods stored in `results`.
print(comparison_df)

In [None]:
# Move the method names out of the index into a regular 'Method' column
comparison_df_reset = comparison_df.reset_index()
comparison_df_reset = comparison_df_reset.rename(columns={'index': 'Method'})
comparison_df_reset

In [None]:
# Project-local plotting helper; it receives the comparison table and the
# two metric columns to draw.
from hyperparameter.visualize import plot_hp_comparison

plot_hp_comparison(
    comparison_df_reset,
    ['Accuracy', 'F1 Score'],
    title='Feature Selection and Dimensionality Reduction comparison',
    figsize=(16, 8),
    palette='Greys',
    x_label_rotation=None,
    label_fontsize=12,
    label_position='center',
)
