# "LA_ID_12M_cont_0pad_PlusBasic" feature selection - Boruta

In [None]:
!pip install boruta

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st
import sklearn.ensemble as en
import sklearn.feature_selection as fs
import sklearn.metrics as mt
import sklearn.model_selection as ms
import sklearn.preprocessing as pp
import sklearn.tree as tr
import time as tm
import xgboost as xgb
from boruta import BorutaPy
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Seed passed as `random_state` to the cross-validation splitter and to BorutaPy.
RANDOM_STATE = sum((4, 8, 15, 16, 23, 42))
NUM_FEATURES_TO_SELECT = 10
CROSS_VALIDATION_FOLDS = 10  # options: 3, 5, 10
CROSS_VALIDATION_REPEATS = 100  # options: 10, 100, 1000

In [None]:
# Load the feature table and the label table from their CSV files.
features_csv_path = "/content/drive/MyDrive/Documentos/7. educación/uniovi/master/tfm/data/LA_ID_12M_cont_0pad_PlusBasic_features.csv"
labels_csv_path = "/content/drive/MyDrive/Documentos/7. educación/uniovi/master/tfm/data/LA_ID_12M_cont_0pad_PlusBasic_labels.csv"
X_raw = pd.read_csv(features_csv_path)
y_raw = pd.read_csv(labels_csv_path)

# Drop the first column of the label table and flatten the rest to 1-D
# (the labels are expected to sit in the second column).
label_columns = y_raw.iloc[:, 1:]
y = label_columns.values.ravel()

In [None]:
# Keep the raw column names so they can be mapped through the pipeline.
original_feature_names = list(X_raw.columns)

# Preprocessing: variance-based feature elimination (default settings)
# followed by standardization.
preprocessing_steps = [
    ("feature_elimination", fs.VarianceThreshold()),
    ("scaler", pp.StandardScaler()),
]
pipe = Pipeline(preprocessing_steps)
X_preprocessed = pipe.fit_transform(X_raw)

# Recover the names of the columns that survived the pipeline and wrap the
# transformed array back into a labelled DataFrame.
preprocessed_feature_names = pipe.get_feature_names_out(
    input_features=original_feature_names
)
X = pd.DataFrame(data=X_preprocessed, columns=preprocessed_feature_names)

In [None]:
# Cross-validation strategy: repeated stratified K-fold, configured from the
# notebook-level constants.
cv_strategy = ms.RepeatedStratifiedKFold(
    n_splits=CROSS_VALIDATION_FOLDS,
    n_repeats=CROSS_VALIDATION_REPEATS,
    random_state=RANDOM_STATE,
)

In [None]:
# Record the start time of the feature-selection run.
start_time = tm.time()

# Base learner handed to Boruta.
estimator = en.RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    max_depth=5,
)

# Boruta wrapper around the random forest, seeded with the notebook-wide seed.
selector = BorutaPy(
    estimator=estimator,
    n_estimators="auto",
    random_state=RANDOM_STATE,
)
selector.fit(X.values, y)  # fitted on the plain array, not the DataFrame

# Boolean mask of the features Boruta kept, then the matching columns/names.
selected_features_mask = selector.support_
X_selected = X.loc[:, selected_features_mask]
selected_feature_names = X_selected.columns.tolist()

# Record the end time of the run.
end_time = tm.time()

# Report: elapsed time, method used, feature counts and the selected names.
elapsed_seconds = end_time - start_time
total_feature_count = len(original_feature_names)
selected_feature_count = X_selected.shape[1]

print(f"*** Tiempo: {elapsed_seconds:.2f} segundos")
print("*** Selector: BorutaPy")
print("*** Estimador: RandomForestClassifier")
print(
    "*** Características totales/seleccionadas: "
    f"{total_feature_count}/{selected_feature_count}"
)
print()
for feature_name in selected_feature_names:
    print(f"  - {feature_name}")
print("-" * 60)