# "LA_ID_12M_cont_0pad_PlusBasic" feature selection - SVC no lineal

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st
import sklearn.ensemble as en
import sklearn.feature_selection as fs
import sklearn.linear_model as lm
import sklearn.metrics as mt
import sklearn.model_selection as ms
import sklearn.preprocessing as pp
import sklearn.svm as svm
import sklearn.tree as tr
import time as tm
import xgboost as xgb
from collections import Counter
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [None]:
# Seed shared by every randomized component in the notebook (evaluates to 108).
RANDOM_STATE = sum((4, 8, 15, 16, 23, 42))
# Lower bound of the feature-count sweep and number of feature names printed
# at the end of the notebook.
NUM_FEATURES_TO_SELECT = 10
# Number of folds per repetition; author's note lists 3, 5, 10 as candidates.
CROSS_VALIDATION_FOLDS = 10
# Number of repetitions; author's note lists 10, 100, 1000 as candidates.
CROSS_VALIDATION_REPEATS = 100

In [None]:
# Load the feature table and the label table from their CSV files.
X_raw = pd.read_csv(
    "/content/drive/MyDrive/Documentos/7. educación/uniovi/master/tfm/data/LA_ID_12M_cont_0pad_PlusBasic_features.csv"
)
y_raw = pd.read_csv(
    "/content/drive/MyDrive/Documentos/7. educación/uniovi/master/tfm/data/LA_ID_12M_cont_0pad_PlusBasic_labels.csv"
)
# Author's note: the labels live in the second column. The slice takes every
# column from the second one onwards and flattens the result to 1-D.
y = np.ravel(y_raw.iloc[:, 1:].values)

In [None]:
# Remember the raw column names so the surviving ones can be recovered after
# the pipeline has run.
original_feature_names = X_raw.columns.tolist()

# Preprocessing: VarianceThreshold with its default settings (per the
# scikit-learn docs this drops zero-variance columns), then standardization.
pipe = Pipeline(
    steps=[
        ("feature_elimination", fs.VarianceThreshold()),
        ("scaler", pp.StandardScaler()),
    ]
)
X_preprocessed = pipe.fit_transform(X_raw)
preprocessed_feature_names = pipe.get_feature_names_out(
    input_features=original_feature_names
)
# Wrap the transformed array back into a DataFrame labelled with the names of
# the columns that survived the pipeline.
X = pd.DataFrame(data=X_preprocessed, columns=preprocessed_feature_names)

In [None]:
# Cross-validation strategy: repeated stratified K-fold, seeded with
# RANDOM_STATE.
cv_strategy = ms.RepeatedStratifiedKFold(
    random_state=RANDOM_STATE,
    n_repeats=CROSS_VALIDATION_REPEATS,
    n_splits=CROSS_VALIDATION_FOLDS,
)

In [None]:
# Sweep over candidate feature counts. For every outer CV split, features are
# ranked by the absolute LinearSVC coefficient fitted on the training part;
# for every candidate count n, the top-n features are scored with an RBF SVC
# via an inner cross-validation (ROC AUC) on that same training part. The
# count with the highest mean score wins, and the reported feature list is
# the one from that count's best-scoring split.
mean_scores = []
best_score_overall = -1
best_features_overall = None
# Declared for backward compatibility; nothing in this cell fills it.
selected_features_per_fold_custom_rfe = []
feature_range = range(NUM_FEATURES_TO_SELECT, 20, 2)

if not feature_range:
    # Fail early with a clear message instead of a TypeError on len(None)
    # after the whole (expensive) split loop has run.
    raise ValueError(
        "El rango de número de características está vacío: "
        f"NUM_FEATURES_TO_SELECT={NUM_FEATURES_TO_SELECT} debe ser menor que 20."
    )

# Record the start time
start_time = tm.time()

# Template estimator: cross_val_score clones it for every inner fold, so a
# single instance can be shared by all evaluations.
# NOTE: in scikit-learn's SVC, max_iter is an upper bound on solver iterations
# (the default -1 means no limit); it caps the run time rather than helping
# the solver converge.
model = svm.SVC(kernel='rbf',
                random_state=RANDOM_STATE,
                max_iter=10000)

# Per-count accumulators, filled in split order.
scores_by_n = {n: [] for n in feature_range}
features_by_n = {n: [] for n in feature_range}

# cv_strategy is seeded with an integer, so split() yields the same partitions
# on every call. The ranking depends only on the split, not on the feature
# count, so it is computed once per split and reused for every count.
for train_index, _ in cv_strategy.split(X, y):
    X_train = X.iloc[train_index]
    y_train = y[train_index]

    linear_model = svm.LinearSVC(random_state=RANDOM_STATE, max_iter=10000)
    linear_model.fit(X_train, y_train)
    # Only the first coefficient row is used.
    # NOTE(review): assumes a binary problem (scoring is 'roc_auc') — confirm.
    feature_importances = np.abs(linear_model.coef_[0])
    sorted_feature_indices = np.argsort(feature_importances)[::-1]

    for n_features_to_select in feature_range:
        # Keep the n highest-ranked features
        selected_indices = sorted_feature_indices[:n_features_to_select]
        X_train_selected = X_train.iloc[:, selected_indices]
        selected_feature_names = X_train.columns[selected_indices].tolist()

        # NOTE(review): the ranking was fitted on this same X_train/y_train,
        # and the outer test indices are not used anywhere in this cell.
        scores = cross_val_score(model,
                                 X_train_selected,
                                 y_train,
                                 cv=CROSS_VALIDATION_FOLDS,
                                 scoring='roc_auc')
        scores_by_n[n_features_to_select].append(np.mean(scores))
        features_by_n[n_features_to_select].append(selected_feature_names)

for n_features_to_select in feature_range:
    fold_scores = scores_by_n[n_features_to_select]
    fold_features = features_by_n[n_features_to_select]

    # Mean score across all outer splits for this feature count
    mean_score_for_n_features = np.mean(fold_scores)
    mean_scores.append(mean_score_for_n_features)

    # Strict '>' keeps the smallest feature count on ties.
    if mean_score_for_n_features > best_score_overall:
        best_score_overall = mean_score_for_n_features
        best_features_overall = fold_features[np.argmax(fold_scores)]

# Record the end time
end_time = tm.time()

print(f"*** Tiempo: {(end_time - start_time):.2f}")
if best_features_overall is None:
    # Reached when no mean score beat the initial -1 (e.g. every score is NaN).
    raise RuntimeError(
        "No se seleccionó ningún conjunto de características: "
        "ninguna puntuación media superó el valor inicial."
    )
print(f"*** Características: {len(best_features_overall)}")
# NOTE(review): only the first NUM_FEATURES_TO_SELECT names are listed, even
# when the winning subset (whose size is printed above) is larger.
for feature in best_features_overall[:NUM_FEATURES_TO_SELECT]:
    print(feature)
print("-" * 60)