In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [4]:
#Cargar los ficheros de entrenamiento y de test
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
# Función para generar los splits según los folds existentes en el dataset de entrenamiento
def get_cv_iterable(folds, fold_column, train):
    for fold in folds:
        test_indexes = train[train[fold_column] == fold].index
        train_indexes = train[train[fold_column] != fold].index
        yield (train_indexes, test_indexes)

# Obtener los folds únicos
folds = train['fold'].unique()

In [8]:
#Escalar los datos de entrenamiento sin la columna "fold"
scaler = StandardScaler()
X_train = scaler.fit_transform(train.drop(columns=['is_anomaly', 'fold']))
y_train = train['is_anomaly']

X_test = scaler.fit_transform(test.drop(columns=['is_anomaly']))
y_test = test['is_anomaly']

In [10]:
# Definir los parámetros para GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],       # Parámetro de penalización
    'gamma': ['scale', 'auto', 0.1], 
    'kernel': ['rbf', 'linear']  # Tipos de kernel
}

# Crear el generador de splits con la función
cv_iterable = get_cv_iterable(folds=folds, fold_column='fold', train=train)

# Configurar GridSearchCV
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_iterable, 
    n_jobs=-1, 
    verbose=1
)

# Entrenar el modelo SVM
grid_search.fit(X_train, y_train)

# Evaluar el mejor modelo en el conjunto de test
best_model = grid_search.best_estimator_

y_pred_test = best_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

Fitting 4 folds for each of 18 candidates, totalling 72 fits
Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

Classification Report:
               precision    recall  f1-score   support

       False       0.99      1.00      1.00      1279
        True       0.98      0.86      0.92        74

    accuracy                           0.99      1353
   macro avg       0.99      0.93      0.96      1353
weighted avg       0.99      0.99      0.99      1353


Confusion Matrix:
 [[1278    1]
 [  10   64]]


In [12]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],  # Número de árboles
    'max_depth': [None, 10, 20],      # Profundidad máxima de cada árbol
    'min_samples_split': [2, 5, 10],  # Mínimo de muestras para dividir un nodo
    'min_samples_leaf': [1, 2, 4]     # Mínimo de muestras en las hojas
}

cv_iterable = get_cv_iterable(folds=folds, fold_column='fold', train=train)

grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=cv_iterable,
    n_jobs=-1,
    verbose=1
)

grid_search_rf.fit(X_train, y_train)

best_rf_model = grid_search_rf.best_estimator_
y_pred_rf_test = best_rf_model.predict(X_test)

print("Best Parameters (Random Forest):", grid_search_rf.best_params_)
print("\nClassification Report (Random Forest):\n", classification_report(y_test, y_pred_rf_test))
print("\nConfusion Matrix (Random Forest):\n", confusion_matrix(y_test, y_pred_rf_test))

Fitting 4 folds for each of 81 candidates, totalling 324 fits
Best Parameters (Random Forest): {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}

Classification Report (Random Forest):
               precision    recall  f1-score   support

       False       1.00      0.99      0.99      1279
        True       0.81      0.93      0.87        74

    accuracy                           0.98      1353
   macro avg       0.90      0.96      0.93      1353
weighted avg       0.99      0.98      0.98      1353


Confusion Matrix (Random Forest):
 [[1263   16]
 [   5   69]]


In [14]:
param_grid_et = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

cv_iterable = get_cv_iterable(folds=folds, fold_column='fold', train=train)

grid_search_et = GridSearchCV(
    estimator=ExtraTreesClassifier(random_state=42),
    param_grid=param_grid_et,
    scoring='accuracy',
    cv=cv_iterable,
    n_jobs=-1,
    verbose=1
)

grid_search_et.fit(X_train, y_train)

best_et_model = grid_search_et.best_estimator_
y_pred_et_test = best_et_model.predict(X_test)

print("Best Parameters (ExtraTrees):", grid_search_et.best_params_)
print("\nClassification Report (ExtraTrees):\n", classification_report(y_test, y_pred_et_test))
print("\nConfusion Matrix (ExtraTrees):\n", confusion_matrix(y_test, y_pred_et_test))

Fitting 4 folds for each of 16 candidates, totalling 64 fits
Best Parameters (ExtraTrees): {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

Classification Report (ExtraTrees):
               precision    recall  f1-score   support

       False       1.00      0.99      0.99      1279
        True       0.83      0.92      0.87        74

    accuracy                           0.99      1353
   macro avg       0.91      0.95      0.93      1353
weighted avg       0.99      0.99      0.99      1353


Confusion Matrix (ExtraTrees):
 [[1265   14]
 [   6   68]]


In [16]:
param_grid_rs = {
    'n_estimators': [50, 100, 150], 
    'max_features': [0.5, 0.75, 1.0], 
    'estimator__max_depth': [None, 10, 20]  
}

cv_iterable = get_cv_iterable(folds=folds, fold_column='fold', train=train)

grid_search_rs = GridSearchCV(
    estimator=BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42),
    param_grid=param_grid_rs,
    scoring='accuracy',
    cv=cv_iterable,
    n_jobs=-1,
    verbose=1
)

grid_search_rs.fit(X_train, y_train)

best_rs_model = grid_search_rs.best_estimator_
y_pred_rs_test = best_rs_model.predict(X_test)

print("Best Parameters (Random Subspaces):", grid_search_rs.best_params_)
print("\nClassification Report (Random Subspaces):\n", classification_report(y_test, y_pred_rs_test))
print("\nConfusion Matrix (Random Subspaces):\n", confusion_matrix(y_test, y_pred_rs_test))

Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Parameters (Random Subspaces): {'estimator__max_depth': None, 'max_features': 0.5, 'n_estimators': 50}

Classification Report (Random Subspaces):
               precision    recall  f1-score   support

       False       0.99      0.98      0.99      1279
        True       0.74      0.86      0.80        74

    accuracy                           0.98      1353
   macro avg       0.87      0.92      0.89      1353
weighted avg       0.98      0.98      0.98      1353


Confusion Matrix (Random Subspaces):
 [[1257   22]
 [  10   64]]
