In [None]:
import pandas as pd
import os
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import numpy as np
from AlvaroSantanaSanchez.utils import EnsembleFilter

TRAIN_PATH = "../data/train.csv"
TEST_PATH = "../data/test.csv"
print(os.getcwd())

In [174]:
#Valores encontrados durante ajuse de hiperparametros AdaBoost
best_params_ada_boost,best_params_CT = ({'learning_rate': 1, 'n_estimators': 200},
 {'criterion': 'gini',
  'max_depth': 3,
  'min_samples_leaf': 5,
  'min_samples_split': 2,
  'splitter': 'best'})

## Carga del dataset y generación de particiones

In [175]:
train = pl.read_csv(TRAIN_PATH)
test = pl.read_csv(TEST_PATH)
skew_columns = [c for c in train.columns if "skew" in c]
train = train.select(pl.exclude(skew_columns))
test = test.select(pl.exclude(skew_columns))


In [176]:

X_train = train.select(pl.exclude("is_anomaly")).to_pandas()
y_train = train.select("is_anomaly").to_series().to_list()

X_test = test.select(pl.exclude("is_anomaly")).to_pandas()
y_test = test.select("is_anomaly").to_series().to_list()


## Eliminacion del ruido

Buscamos eliminar ruido de los casos no anomalos.

In [177]:
X_train_clean, y_train_clean,noisy_indice = EnsembleFilter(X_train.to_numpy(), np.array(y_train, dtype=np.float32), k=5, voting='consensus')

clean_train = train.with_row_index().filter((~pl.col("index").is_in(noisy_indice))).select(pl.exclude("index"))
#Comprobamos si se han filtrado algunas instancias
pl.concat([
    train.group_by("is_anomaly").len().with_columns(pl.lit("original").alias("dataframe")),
    clean_train.group_by("is_anomaly").len().with_columns(pl.lit("limpio").alias("dataframe"))
]).sort(by=["dataframe","is_anomaly"])


is_anomaly,len,dataframe
bool,u32,str
False,4443,"""limpio"""
True,209,"""limpio"""
False,4445,"""original"""
True,217,"""original"""


In [186]:
#Eliminado el ruido volvemos a cargar el conjunto de entrenamiento
X_train = clean_train.select(pl.exclude("is_anomaly")).to_pandas()
y_train = clean_train.select("is_anomaly").to_series().to_list()

In [203]:
from sklearn.preprocessing import StandardScaler

Scaler = StandardScaler(with_mean=True, with_std=False)
X_scaled = Scaler.fit_transform(X=X_train [[a for a in X_train.columns if "fold" != a]])

In [204]:
X_scaled = pd.DataFrame(X_scaled)
X_scaled["fold"] = X_train["fold"]
X_scaled.columns = X_train.columns

# Ajuste de hiperparametros LogisticRegresion

In [209]:
seed = 140421
np.random.seed(seed)

target_column = 'is_anomaly'
fold_column = 'fold'


def get_cv_iterable(
        folds: list,
        fold_column: str,
        train: pd.DataFrame,
):
    for fold in folds:
        test_indexes = train[train[fold_column] == fold].index
        train_indexes = train[train[fold_column] != fold].index
        yield (train_indexes, test_indexes)


# X_test = test_data[feature_columns]
# y_test = test_data[target_column]

# Set up cross-validation using the 'folds' column
folds = train["fold"].unique().to_list()
grid = {
        "C": np.logspace(-3, 3, 7),
        "penalty": ["l2","l1"], 
        'class_weight': ['balanced', {0: 1, 1: 10}, {0: 1, 1: 20}]
        }
cv = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear',max_iter=10000),
    param_grid=grid,
    cv=get_cv_iterable(folds, fold_column, X_train),
    scoring=[
        "f1",
        "recall",
        "precision",
        "accuracy"
    ],
    refit="f1",
)


cv.fit(X_scaled, y_train)


In [219]:
with pd.option_context("display.max_columns", 33):
    display(pd.DataFrame(cv.cv_results_)["mean_test_f1"].max())


0.9504781420765027

In [211]:
cv_results_df = pd.DataFrame(cv.cv_results_)
best_params_row = cv_results_df.loc[cv_results_df["mean_test_f1"].idxmax()]
best_params_logistic_regression = best_params_row["params"]
best_params_logistic_regression

{'C': 10.0, 'class_weight': {0: 1, 1: 20}, 'penalty': 'l1'}

In [212]:
logistic_regressor = LogisticRegression(solver="liblinear",**best_params_logistic_regression, max_iter=10000).fit(
    X_scaled[[a for a in X_scaled.columns if "fold" != a]], y_train)
test_scaled = pd.DataFrame(Scaler.transform(X_test))
test_scaled.columns = X_test.columns 
predictions = logistic_regressor.predict(test_scaled)
print("f1_score", f1_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("recall", recall_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("precision", precision_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("accuracy", accuracy_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("confusion matrix", confusion_matrix(test.select("is_anomaly").to_series().to_list(), predictions))


f1_score 0.9103448275862069
recall 0.8918918918918919
precision 0.9295774647887324
accuracy 0.9903917220990391
confusion matrix [[1274    5]
 [   8   66]]


In [138]:
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier

grid = {"estimator__criterion": ["gini", "entropy"],
        "estimator__splitter": ["best", "random"],
        "estimator__max_depth": [3, 5, 10],     # Limit tree depth to avoid overfitting
        "estimator__min_samples_split": [2, 5, 10],   # Minimum samples to split an internal node
        "estimator__min_samples_leaf": [1, 2, 5],     # Minimum samples required in a leaf node
        "n_estimators": [1,2,10, 50, 100, 200],           # Number of weak learners
        "learning_rate": [0.01, 0.1, 0.5, 1], 
        }

DTC = DecisionTreeClassifier(random_state=seed)

cv2 = GridSearchCV(
    estimator=AdaBoostClassifier(algorithm='SAMME', estimator=DTC, random_state=seed),
    param_grid=grid,
    cv=get_cv_iterable(folds, fold_column, X_train),
    scoring=[
        "f1",
        "recall",
        "precision",
        "accuracy"
    ],
    refit="f1",
)

cv2.fit(X_train, y_train)

In [225]:
with pd.option_context("display.max_columns", 33):
    display(pd.DataFrame(cv2.cv_results_))
pl.DataFrame(cv2.cv_results_)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__criterion,param_estimator__max_depth,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__splitter,param_learning_rate,param_n_estimators,params,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,...,std_test_recall,rank_test_recall,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,0.026750,0.001922,0.004750,0.000829,gini,3,1,2,best,0.01,1,"{'estimator__criterion': 'gini', 'estimator__m...",0.945455,0.921739,0.966102,0.949153,...,0.026933,2123,1.000000,0.963636,0.982759,0.965517,0.977978,0.014738,139,0.995479,0.993223,0.996992,0.995489,0.995296,0.001346,1719
1,0.057250,0.016605,0.006251,0.003344,gini,3,1,2,best,0.01,2,"{'estimator__criterion': 'gini', 'estimator__m...",0.945455,0.921739,0.966102,0.949153,...,0.026933,2123,1.000000,0.963636,0.982759,0.965517,0.977978,0.014738,139,0.995479,0.993223,0.996992,0.995489,0.995296,0.001346,1719
2,0.265122,0.030197,0.005750,0.000433,gini,3,1,2,best,0.01,10,"{'estimator__criterion': 'gini', 'estimator__m...",0.954955,0.983333,0.966102,0.949153,...,0.025517,1514,1.000000,0.983333,0.982759,0.965517,0.982902,0.012194,35,0.996232,0.998494,0.996992,0.995489,0.996802,0.001112,248
3,1.206763,0.110663,0.011001,0.001225,gini,3,1,2,best,0.01,50,"{'estimator__criterion': 'gini', 'estimator__m...",0.964286,0.975207,0.941176,0.931034,...,0.029853,1760,1.000000,0.967213,0.949153,0.964286,0.970163,0.018540,578,0.996986,0.997741,0.994737,0.993985,0.995862,0.001548,1206
4,2.419690,0.197186,0.016507,0.002065,gini,3,1,2,best,0.01,100,"{'estimator__criterion': 'gini', 'estimator__m...",0.955752,0.975207,0.941176,0.931034,...,0.029853,1760,0.981818,0.967213,0.949153,0.964286,0.965617,0.011596,962,0.996232,0.997741,0.994737,0.993985,0.995674,0.001442,1423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2587,0.009985,0.001237,0.004493,0.000546,entropy,10,5,10,random,1,2,"{'estimator__criterion': 'entropy', 'estimator...",0.907563,0.957983,0.910569,0.983051,...,0.014364,1499,0.885246,0.966102,0.888889,1.000000,0.935059,0.049482,2401,0.991711,0.996235,0.991729,0.998496,0.994543,0.002934,2102
2588,0.043751,0.005429,0.005772,0.000417,entropy,10,5,10,random,1,10,"{'estimator__criterion': 'entropy', 'estimator...",0.965517,0.967213,0.957265,0.947368,...,0.031821,1422,0.965517,0.951613,0.982456,1.000000,0.974897,0.018148,252,0.996986,0.996988,0.996241,0.995489,0.996426,0.000621,662
2589,0.227395,0.031820,0.011181,0.001412,entropy,10,5,10,random,1,50,"{'estimator__criterion': 'entropy', 'estimator...",0.973913,0.967213,0.957265,0.965517,...,0.021490,793,0.982456,0.951613,0.982456,1.000000,0.979131,0.017428,62,0.997739,0.996988,0.996241,0.996992,0.996990,0.000530,132
2590,0.466886,0.065968,0.017516,0.002703,entropy,10,5,10,random,1,100,"{'estimator__criterion': 'entropy', 'estimator...",0.965517,0.967213,0.957265,0.965517,...,0.021490,793,0.965517,0.951613,0.982456,1.000000,0.974897,0.018148,252,0.996986,0.996988,0.996241,0.996992,0.996802,0.000324,253


TypeError: unexpected value while building Series of type Float64; found value of type Int64: 1

Hint: Try setting `strict=False` to allow passing data with mixed types.

In [227]:
cv_results_df = pd.DataFrame(cv2.cv_results_)
best_params_row = cv_results_df.loc[cv_results_df["mean_test_f1"].idxmax()]
params = best_params_row["params"]
best_params_CT = {a.split("estimator__")[-1]: b for a, b in params.items() if "estimator__" in a}
best_params_ada_boost = {a: b for a, b in best_params_ada_boost.items() if "estimator__" not in a}
best_params_ada_boost,best_params_CT

({'learning_rate': 1, 'n_estimators': 200},
 {'criterion': 'gini',
  'max_depth': 3,
  'min_samples_leaf': 5,
  'min_samples_split': 2,
  'splitter': 'best'})

In [229]:
clfAda = AdaBoostClassifier(algorithm='SAMME',**best_params_ada_boost,estimator=DecisionTreeClassifier(**best_params_CT), random_state=seed).fit(X_train[[a for a in X_train.columns if "fold" != a]],
                                                                   y_train)
predictions = clfAda.predict(test.select([a for a in X_test.columns if "fold" != a]))
print("f1_score", f1_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("recall", recall_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("precision", precision_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("accuracy", accuracy_score(test.select("is_anomaly").to_series().to_list(), predictions))
print("confusion matrix", confusion_matrix(test.select("is_anomaly").to_series().to_list(), predictions))

f1_score 0.9379310344827586
recall 0.918918918918919
precision 0.9577464788732394
accuracy 0.9933481152993349
confusion matrix [[1276    3]
 [   6   68]]
