In [None]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

TRAIN_PATH = "../data/train.csv"
TEST_PATH = "../data/test.csv"


In [None]:
train = pl.read_csv(TRAIN_PATH)
test = pl.read_csv(TEST_PATH)
skew_columns = [c for c in train.columns if "skew" in c]
train = train.select(pl.exclude(skew_columns))
test = test.select(pl.exclude(skew_columns))


In [None]:
X_train = train.select(pl.exclude("is_anomaly")).to_pandas()
y_train = train.select("is_anomaly").to_series().to_list()

X_test = test.select(pl.exclude("is_anomaly")).to_pandas()
y_test = test.select("is_anomaly").to_series().to_list()


In [None]:

import numpy as np
import pandas as pd

seed = 140421
np.random.seed(seed)

target_column = 'is_anomaly'
fold_column = 'fold'

def get_cv_iterable(
  folds: list,
  fold_column: str,
  train: pd.DataFrame,
):
    for fold in folds:
        test_indexes = train[train[fold_column] == fold].index
        train_indexes = train[train[fold_column] != fold].index
        yield (train_indexes, test_indexes)

# X_test = test_data[feature_columns]
# y_test = test_data[target_column]

# Set up cross-validation using the 'folds' column
folds = train["fold"].unique().to_list()
grid={"C":np.logspace(-3,3,7), "penalty":["l2"]}
cv = GridSearchCV(
  estimator = LogisticRegression(solver='liblinear'), 
  param_grid=grid,
  cv = get_cv_iterable(folds, fold_column, X_train),
  scoring=[
    "f1",
    "recall",
    "precision", 
    "accuracy"
  ],
  refit="f1",
)

cv.fit(X_train, y_train)
with pd.option_context("display.max_columns", 33):
  display(pd.DataFrame(cv.cv_results_))


In [None]:
best_params_logistic_regression = pl.DataFrame(cv.cv_results_).select("params","mean_test_f1").filter(pl.col("mean_test_f1") == pl.col("mean_test_f1").max())["params"][0]

In [83]:
logistic_regressor = LogisticRegression(**best_params_logistic_regression,max_iter=10000).fit(X_train[[a for a in X_train.columns if "fold" != a]],y_train)
predictions = logistic_regressor.predict(test.select([a for a in X_test.columns if "fold" != a]))
print("f1_score",f1_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("recall",recall_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("precision",precision_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("accuracy",accuracy_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("confusion matrix",confusion_matrix(test.select("is_anomaly").to_series().to_list(),predictions))


f1_score 0.9241379310344827
recall 0.9054054054054054
precision 0.9436619718309859
accuracy 0.991869918699187
confusion matrix [[1275    4]
 [   7   67]]


In [97]:
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier
grid = {"estimator__criterion" : ["gini", "entropy"],
              "estimator__splitter" :   ["best","random"],
              "n_estimators": [1, 2]
             }

DTC = DecisionTreeClassifier(random_state = seed)

cv2 = GridSearchCV(
    estimator = AdaBoostClassifier(algorithm='SAMME',estimator = DTC,random_state=seed),
    param_grid=grid,
    cv = get_cv_iterable(folds, fold_column, X_train),
    scoring=[
        "f1",
        "recall",
        "precision",
        "accuracy"
    ],
    refit="f1",
)


cv2.fit(X_train, y_train)
with pd.option_context("display.max_columns", 33):
    display(pd.DataFrame(cv2.cv_results_))
pl.DataFrame(cv2.cv_results_).select("params","mean_test_f1").filter(pl.col("mean_test_f1") == pl.col("mean_test_f1").max())

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__criterion,param_estimator__splitter,param_n_estimators,params,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_test_recall,...,std_test_recall,rank_test_recall,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,0.065289,0.005514,0.00475,0.0004331537,gini,best,1,"{'estimator__criterion': 'gini', 'estimator__s...",0.918033,0.912,0.88189,0.935484,0.911852,0.019329,7,0.903226,...,0.013373,7,0.933333,0.904762,0.861538,0.935484,0.908779,0.029849,7,0.992492,0.991742,0.988739,0.993994,0.991742,0.001914,7
1,0.085691,0.011279,0.009877,0.005435933,gini,best,2,"{'estimator__criterion': 'gini', 'estimator__s...",0.918033,0.912,0.88189,0.935484,0.911852,0.019329,7,0.903226,...,0.013373,7,0.933333,0.904762,0.861538,0.935484,0.908779,0.029849,7,0.992492,0.991742,0.988739,0.993994,0.991742,0.001914,7
2,0.009249,0.0013,0.005501,0.000865653,gini,random,1,"{'estimator__criterion': 'gini', 'estimator__s...",0.914729,0.951613,0.928,0.896552,0.922723,0.020071,3,0.951613,...,0.047024,3,0.880597,0.951613,0.920635,0.962963,0.928952,0.031928,3,0.991742,0.995495,0.993243,0.990991,0.992868,0.00172,3
3,0.010578,0.001591,0.006,7.609812e-07,gini,random,2,"{'estimator__criterion': 'gini', 'estimator__s...",0.914729,0.951613,0.928,0.896552,0.922723,0.020071,3,0.951613,...,0.047024,3,0.880597,0.951613,0.920635,0.962963,0.928952,0.031928,3,0.991742,0.995495,0.993243,0.990991,0.992868,0.00172,3
4,0.044445,0.009923,0.005251,0.00108992,entropy,best,1,"{'estimator__criterion': 'entropy', 'estimator...",0.936508,0.943089,0.914729,0.881356,0.91892,0.024094,5,0.951613,...,0.047024,3,0.921875,0.95082,0.880597,0.928571,0.920466,0.02539,5,0.993994,0.994745,0.991742,0.989489,0.992492,0.002056,5
5,0.03766,0.002663,0.004781,0.0005068152,entropy,best,2,"{'estimator__criterion': 'entropy', 'estimator...",0.936508,0.943089,0.914729,0.881356,0.91892,0.024094,5,0.951613,...,0.047024,3,0.921875,0.95082,0.880597,0.928571,0.920466,0.02539,5,0.993994,0.994745,0.991742,0.989489,0.992492,0.002056,5
6,0.008249,0.000433,0.00475,0.0004334287,entropy,random,1,"{'estimator__criterion': 'entropy', 'estimator...",0.944882,0.952381,0.933333,0.933333,0.940982,0.008096,1,0.967742,...,0.032258,1,0.923077,0.9375,0.965517,0.965517,0.947903,0.018338,1,0.994745,0.995495,0.993994,0.993994,0.994557,0.000622,1
7,0.00725,0.000829,0.00475,0.0004331192,entropy,random,2,"{'estimator__criterion': 'entropy', 'estimator...",0.944882,0.952381,0.933333,0.933333,0.940982,0.008096,1,0.967742,...,0.032258,1,0.923077,0.9375,0.965517,0.965517,0.947903,0.018338,1,0.994745,0.995495,0.993994,0.993994,0.994557,0.000622,1


params,mean_test_f1
struct[3],f64
"{""entropy"",""random"",1}",0.940982
"{""entropy"",""random"",2}",0.940982


In [94]:
best_params_ada_boost = pl.DataFrame(cv2.cv_results_).select("params","mean_test_f1").filter(pl.col("mean_test_f1") == pl.col("mean_test_f1").max())["params"][0]
best_params_ada_boost = {a.split("_")[-1]:b for a,b in best_params_ada_boost.items() if "estimator__" in a}

In [98]:
clfAda = AdaBoostClassifier(algorithm='SAMME',estimator=DecisionTreeClassifier(**best_params_ada_boost),n_estimators=2,random_state=seed).fit(X_train[[a for a in X_train.columns if "fold" != a]],y_train)

predictions = clfAda.predict(test.select([a for a in X_test.columns if "fold" != a]))
print("f1_score",f1_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("recall",recall_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("precision",precision_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("accuracy",accuracy_score(test.select("is_anomaly").to_series().to_list(),predictions))
print("confusion matrix",confusion_matrix(test.select("is_anomaly").to_series().to_list(),predictions))

f1_score 0.9115646258503401
recall 0.9054054054054054
precision 0.9178082191780822
accuracy 0.9903917220990391
confusion matrix [[1273    6]
 [   7   67]]
