In [185]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV

TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"


In [186]:
train = pl.read_csv(TRAIN_PATH)
test = pl.read_csv(TEST_PATH)
skew_columns = [c for c in train.columns if "skew" in c]
train = train.select(pl.exclude(skew_columns))
test = test.select(pl.exclude(skew_columns))


In [187]:
test.group_by("is_anomaly").len()

is_anomaly,len
bool,u32
False,882
True,30


In [None]:
# X, y = load_iris(return_X_y=True)
# clf = LogisticRegression(random_state=0).fit(X, y)
# clf.predict(X[:2, :])
def manual_fold_training(
        clf_model : LogisticRegression,
        data : pl.DataFrame,
        fold_number : int = 0,
        *,
        target_column : str = "is_anomaly"
):
    scores = {"fold":fold_number}
    
    fold_train = data.filter(pl.col("fold") != fold_number).drop("fold")
    fold_test = data.filter(pl.col("fold") == fold_number).drop("fold")
    X_train = (fold_train.select(pl.exclude(target_column))
               .select(pl.col(pl.Float64)).with_columns(
        pl.all().fill_nan(0)
    ).to_numpy())
    y_train =  fold_train.select(target_column).to_series().to_list()
    clf_model.fit(X_train,y_train)
    X_test = fold_test.select(pl.exclude(target_column)).to_numpy()
    y_test =  fold_test.select(pl.col(target_column)).to_series().to_numpy()
    y_pred = clf_model.predict(X_test)
    scores["f1_score"] = f1_score(y_test,y_pred)
    scores["recall"] = recall_score(y_test,y_pred)
    
    return scores

    
logistic_regressor = LogisticRegression(solver='liblinear',C=100.0,penalty="l2",max_iter=1000)

fold_scores = []
for fold in train["fold"].unique().to_list():
    fold_scores.append(manual_fold_training(clf_model=logistic_regressor,
                         data=train,
                         fold_number=fold))
    
    
df_scores = pl.DataFrame(fold_scores)
# mean_row = df_scores.select(pl.all().mean().cast(pl.Float64))
# df_with_mean = df_scores.with_columns(pl.col("fold").cast(pl.Utf8)).vstack(mean_row)
# df_with_mean
mean_row

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

seed = 140421
np.random.seed(seed)

target_column = 'is_anomaly'
fold_column = 'fold'

X_train = train.select(pl.exclude(target_column)).to_pandas()
y_train =  train.select(target_column).to_series().to_numpy()

def get_cv_iterable(
  folds: list,
  fold_column: str,
  train: pd.DataFrame,
):
    print(train.columns)
    for fold in folds:
        test_indexes = train[train[fold_column] == fold].index
        train_indexes = train[train[fold_column] != fold].index
        yield (train_indexes, test_indexes)

# X_test = test_data[feature_columns]
# y_test = test_data[target_column]

# Set up cross-validation using the 'folds' column
folds = train["fold"].unique().to_list()
 
grid={"C":np.logspace(-3,3,7), "penalty":["l2"]}
cv = GridSearchCV(
  estimator = LogisticRegression(solver='liblinear'), 
  param_grid=grid,
  cv = get_cv_iterable(folds, fold_column, X_train),
  scoring=[
    "f1",
    "recall",
    "precision", 
    "accuracy"
  ],
  refit="f1",
)

cv.fit(X_train, y_train)
with pd.option_context("display.max_columns", 33):
  display(pd.DataFrame(cv.cv_results_))


In [None]:
pl.DataFrame(cv.cv_results_).select("params","mean_test_f1")

In [181]:
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier
grid = {"estimator__criterion" : ["gini", "entropy"],
              "estimator__splitter" :   ["best", "random"],
              "n_estimators": [1, 2]
             }

DTC = DecisionTreeClassifier(random_state = seed)

cv2 = GridSearchCV(
    estimator = AdaBoostClassifier(estimator = DTC),
    param_grid=grid,
    cv = get_cv_iterable(folds, fold_column, X_train),
    scoring=[
        "f1",
        "recall",
        "precision",
        "accuracy"
    ],
    refit="f1",
)


cv2.fit(X_train, y_train)
with pd.option_context("display.max_columns", 33):
    display(pd.DataFrame(cv2.cv_results_))



Index(['TP2_mean', 'TP2_max', 'TP2_min', 'TP2_median', 'TP2_var', 'TP3_mean',
       'TP3_max', 'TP3_min', 'TP3_median', 'TP3_var', 'H1_mean', 'H1_max',
       'H1_min', 'H1_median', 'H1_var', 'DV_pressure_mean', 'DV_pressure_max',
       'DV_pressure_min', 'DV_pressure_median', 'DV_pressure_var',
       'Reservoirs_mean', 'Reservoirs_max', 'Reservoirs_min',
       'Reservoirs_median', 'Reservoirs_var', 'Oil_temperature_mean',
       'Oil_temperature_max', 'Oil_temperature_min', 'Oil_temperature_median',
       'Oil_temperature_var', 'Motor_current_mean', 'Motor_current_max',
       'Motor_current_min', 'Motor_current_median', 'Motor_current_var',
       'COMP_mean', 'COMP_max', 'COMP_min', 'COMP_median', 'COMP_var',
       'DV_eletric_mean', 'DV_eletric_max', 'DV_eletric_min',
       'DV_eletric_median', 'DV_eletric_var', 'Towers_mean', 'Towers_max',
       'Towers_min', 'Towers_median', 'Towers_var', 'MPG_mean', 'MPG_max',
       'MPG_min', 'MPG_median', 'MPG_var', 'LPS_mean', 'LPS_m



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__criterion,param_estimator__splitter,param_n_estimators,params,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_test_recall,...,std_test_recall,rank_test_recall,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,0.096525,0.021398,0.00847,0.000884,gini,best,1,"{'estimator__criterion': 'gini', 'estimator__s...",0.836364,0.861538,0.72956,0.909091,0.834138,0.065784,4,0.766667,...,0.061661,5,0.92,0.8,0.666667,1.0,0.846667,0.125963,4,0.986175,0.992941,0.982745,0.991667,0.988382,0.00413,4
1,0.093657,0.020686,0.007682,0.000424,gini,best,2,"{'estimator__criterion': 'gini', 'estimator__s...",0.8,0.861538,0.721519,0.928571,0.827907,0.076423,5,0.733333,...,0.075548,6,0.88,0.8,0.662791,1.0,0.835698,0.122606,5,0.983103,0.992941,0.982343,0.993333,0.98793,0.005216,5
2,0.012429,0.001025,0.007755,0.000426,gini,random,1,"{'estimator__criterion': 'gini', 'estimator__s...",0.814815,0.811594,0.899329,0.912281,0.859505,0.04654,2,0.733333,...,0.08109,3,0.916667,0.717949,0.87013,0.962963,0.866927,0.092062,2,0.984639,0.989804,0.993981,0.991667,0.990023,0.003442,3
3,0.01102,0.001593,0.007252,0.000432,gini,random,2,"{'estimator__criterion': 'gini', 'estimator__s...",0.814815,0.861538,0.887324,0.947368,0.877761,0.047859,1,0.733333,...,0.076234,4,0.916667,0.8,0.9,1.0,0.904167,0.071078,1,0.984639,0.992941,0.993579,0.995,0.99154,0.004053,1
4,0.040762,0.007445,0.00751,0.00052,entropy,best,1,"{'estimator__criterion': 'entropy', 'estimator...",0.807692,0.675325,0.873239,0.877193,0.808362,0.081618,8,0.7,...,0.067743,7,0.954545,0.553191,0.885714,0.925926,0.829844,0.161586,7,0.984639,0.980392,0.992777,0.988333,0.986535,0.00457,8
5,0.041161,0.007161,0.007915,0.000146,entropy,best,2,"{'estimator__criterion': 'entropy', 'estimator...",0.814815,0.716418,0.849315,0.877193,0.814435,0.060751,7,0.733333,...,0.04769,8,0.916667,0.648649,0.837838,0.925926,0.83227,0.111402,6,0.984639,0.985098,0.991172,0.988333,0.987311,0.002645,6
6,0.009745,0.000833,0.007667,0.000407,entropy,random,1,"{'estimator__criterion': 'entropy', 'estimator...",0.836364,0.8,0.835443,0.965517,0.859331,0.063035,3,0.766667,...,0.070094,1,0.92,0.7,0.767442,1.0,0.84686,0.119034,3,0.986175,0.98902,0.989567,0.996667,0.990357,0.003864,2
7,0.009782,0.001129,0.007779,0.000873,entropy,random,2,"{'estimator__criterion': 'entropy', 'estimator...",0.75,0.710526,0.853659,0.947368,0.815388,0.092405,6,0.7,...,0.101408,2,0.807692,0.586957,0.76087,1.0,0.78888,0.14704,8,0.978495,0.982745,0.990369,0.995,0.986652,0.006429,7


In [182]:
pl.DataFrame(cv2.cv_results_).select("params","mean_test_f1")

params,mean_test_f1
struct[3],f64
"{""gini"",""best"",1}",0.834138
"{""gini"",""best"",2}",0.827907
"{""gini"",""random"",1}",0.859505
"{""gini"",""random"",2}",0.877761
"{""entropy"",""best"",1}",0.808362
"{""entropy"",""best"",2}",0.814435
"{""entropy"",""random"",1}",0.859331
"{""entropy"",""random"",2}",0.815388


In [194]:
predictions = logistic_regressor.predict(test.select([a for a in X_train.columns if "fold" != a]))
f1_score(test.select("is_anomaly").to_series().to_list(),predictions)



0.7857142857142857

In [206]:
clfAda = AdaBoostClassifier(DecisionTreeClassifier(random_state = seed,criterion="gini",splitter="random"),n_estimators=2).fit(X_train[[a for a in X_train.columns if "fold" != a]],y_train)




In [207]:
predictions = clfAda.predict(test.select([a for a in X_train.columns if "fold" != a]))
f1_score(test.select("is_anomaly").to_series().to_list(),predictions)

0.8275862068965517