In [99]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV

# Relative path of the training CSV; it is read with pl.read_csv in the
# data-loading cell.
TRAIN_PATH = "train.csv"

In [97]:
# Read the training CSV, then discard every column whose name contains "skew".
train = pl.read_csv(TRAIN_PATH)
skew_columns = list(filter(lambda name: "skew" in name, train.columns))
train = train.drop(skew_columns)

In [110]:
# X, y = load_iris(return_X_y=True)
# clf = LogisticRegression(random_state=0).fit(X, y)
# clf.predict(X[:2, :])
def _fold_feature_matrix(frame: pl.DataFrame, target_column: str):
    """Convert one fold split into the numeric matrix given to the classifier.

    Removes ``target_column``, keeps only ``Float64`` columns and replaces NaN
    values with 0. Both the training and the held-out split must go through
    this helper so they expose the same columns, in the same order, with the
    same NaN handling.
    """
    return (
        frame.select(pl.exclude(target_column))
        .select(pl.col(pl.Float64))
        .with_columns(pl.all().fill_nan(0))
        .to_numpy()
    )


def manual_fold_training(
        clf_model : LogisticRegression,
        data : pl.DataFrame,
        fold_number : int = 0,
        *,
        target_column : str = "is_anomaly"
):
    """Fit ``clf_model`` on all folds but one and score it on the held-out fold.

    Parameters
    ----------
    clf_model
        Estimator to train. ``fit`` is called on the object that is passed in,
        so the caller's instance is modified.
    data
        Frame holding a ``"fold"`` column, the ``target_column`` and the
        feature columns. Only ``Float64`` columns are used as features.
    fold_number
        Value of the ``"fold"`` column used as the held-out test split; every
        other row is used for training.
    target_column
        Name of the label column (keyword-only).

    Returns
    -------
    dict
        ``{"fold": fold_number, "f1_score": ..., "recall": ...}`` computed on
        the held-out fold.
    """
    scores = {"fold": fold_number}

    # The fold id is only a split marker; remove it before building features.
    fold_train = data.filter(pl.col("fold") != fold_number).drop("fold")
    fold_test = data.filter(pl.col("fold") == fold_number).drop("fold")

    # Build train and test features with the *same* preprocessing. Previously
    # only the training matrix was restricted to Float64 columns and NaN-filled,
    # so a non-float column or a NaN in the held-out fold broke `predict`.
    X_train = _fold_feature_matrix(fold_train, target_column)
    X_test = _fold_feature_matrix(fold_test, target_column)

    y_train = fold_train.get_column(target_column).to_list()
    y_test = fold_test.get_column(target_column).to_numpy()

    clf_model.fit(X_train, y_train)
    y_pred = clf_model.predict(X_test)

    scores["f1_score"] = f1_score(y_test, y_pred)
    scores["recall"] = recall_score(y_test, y_pred)

    return scores

    
# A single estimator instance is reused; each call below refits it on the
# training portion of the current fold.
logistic_regressor = LogisticRegression(penalty="l2",max_iter=1000)

# Evaluate every fold id found in the data. `unique()` on its own makes no
# promise about ordering, so sort the ids to keep the score table's row order
# stable from run to run.
fold_scores = []
for fold in train["fold"].unique().sort().to_list():
    fold_scores.append(
        manual_fold_training(
            clf_model=logistic_regressor,
            data=train,
            fold_number=fold,
        )
    )

# One row per fold: fold id, F1 and recall on that held-out fold.
df_scores = pl.DataFrame(fold_scores)
df_scores

fold,f1_score,recall
i64,f64,f64
0,0.740741,0.666667
1,0.666667,0.633333
2,0.802817,0.791667
3,0.888889,0.8


In [111]:
# Unweighted average of the per-fold F1 and recall scores.
df_scores.select(pl.col("f1_score", "recall").mean())

f1_score,recall
f64,f64
0.774778,0.722917


In [117]:
# Row count for every (label, fold) pair, listed fold by fold, to show how
# many positive and negative examples each fold contains.
(
    train
    .group_by("is_anomaly", "fold")
    .len()
    .sort(by=["fold", "is_anomaly"])
)

is_anomaly,fold,len
bool,i64,u32
False,0,621
True,0,30
False,1,1245
True,1,30
False,2,2420
True,2,72
False,3,570
True,3,30
