In [None]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV

# CSV inputs, given relative to the directory the notebook is run from.
TRAIN_PATH: str = "../data/train.csv"
TEST_PATH: str = "../data/test.csv"


In [None]:
# Read both splits, then remove every column whose name contains "skew".
# The list of columns to remove is derived from the training header only and
# is applied unchanged to the test split.
train = pl.read_csv(TRAIN_PATH)
test = pl.read_csv(TEST_PATH)

skew_columns = [name for name in train.columns if "skew" in name]
all_but_skew = pl.exclude(skew_columns)

train = train.select(all_but_skew)
test = test.select(all_but_skew)


In [None]:
# Row count per value of the target column in the test split.
rows_by_label = test.group_by("is_anomaly")
rows_by_label.len()

In [None]:
# X, y = load_iris(return_X_y=True)
# clf = LogisticRegression(random_state=0).fit(X, y)
# clf.predict(X[:2, :])
def manual_fold_training(
        clf_model : LogisticRegression,
        data : pl.DataFrame,
        fold_number : int = 0,
        *,
        target_column : str = "is_anomaly"
):
    """Fit ``clf_model`` on all folds but one and score it on the held-out fold.

    Rows are split on the ``"fold"`` column of ``data``: rows whose fold equals
    ``fold_number`` form the evaluation split, all other rows form the training
    split. The ``"fold"`` column itself is dropped before modelling.

    Args:
        clf_model: Estimator to train. It is fitted IN PLACE, so after the call
            it holds the model trained for this fold.
        data: Frame containing the features, ``target_column`` and ``"fold"``.
        fold_number: Value of ``"fold"`` identifying the held-out rows.
        target_column: Name of the label column.

    Returns:
        dict with keys ``"fold"``, ``"f1_score"`` and ``"recall"``, the two
        metrics being computed on the held-out fold.
    """
    def _feature_matrix(frame: pl.DataFrame):
        # Single definition of "the features" so that the training and the
        # evaluation matrices always have the same columns and the same NaN
        # handling: Float64 columns only (target excluded), NaN replaced by 0.
        return (
            frame.select(pl.exclude(target_column))
            .select(pl.col(pl.Float64))
            .with_columns(pl.all().fill_nan(0))
            .to_numpy()
        )

    fold_train = data.filter(pl.col("fold") != fold_number).drop("fold")
    fold_test = data.filter(pl.col("fold") == fold_number).drop("fold")

    y_train = fold_train.select(target_column).to_series().to_list()
    clf_model.fit(_feature_matrix(fold_train), y_train)

    y_test = fold_test.select(pl.col(target_column)).to_series().to_numpy()
    y_pred = clf_model.predict(_feature_matrix(fold_test))

    return {
        "fold": fold_number,
        "f1_score": f1_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
    }

    
logistic_regressor = LogisticRegression(solver='liblinear', C=100.0, penalty="l2", max_iter=1000)

# Visit the folds in sorted order. polars' unique() does not guarantee any
# ordering, and the order matters here: the same estimator object is refitted
# on every iteration, so once the loop ends `logistic_regressor` holds the
# model trained with the LAST visited fold held out.
fold_scores = []
for fold in train["fold"].unique().sort().to_list():
    fold_scores.append(
        manual_fold_training(
            clf_model=logistic_regressor,
            data=train,
            fold_number=fold,
        )
    )

# One row per fold: fold id, F1 and recall on the held-out rows.
df_scores = pl.DataFrame(fold_scores)
# mean_row = df_scores.select(pl.all().mean().cast(pl.Float64))
# df_with_mean = df_scores.with_columns(pl.col("fold").cast(pl.Utf8)).vstack(mean_row)
# df_with_mean

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

# Seed NumPy's global RNG once for the experiments below.
seed = 140421
np.random.seed(seed)

target_column, fold_column = 'is_anomaly', 'fold'

# X_train deliberately still contains the fold column: it is what
# get_cv_iterable reads to build the splits.
X_train = train.drop(target_column).to_pandas()
y_train = train[target_column].to_numpy()

def get_cv_iterable(
  folds: list,
  fold_column: str,
  train: pd.DataFrame,
):
    """Yield one ``(train_indices, test_indices)`` pair per entry of ``folds``.

    For each fold id, the test indices are the rows of ``train`` whose
    ``fold_column`` equals that id and the train indices are all remaining rows.

    Indices are POSITIONAL (0-based row numbers), which is what scikit-learn's
    ``cv=`` argument expects. They are therefore correct even when ``train``
    does not carry a default ``RangeIndex`` (e.g. after filtering).

    Args:
        folds: Fold ids to hold out, one split per id, in this order.
        fold_column: Name of the column of ``train`` holding the fold id.
        train: Frame whose rows are being split.

    Yields:
        Tuple ``(train_indices, test_indices)`` of integer NumPy arrays.
    """
    for fold in folds:
        held_out = (train[fold_column] == fold).to_numpy()
        yield np.flatnonzero(~held_out), np.flatnonzero(held_out)

# X_test = test_data[feature_columns]
# y_test = test_data[target_column]

# Cross-validate on the predefined folds. The ids are sorted so that the
# split0/split1/... columns of cv_results_ refer to the same fold on every run
# (polars' unique() does not guarantee an order).
folds = train[fold_column].unique().sort().to_list()

grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l2"]}
cv = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=grid,
    # The splits are derived from X_train, which still carries the fold column.
    cv=get_cv_iterable(folds, fold_column, X_train),
    scoring=[
        "f1",
        "recall",
        "precision",
        "accuracy",
    ],
    refit="f1",
)

# The fold id only defines the splits; it is not a feature. Dropping it here
# keeps the search consistent with the manual fold training and with the final
# test-set evaluation, both of which exclude it. Row positions are unchanged,
# so the indices produced by get_cv_iterable stay valid.
cv.fit(X_train.drop(columns=[fold_column]), y_train)
with pd.option_context("display.max_columns", 33):
    display(pd.DataFrame(cv.cv_results_))


In [None]:
# Compact view of the search: parameter set and its mean F1 across folds.
logreg_cv_results = pl.DataFrame(cv.cv_results_)
logreg_cv_results.select("params", "mean_test_f1")

In [None]:
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier
# Search space: tree settings are addressed through the "estimator__" prefix,
# the number of boosting rounds directly on AdaBoost.
grid = {
    "estimator__criterion": ["gini", "entropy"],
    "estimator__splitter": ["best", "random"],
    "n_estimators": [1, 2],
}

DTC = DecisionTreeClassifier(random_state=seed)

cv2 = GridSearchCV(
    # random_state is set on AdaBoost as well: the ensemble re-seeds each base
    # tree from its own RNG, so seeding only the tree does not pin the results.
    estimator=AdaBoostClassifier(estimator=DTC, random_state=seed),
    param_grid=grid,
    # The splits are derived from X_train, which still carries the fold column.
    cv=get_cv_iterable(folds, fold_column, X_train),
    scoring=[
        "f1",
        "recall",
        "precision",
        "accuracy",
    ],
    refit="f1",
)

# The fold id only defines the splits; it is not a feature, and the final
# AdaBoost model is likewise trained without it.
cv2.fit(X_train.drop(columns=[fold_column]), y_train)
with pd.option_context("display.max_columns", 33):
    display(pd.DataFrame(cv2.cv_results_))



In [None]:
# Compact view of the search: parameter set and its mean F1 across folds.
adaboost_cv_results = pl.DataFrame(cv2.cv_results_)
adaboost_cv_results.select("params", "mean_test_f1")

In [None]:
# F1 of the logistic regression on the test split, using every X_train column
# except the fold id.
# NOTE(review): logistic_regressor is the object fitted inside the manual fold
# loop, i.e. on Float64 columns of all folds but the last one visited - confirm
# that these columns match and that this is the model meant to be evaluated.
logreg_feature_names = [name for name in X_train.columns if name != "fold"]
predictions = logistic_regressor.predict(test.select(logreg_feature_names))
test_labels = test["is_anomaly"].to_list()
f1_score(test_labels, predictions)

In [None]:
# Final AdaBoost model, trained on the full training split without the fold id.
# random_state is passed to AdaBoost itself: the ensemble re-seeds each base
# tree from its own RNG, so seeding only the tree would leave the
# splitter="random" trees dependent on whatever consumed the global RNG before.
ada_feature_names = [a for a in X_train.columns if "fold" != a]
clfAda = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=seed, criterion="entropy", splitter="random"),
    n_estimators=2,
    random_state=seed,
).fit(X_train[ada_feature_names], y_train)


In [None]:
# F1 of the AdaBoost model on the test split, using every X_train column
# except the fold id.
ada_test_feature_names = [name for name in X_train.columns if name != "fold"]
predictions = clfAda.predict(test.select(ada_test_feature_names))
test_labels = test["is_anomaly"].to_list()
f1_score(test_labels, predictions)