In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import utility as ut

from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

## Read data
- randomly split data into training and test sets (validation is done via cross-validation on the training set)

In [None]:
# TODO: set the name of the target column before running this cell.
target_column = ""

df = pd.read_csv("data.csv")

# Features are every column except the target; the target is the label vector.
X = df.drop(target_column, axis=1)
y = df[target_column]

# Fix the random seed so the train/test split is identical on every
# Restart & Run All; otherwise scores are not comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

## Data cleaning
- correct or drop obvious outliers
- impute categorical data: use most common value (maybe on some relevant subset)
- impute numerical data: use mean or median value (maybe on some relevant subset)

In [None]:
# Names of the columns to remove before any preprocessing (none by default).
features_to_drop = []

# Project helper from `utility`, constructed with the list of columns to drop.
features_dropper = ut.DropColumns(features_to_drop)

# Single-step cleaning pipeline; further cleaning steps can be appended here.
cleaning_pipeline = Pipeline(
    steps=[
        ("drop unneccesary features", features_dropper),
    ]
)

## Data Preprocessing
- do we need to log-scale skewed numerical features?
- do we need to scale numerical features?
- do we need to one-hot encode categorical features?

In [None]:
# Columns listed here are passed to the project helper ut.ApplyFunction
# together with np.log1p (none by default).
skewed_numerical_features = []
log_transformer = ut.ApplyFunction(skewed_numerical_features, np.log1p)

# Numerical columns: fill in missing values, then standardise.
numerical_features = make_column_selector(dtype_include=np.number)
numerical_transformer = Pipeline([
    # NOTE: the step label says "simple impute" but the estimator is a
    # KNNImputer; the label is kept unchanged so existing references to the
    # step name keep working.
    ("simple impute", KNNImputer()),
    ("standard scale", StandardScaler())
])

# Categorical columns: impute FIRST, then one-hot encode. Imputing after the
# encoder would operate on the 0/1 indicator matrix instead of the raw
# categories, so missing values would never be replaced by the most frequent
# category.
categorical_features = make_column_selector(dtype_include=object)
categorical_transformer = Pipeline([
    ("simple impute", SimpleImputer(strategy="most_frequent")),
    # NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
    # switch the keyword when upgrading.
    ("one-hot encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
])

# Route each column group through its own transformer.
column_transformer = make_column_transformer(
    (numerical_transformer, numerical_features),
    (categorical_transformer, categorical_features),
)

# The log transform runs before the column-wise transformers.
preprocessing_pipeline = Pipeline([
    ("log-scale skewed numerical features", log_transformer),
    ("transform columns", column_transformer)
])

## Model selection
- make sure you understand the metric used to select the model
- start with a simple, fast-to-train model and see how it performs
- if necessary, use more advanced models

In [None]:
# Baseline estimator: simple and fast to train.
# TODO: replace with the model under evaluation and fill in `grid` with the
# hyper-parameters to search.
model = LogisticRegression()
grid = {}

# 10-fold cross-validated grid search over `grid`.
model_cv = GridSearchCV(
    model,
    grid,
    cv=10,
)

# Full pipeline: cleaning -> preprocessing -> cross-validated model.
# NOTE(review): because the search is the last pipeline step, the cleaning and
# preprocessing steps are fitted once on the full training set before the
# inner cross-validation runs, so the CV folds share fitted imputation and
# scaling statistics. Wrap the whole pipeline in GridSearchCV if that matters.
model_pipeline = Pipeline([
    ("data cleaning", cleaning_pipeline),
    ("data preprocessing", preprocessing_pipeline),
    ("model", model_cv),
])

## Model prediction and validation
- train on whole training data set (train + validation set)
- test on test data set

In [None]:
# Fit cleaning, preprocessing and the cross-validated model on the training set.
model_pipeline.fit(X_train, y_train)

# Training score is kept for reference only (a large gap to the test score
# hints at overfitting).
train_score = model_pipeline.score(X_train, y_train)

# Final evaluation on the held-out test set, which was not used during fitting.
score = model_pipeline.score(X_test, y_test)