In [None]:
import os

# Switch the working directory to the parent of the directory the kernel
# started in (presumably the project root, so that the relative data path and
# the `utils` import used further down resolve -- confirm against the repo
# layout). The starting directory is remembered on the first run and reused
# afterwards, so re-running this cell does not climb one more level each time
# the way a bare `os.chdir("..")` would.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [None]:
import fastai
from pathlib import Path
from typing import List, Tuple, Union
from sklearn.base import TransformerMixin
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from utils.io import load_data
from fastai.tabular.all import *

In [None]:
# `load_data` yields six objects; only the first four are kept here
# (judging by the names: train / validation features and targets), the
# remaining two are discarded.
(
    X_train,
    y_train,
    X_valid,
    y_valid,
    _,
    _,
) = load_data(Path("resources/data/transformed_numerical_columns"))

In [None]:
# Settings read by the cells below.
target_name = "loan_status"  # name of the target column
valid_sample_frac = 0.3  # fraction of rows sampled for validation
embedder_num_layers = 3  # number of hidden layers of the network
embedder_n_epochs = 5  # number of training epochs

In [None]:
# Balance the classes by random over-sampling of the training data.
# The sampler is seeded (same seed as the validation split further down) so
# that a fresh "Restart & Run All" reproduces the exact same resampled set;
# without `random_state` every run draws different rows.
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
# Features and target side by side in a single frame.
df_train = pd.concat((X_train, y_train), axis="columns")

# Feature columns are split by dtype: float64 columns are treated as
# continuous, every other dtype as categorical.
continuous_columns = X_train.select_dtypes(include=["float64"]).columns.tolist()
categorical_columns = X_train.select_dtypes(exclude=["float64"]).columns.tolist()

In [None]:
# Build the fastai tabular data loaders from the combined training frame.
# NOTE(review): the validation rows are sampled from `df_train`, which was
# over-sampled in an earlier cell, so duplicated rows can end up on both sides
# of the split -- confirm this is intended (X_valid / y_valid are loaded but
# not used here).
tabular_pandas = TabularDataLoaders.from_df(
    df_train,
    procs=[fastai.tabular.all.Categorify],
    cat_names=categorical_columns,
    cont_names=continuous_columns,
    y_names=target_name,
    # fastai's batch-size keyword is `bs`; `batchsize` is not a recognised
    # argument, so the intended 2048 was never applied.
    bs=2048,
    drop_last=False,
    # Fixed seed keeps the train/validation split stable across runs.
    valid_idx=df_train.sample(frac=valid_sample_frac, random_state=42).index,
)

In [None]:
def get_default_nn_layers(
    num_embeddings: int,
    num_continuous: int,
    num_outputs: int,
    num_layers: int = 2,
) -> List[int]:
    """Return hidden-layer widths that halve from one layer to the next.

    The first width is ``(2/3) * (num_embeddings + num_continuous) + num_outputs``
    rounded to the nearest multiple of ``2 ** (num_layers - 1)``, so that each
    of the following ``num_layers - 1`` halvings is exact. The first width is
    never smaller than ``2 ** (num_layers - 1)``, which guarantees that the
    last layer has at least one unit.

    Args:
        num_embeddings: Summed width of all embedding outputs.
        num_continuous: Number of continuous input features.
        num_outputs: Number of network outputs.
        num_layers: Number of hidden layers to size; must be at least 1.

    Returns:
        A list of ``num_layers`` plain ``int`` widths, largest first.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError(f"num_layers must be at least 1, got {num_layers}")

    num_input_nodes = num_embeddings + num_continuous
    # The first width must be a multiple of this for all halvings to be exact.
    granularity = 2 ** (num_layers - 1)
    # Clamp to one multiple: for small inputs the rounding would otherwise hit
    # zero and produce zero-width layers.
    multiples = max(
        1, round(((2 / 3) * num_input_nodes + num_outputs) / granularity)
    )
    # int() so callers get plain Python ints even when NumPy scalars come in.
    first_layer = int(granularity * multiples)

    return [first_layer // 2 ** n for n in range(num_layers)]

In [None]:
# Total embedding width: sum of the second item of every pair returned by
# `get_emb_sz`.
num_embeddings = sum(
    emb_width for _, emb_width in get_emb_sz(tabular_pandas)
)
# Number of distinct target values found in the loaders' `ys`.
num_classes = tabular_pandas.ys.nunique().values[0]
continuous_columns = tabular_pandas.cont_names

# Hidden-layer widths derived from the input / output sizes.
layers = get_default_nn_layers(
    num_embeddings=num_embeddings,
    num_continuous=len(continuous_columns),
    num_outputs=num_classes,
    num_layers=embedder_num_layers,
)

# One dropout probability per hidden layer (0.001 for the first, 0.01 for the
# rest), plus dropout on the embeddings.
config = fastai.tabular.all.tabular_config(
    embed_p=0.04,
    ps=[0.001, *([0.01] * (embedder_num_layers - 1))],
)

nn_model = fastai.tabular.all.tabular_learner(
    tabular_pandas,
    layers=layers,
    config=config,
    n_out=num_classes,
    loss_func=fastai.tabular.all.CrossEntropyLossFlat(),
    metrics=RocAucBinary(),
)

In [None]:
# `lr_find` returns a named tuple of learning-rate suggestions; keep only the
# "valley" suggestion as a plain scalar instead of passing the whole tuple on
# (which only works through implicit length-1 broadcasting).
valley = nn_model.lr_find().valley

In [None]:
# Train with the one-cycle schedule, using the value stored in `valley` as the
# maximum learning rate.
nn_model.fit_one_cycle(embedder_n_epochs, lr_max=valley)