# Data Exploration

In [None]:
%load_ext autoreload
%autoreload 2

from assaiku.data import DataConfig

from assaiku.model.configs import EvaluationConfig
from assaiku.model.evaluation import evaluate_model
from assaiku.model.processors import (
    fit_processor,
    initialize_feat_processor,
    split_transform,
)
from assaiku.model.train import initialize_model, train_model
from assaiku.model.evaluation import analyze_data
from assaiku.model.configs import LogisticRegressionConfig, EvaluationConfig, LinearSVMConfig, XGBConfig
from assaiku.model import MLPipe

import pandas as pd

# Let pandas display up to 50 columns when rendering a DataFrame.
pd.set_option("display.max_columns", 50)

# Data configuration, created with the exploration flag switched on.
data_config = DataConfig(perform_exploration=True)

# Configuration of the single model trained step by step in the cells below.
# NOTE(review): dimension_red=None presumably disables dimensionality
# reduction — confirm against XGBConfig.
model_config = XGBConfig(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    dimension_red=None,
)

## Loading data generated by Data Pipeline

In [None]:
# Read the train and test sets from the parquet locations held by the data config.
train_data = pd.read_parquet(data_config.train_data_out)
test_data = pd.read_parquet(data_config.test_data_out)

## Initialize the data processor and fit it

In [None]:
# Build the feature processor and the label binarizer from the data and model configs.
feat_processor, label_binarizer = initialize_feat_processor(
    data_config=data_config,
    model_config=model_config,
)

# Fit the processor pipeline using the training data and the configured feature columns.
fit_processor(
    train_data=train_data, feature_cols=data_config.features, pipeline=feat_processor
)

## Split and transform data for each data set

In [None]:
# Apply the same processor and label binarizer to both data sets; each call
# yields three values, bound here as inputs (x), targets (y) and weights (w).
x_train, y_train, w_train = split_transform(
    train_data, feat_processor, label_binarizer, data_config=data_config
)
x_test, y_test, w_test = split_transform(
    test_data, feat_processor, label_binarizer, data_config=data_config
)

## Train model

In [None]:
# Create the model described by the model config.
model = initialize_model(model_config=model_config)

# Train it on the transformed training set, passing the training weights along.
train_model(
    model_config=model_config,
    model=model,
    x_train=x_train,
    y_train=y_train,
    weights=w_train,
)

## Evaluate model

In [None]:
# Evaluate the trained model on the training set...
# NOTE(review): the two returned values look like per-class (0 / 1) results —
# confirm against evaluate_model.
train_perf_0, train_perf_1 = evaluate_model(
    model=model,
    model_name=model_config.name,
    data_set="train",
    x=x_train,
    y=y_train,
    weights=w_train,
)

# ...and on the held-out test set, using the same model and model name.
test_perf_0, test_perf_1 = evaluate_model(
    model=model,
    model_name=model_config.name,
    data_set="test",
    x=x_test,
    y=y_test,
    weights=w_test,
)

## Compare different models in one line

In [None]:
# Evaluation setup: edit the list below to choose which model configurations
# are compared.
# NOTE(review): n_repet is presumably the number of repetitions per model —
# confirm against EvaluationConfig.
eval_config = EvaluationConfig(
    n_repet=2,
    model_configs=[
        XGBConfig(
            n_estimators=100,
            max_depth=7,
            learning_rate=0.1,
            dimension_red=None,
        ),
        XGBConfig(
            weight_neg_factor=1,
            weight_pos_factor=1,
            dimension_red=50,
        ),
        LinearSVMConfig(rbf_gamma=5e-5, C=100),
        LinearSVMConfig(),
        LogisticRegressionConfig(),
        LogisticRegressionConfig(dimension_red=50),
    ],
)

# Run the whole pipeline with the data config and the evaluation config above.
ml_pipeline = MLPipe(data_config=data_config, evaluation_config=eval_config)

data = ml_pipeline.run()