# Train and Save a `sklearn.linear_model.LinearRegression` Model on the OpenML Adult Dataset
## Uses an even split of the dataset into 4 parts

## Default Values for Papermill Parameters

In [None]:
# Default values for the papermill parameters of this notebook.
PARAM_SEED: int = 0  # seed handed to the preprocessing function and to model instantiation
PARAM_DATA_IN_PATH: str = "../../data"  # input data directory; rewritten by util.prepend_experiment_output_path before use

PARAM_DATASET_NAME: str = "OpenML Adult"  # dataset to load; converted with to_DatasetName
PARAM_MODEL_NAME: str = "model"  # name under which the model is serialized; also converted with to_ModelName
PARAM_MODEL_TYPE: str = "sklearn_linear_model_LinearRegression"  # model kind; converted with to_ModelType

# Forwarded to instantiate() as `sklearn_mlpclassifier_hidden_layer_sizes`.
# NOTE(review): presumably unused for the LinearRegression model type - confirm.
PARAM_HIDDEN_LAYER_SIZES: list = [100, 100, 100]

## Prepare constant variables that will be used throughout the notebook.

In [None]:
from subroc import util
import numpy as np
from matplotlib import pyplot as plt

from subroc.model_serialization import serialize, to_ModelName
from subroc.model_abstraction import to_ModelType, instantiate
from subroc.datasets.metadata import to_DatasetName
from subroc.datasets.reader import DatasetReader, DatasetStage

import os

# Fill environment variables into params: the input path is rewritten by the
# project helper before anything reads from it.
PARAM_DATA_IN_PATH = util.prepend_experiment_output_path(PARAM_DATA_IN_PATH)

# Root directory for everything this notebook writes. Falls back to "../.."
# when the STAGE_OUTPUT_PATH environment variable is not set.
STAGE_OUTPUT_PATH = os.environ.get("STAGE_OUTPUT_PATH", "../..")

# Reader and dataset identifier used to load the raw data later on.
DATASET_READER = DatasetReader(PARAM_DATA_IN_PATH)
DATASET_NAME = to_DatasetName(PARAM_DATASET_NAME)

# Directory for the preprocessed data and prediction CSV files.
# exist_ok=True makes creation idempotent and avoids the race between a
# separate existence check and the directory creation.
DATA_OUT_PATH = f"{STAGE_OUTPUT_PATH}/data/processed"
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Target location and name for the serialized model.
MODELS_OUT_PATH = f"{STAGE_OUTPUT_PATH}/models"
MODEL_NAME = to_ModelName(PARAM_MODEL_NAME)

MODEL_TYPE = to_ModelType(PARAM_MODEL_TYPE)

## Prepare the Data and Save the Preprocessing Result

In [None]:
from subroc.preprocessing import preprocessing_functions

# Load the raw dataset and run the preprocessing function, which returns the
# processed frame together with four sets of positional row indices.
# NOTE(review): the preprocessing function is looked up by MODEL_NAME, not
# DATASET_NAME - confirm this is the intended key.
data, dataset_meta = DATASET_READER.read_dataset(DATASET_NAME, DatasetStage.RAW)
data, indices_1, indices_2, indices_3, indices_4 = preprocessing_functions[MODEL_NAME](data, dataset_meta, PARAM_SEED)

# Separate the features from the ground-truth column.
data_x = data.loc[:, data.columns != dataset_meta.gt_name]
data_y = data[dataset_meta.gt_name]

# Split 1 is used for training, split 2 for testing; splits 3 and 4 are
# holdouts that are only written to disk in this notebook.
train_x = data_x.iloc[indices_1]
train_y = data_y.iloc[indices_1]
test_x = data_x.iloc[indices_2]
test_y = data_y.iloc[indices_2]

print(f"train split size: {len(train_y)}")
print(f"test split size: {len(test_y)}")
print(f"significance holdout split size: {len(indices_3)}")
print(f"generalizability holdout split size: {len(indices_4)}")

# Save the preprocessed data, one CSV file per split.
# makedirs(exist_ok=True) is idempotent and also creates intermediate
# directories, so it keeps working if dataset_dir contains a nested path.
out_path = DATA_OUT_PATH + "/" + dataset_meta.dataset_dir
os.makedirs(out_path, exist_ok=True)

model_ready_splits = {
    "train": indices_1,
    "test": indices_2,
    "holdout_significance": indices_3,
    "holdout_generalizability": indices_4,
}
for split_name, split_indices in model_ready_splits.items():
    data.iloc[split_indices].to_csv(f"{out_path}/model_ready_{split_name}.csv", index=False)

## Train the Model

In [None]:
# Collect every label value that occurs in the train or test split, cast to
# int64, so the model is told about all classes up front.
observed_labels = np.append(train_y, test_y).astype("int64")
observed_classes = np.unique(observed_labels)

# Build the model of the configured type from the training split.
# NOTE(review): presumably instantiate() also fits the model on
# train_x/train_y, since no separate fit call follows - confirm.
# NOTE(review): the hidden-layer keyword is named for an MLP classifier and is
# presumably ignored for the LinearRegression model type - confirm.
model = instantiate(
    MODEL_TYPE,
    train_x,
    train_y,
    seed=PARAM_SEED,
    classes=observed_classes,
    sklearn_mlpclassifier_hidden_layer_sizes=PARAM_HIDDEN_LAYER_SIZES,
)

## Evaluate

In [None]:
from sklearn.metrics import RocCurveDisplay, precision_recall_curve
from subroc.util import print_metric_colored
from subroc.quality_functions.sklearn_metrics import soft_classification_metrics
from termcolor import cprint

# Score the test split; the model outputs are used as soft classification
# scores by the metrics and curves below.
test_predictions = model.predict(test_x)

# Convert the labels once: the conversion does not depend on the metric, so it
# belongs outside the loop and outside the try body.
test_y_numpy = test_y.to_numpy()

# Report every soft-classification metric. A metric that rejects the input
# with a ValueError is reported in red instead of aborting the notebook.
for metric in soft_classification_metrics:
    try:
        metric_value = metric(test_y_numpy, test_predictions)
        print_metric_colored(metric.__name__, metric_value)
    except ValueError:
        cprint(f"{metric.__name__}: ValueError", color="red")

# ROC curve of the test predictions.
RocCurveDisplay.from_predictions(test_y, test_predictions)
plt.title("ROC Curve")
plt.show()

# Precision-recall curve of the test predictions; the thresholds returned by
# precision_recall_curve are not needed for the plot.
precision, recall, _ = precision_recall_curve(test_y, test_predictions, drop_intermediate=True)
plt.plot(recall, precision)
plt.title("Precision-Recall Curve")
plt.show()

## Serialize the Model

In [None]:
# Persist the model under MODELS_OUT_PATH using the configured model name.
# NOTE(review): the raw PARAM_MODEL_NAME string is passed here although the
# converted MODEL_NAME (via to_ModelName) is available - confirm which one
# serialize() expects.
# NOTE(review): MODELS_OUT_PATH is never created in this notebook; presumably
# serialize() creates it - confirm.
serialize(model, MODELS_OUT_PATH, PARAM_MODEL_NAME)

## Save Predictions

In [None]:
# Attach the model output for every row as the score column.
all_scores = model.predict(data_x)
data[dataset_meta.score_name] = all_scores

# Write one CSV file per split, now including the score column, next to the
# previously saved "model_ready_*" files.
predicted_splits = (
    ("train", indices_1),
    ("test", indices_2),
    ("holdout_significance", indices_3),
    ("holdout_generalizability", indices_4),
)
for split_label, row_positions in predicted_splits:
    target_file = out_path + "/" + f"model_predicted_{split_label}.csv"
    data.iloc[row_positions].to_csv(target_file, index=False)