## Model Training and Evaluation

In [None]:
import pathlib
import tempfile

import IPython.display
import mlflow
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.impute
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.neural_network
import sklearn.pipeline
import sklearn.tree

import ml_colon
import ml_colon.data_preparation
import ml_colon.model

### Retrieving Data

We have implemented the data cleaning in the `ml_colon.data_preparation` module and use it to retrieve the "cleaned" DataFrame. By "cleaned" we mean that all rows we want to exclude from training have already been filtered out. No further rows will be excluded from here onwards.

In [None]:
# Load the already-filtered DataFrame through the project's data-preparation module.
df = ml_colon.data_preparation.get_clean_df_from_csv()

# Print the row count as a quick sanity check of what was loaded.
print(f"Loaded data set with {len(df)} rows.")

### Splitting Train / Test set

Next we split the data set into a train set and a test set.

In [None]:
# Fraction of the rows held out for the final evaluation.
test_size = 0.2  # 20% of rows

# Every column except the target variable is used as a model input.
features = [column for column in df.columns if column != ml_colon.TARGET_VARIABLE]

# Seeded split, so the same rows end up in train / test on every run.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df[features],
    df[ml_colon.TARGET_VARIABLE],
    random_state=ml_colon.SEED,
    test_size=test_size,
)

print(f"Train set: {len(y_train)} rows | Test set: {len(y_test)} rows")

### Classifiers and Parameters Definition
We will try different models with different hyperparameters and pick the best combination based on the accuracy of the model. The classifiers used are:

- K-Nearest Neighbor
- Random Forest 
- Multi-Layer Perceptron

This selection covers models of varying complexity. The K-Nearest Neighbor algorithm is the simple model. The Random Forest algorithm is an ensemble method and more powerful than K-Nearest Neighbors. Finally, neural networks currently receive a lot of attention for their ability to find interesting patterns in data, which is why the Multi-Layer Perceptron is included.

Each of these algorithms will be tested with different hyperparameters using grid search. The different values of the parameters are described below.

In [None]:
# Metric passed as `scoring` to the grid search and used as the MLflow metric name.
score_metric = "accuracy"
# Guard against typos: the metric must be listed in ml_colon.SCORE_METRICS.
assert score_metric in ml_colon.SCORE_METRICS

In [None]:
# Registry of the supported classifiers. Each entry holds an unfitted estimator
# and the hyperparameter grid explored by the grid search; the "classifier__"
# prefix targets the pipeline step named "classifier".
# Estimators with internal randomness are seeded with ml_colon.SEED so that
# repeated runs are reproducible, consistent with the seeded train / test split.
implemented_classifiers = {
    "k_neighbor": {
        "classifier": sklearn.neighbors.KNeighborsClassifier(),
        "param_grid": [{"classifier__n_neighbors": [5, 11, 15]}],
    },
    "random_forest": {
        "classifier": sklearn.ensemble.RandomForestClassifier(
            max_features=2, random_state=ml_colon.SEED
        ),
        "param_grid": [{"classifier__max_depth": [4, 8, 10], "classifier__n_estimators": [10, 15, 20]}],
    },
    "multilayer_perceptron": {
        "classifier": sklearn.neural_network.MLPClassifier(random_state=ml_colon.SEED),
        "param_grid": [
            {"classifier__alpha": [0.001, 0.01, 0.1, 0.5], "classifier__activation": ["identity", "relu"]}
        ],
    },
}

# Name of the registry entry to train in this run; must be one of the keys above.
classifier = "k_neighbor"
assert classifier in implemented_classifiers

#### Model Pipeline

Before a machine learning model can be trained, some data transformations need to be applied. For that we use a `sklearn.pipeline.Pipeline` to chain the data transformations, such as imputing missing values or scaling.

In [None]:
# Two-step pipeline: replace NaNs with the column mean, then apply the selected
# classifier. Further steps (scalers, etc.) can be added to the list.
pipeline = sklearn.pipeline.Pipeline(
    steps=[
        ("impute_nan", sklearn.impute.SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("classifier", implemented_classifiers[classifier]["classifier"]),
    ]
)

#### Grid Search

In [None]:
# Grid search over the selected classifier's parameter grid, scored by `score_metric`.
classifier_cv = sklearn.model_selection.GridSearchCV(
    pipeline,
    implemented_classifiers[classifier]["param_grid"],
    scoring=score_metric,
)

### Model Training and Evaluation 

We use the [mlflow](https://www.mlflow.org/) library to track each model training run and save the resulting plots.


In [None]:
# Store all MLflow runs under <OUTPUT_DIR>/mlruns. Path.as_uri() builds a
# well-formed file URI on every platform, whereas concatenating "file://" with
# the path is malformed for relative paths and Windows drive letters.
# resolve() comes first because as_uri() requires an absolute path.
mlruns_dir = pathlib.Path(ml_colon.OUTPUT_DIR, "mlruns").resolve()
mlflow.set_tracking_uri(mlruns_dir.as_uri())
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")  # where the outputs are stored

# All runs of this notebook are grouped under one experiment.
mlflow.set_experiment("ml_colon")

In [None]:
# One MLflow run: fit the grid search on the train set, log the selected
# parameters and scores, evaluate on the test set and store the plots as artifacts.
with mlflow.start_run() as run:

    print(f"Starting MlFlow run: {run.info.run_id}", "\n")
    mlflow.log_param("classifier", classifier)

    # The training of the model
    print(f"Training a {classifier} classifier")
    classifier_cv.fit(X_train, y_train)

    # Best parameters
    print("Best parameters:")
    print(classifier_cv.best_params_, "\n")

    # best_score_ is the cross-validated score of the best parameter set,
    # i.e. it is computed on the train set only.
    mlflow.log_params(classifier_cv.best_params_)
    mlflow.log_metric(score_metric, classifier_cv.best_score_)

    # Evaluation of results on the held-out test set
    print("Classification Report")
    y_pred = classifier_cv.predict(X_test)
    # Class probabilities; not used further in this cell.
    y_pred_proba = classifier_cv.predict_proba(X_test)
    report = sklearn.metrics.classification_report(y_test, y_pred, output_dict=True)

    IPython.display.display(pd.DataFrame(report).T)

    # Log the per-class metrics. NOTE(review): the report is keyed by the
    # stringified class labels, so this assumes the target holds the float
    # labels 0.0 and 1.0 -- confirm against the data preparation.
    for label in ["0.0", "1.0"]:
        for k, v in report[label].items():
            mlflow.log_metric(f"label_{label}_{k}", v)


    print("Plotting Confusion Matrix")
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
    cm_fig = sklearn.metrics.ConfusionMatrixDisplay(cm).plot()

    print("Plotting ROC curve")
    # plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
    # RocCurveDisplay.from_estimator is its replacement. Fall back to the old
    # helper only on installations that predate the new API.
    if hasattr(sklearn.metrics.RocCurveDisplay, "from_estimator"):
        roc_fig = sklearn.metrics.RocCurveDisplay.from_estimator(
            classifier_cv, X_test, y_test, name="ROC Curve"
        )
    else:
        roc_fig = sklearn.metrics.plot_roc_curve(
            classifier_cv, X_test, y_test, name="ROC Curve"
        )

    print("Writing Artifacts to MlFlow")
    # The figures are written to a throw-away directory first, because
    # mlflow.log_artifacts uploads the contents of a local directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_dir_path = pathlib.Path(tmp_dir)
        assert tmp_dir_path.exists()

        cm_fig.figure_.savefig(str(tmp_dir_path / "confusion_matrix.png"))
        roc_fig.figure_.savefig(str(tmp_dir_path / "roc.png"))

        mlflow.log_artifacts(str(tmp_dir_path), artifact_path="plots")


### MlFlow Dashboard

As explained in the `README.md`, you can now run the MlFlow dashboard by executing
```
cd output/
mlflow ui
```
and then accessing the dashboard in your browser at `http://127.0.0.1:5000`