## Model Training and Evaluation

In [None]:
import pathlib
import pickle
import tempfile
import time

import IPython.display
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import sklearn
import sklearn.compose
import sklearn.ensemble
import sklearn.impute
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.neural_network
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.tree
from sklearn.feature_selection import SelectKBest, chi2

import ml_colon
import ml_colon.data_preparation

In [None]:
# Collects one entry per trained model (appended by `run_model`) so that the
# models can be compared with each other after training.
trained_models = []

### Retrieving Data

We have implemented the data cleaning in the `ml_colon.data_preparation` module and use it to retrieve the "cleaned" DataFrame. By "cleaned" we mean that we have filtered out all rows that we want to exclude from training. 

In [None]:
# Load the cleaned data set through the project's data preparation module.
df = ml_colon.data_preparation.get_clean_df_from_csv()

# Report the number of rows that are available for the train / test split.
print(f"Loaded data set with {len(df)} rows.")

### Splitting Train / Test set

Next we split the data set into the train / test set.

In [None]:
test_size = 0.2 # 20% of rows
# Every column except the target variable is a candidate feature.
possible_features = [c for c in df.columns if c != ml_colon.TARGET_VARIABLE]

# Seeded split so that the same rows end up in the train / test set on every run.
# NOTE(review): the split is not stratified; given the imbalanced target it may be
# worth passing `stratify=` here - confirm whether this is intended.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df[possible_features],
    df[ml_colon.TARGET_VARIABLE],
    test_size=test_size,
    random_state=ml_colon.SEED,
)

# Show the size and the class distribution of both sets.
print(f"Train set: {len(y_train)} rows | Test set: {len(y_test)} rows\n")
print(f"Train set grouped by relevant: \n{y_train.value_counts()}\n")
print(f"Test set grouped by relevant: \n{y_test.value_counts()}")

#### Upsampling

We are dealing with a data set that is imbalanced with respect to the target variable. Any estimator can reach 82% accuracy by simply always predicting 1.
Reminder that:
\begin{equation}
accuracy = \dfrac{TP + TN}{P + N} 
\end{equation}

Therefore we need to upsample the rows in our dataframe with relevant = 0

In [None]:
def upsample(df: pd.DataFrame, n: int, random_state=None) -> pd.DataFrame:
    """Resample the DataFrame so that both "relevant" classes are balanced.

    Rows are drawn with replacement. Every row with ``relevant == 1`` gets the
    sampling weight ``0.5 / <number of rows with relevant == 1>`` and every row
    with ``relevant == 0`` gets ``0.5 / <number of rows with relevant == 0>``,
    i.e. each of the two classes carries half of the total sampling weight.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with a ``relevant`` column holding the values 0 and 1.
    n : int
        number of rows in resulting DataFrame
    random_state : optional
        Forwarded unchanged to ``DataFrame.sample``. Pass a fixed seed to make
        the resampling reproducible; the default ``None`` keeps the previous
        (unseeded) behaviour.

    Returns
    -------
    pd.DataFrame
        the upsampled DataFrame

    Raises
    ------
    ZeroDivisionError
        If ``df`` has no row with ``relevant == 0`` or no row with
        ``relevant == 1``.
    """
    # int(...) keeps the plain-Python division, so an absent class raises
    # ZeroDivisionError instead of silently producing an infinite weight.
    n_relevant_1 = int((df.relevant == 1).sum())
    n_relevant_0 = int((df.relevant == 0).sum())

    weight_relevant_1 = 0.5 / n_relevant_1
    weight_relevant_0 = 0.5 / n_relevant_0

    # Translate every target value into the sampling weight of its class.
    relevant_weighted = df.relevant.replace(
        {
            0: weight_relevant_0,
            1: weight_relevant_1,
        }
    )
    return df.sample(
        n=n,
        replace=True,
        weights=relevant_weighted,
        random_state=random_state,
    )

In [None]:
# Work on a copy: assigning the target column to `X_train` itself would modify
# the feature frame in place (and can trigger pandas' SettingWithCopyWarning).
train_df = X_train.copy()
train_df[ml_colon.TARGET_VARIABLE] = y_train

# Resample to the original number of training rows, now with balanced classes.
train_df = upsample(train_df, n=len(X_train))

# Split the upsampled frame back into features and target.
X_train = train_df[possible_features]
y_train = train_df[ml_colon.TARGET_VARIABLE]

print(f"Train set: {len(y_train)} rows")
print(f"Train set grouped by relevant: \n{y_train.value_counts()}\n")

### Feature Selection

To speed up the model training and to start with "simpler" and potentially more explainable models, we limit the number of features / columns we use.

In [None]:
# Boolean mask over `possible_features` marking the 10 columns with the
# highest chi-squared score with respect to the target.
selection = (
    SelectKBest(score_func=chi2, k=10)
    .fit(X_train, y_train)
    .get_support()
)
features = [
    column
    for column, is_selected in zip(possible_features, selection)
    if is_selected
]

# Independent copies of the reduced train / test feature frames.
X_train_limited_features = X_train[features].copy()
X_test_limited_features = X_test[features].copy()

In [None]:
# Show which columns were kept by the feature selection.
print(f"Columns to be used as features: {features}")

As already described in the `data_exploration.ipynb` notebook, these are the 10 features that will hopefully yield the best results.

### Classifiers and Parameters Definition
We will try different models to get the best outcome with different hyperparameters based on the accuracy of the model. The different classifiers used are: 

- K-Nearest Neighbor
- Random Forest 
- Multi-Layer Perceptron
- Gradient Boosting

With this selection, models of varying complexity are covered. There is one simple model, namely the K-Nearest Neighbor algorithm. The Random Forest algorithm is an ensemble method and a more powerful method than K-Nearest Neighbors. Nowadays, there is a big hype around neural networks and their power of finding interesting patterns in data, therefore the Multi-Layer Perceptron is included. Lastly, we also used a gradient boosting classifier, which is also an ensemble method and similar to Random Forest.

Each of these algorithms will be tested with different hyperparameters using grid search. The different values of the parameters are described below.

In [None]:
# Registry of all classifiers that can be trained. Each entry holds
#   "classifier": the (unfitted) estimator that becomes the last pipeline step
#   "param_grid": the hyperparameter grid for the grid search; the keys use the
#                 "classifier__" prefix because the estimator lives in the
#                 pipeline step named "classifier".
# The estimators that use randomness are seeded with `ml_colon.SEED` (the same
# seed as the train / test split) so that repeated trainings are comparable.
implemented_classifiers = {
    "k_neighbor": {
        "classifier": sklearn.neighbors.KNeighborsClassifier(),
        "param_grid": [{"classifier__n_neighbors": [3, 5, 11, 15, 20]}],
    },
    "random_forest": {
        "classifier": sklearn.ensemble.RandomForestClassifier(
            max_features=2, random_state=ml_colon.SEED
        ),
        "param_grid": [
            {
                "classifier__max_depth": [4, 8, 10, 12, 14, 16, 18, 20],
                "classifier__n_estimators": [10, 15, 20, 25, 30, 35, 40, 45, 50, 55],
            }
        ],
    },
    "multilayer_perceptron": {
        "classifier": sklearn.neural_network.MLPClassifier(random_state=ml_colon.SEED),
        "param_grid": [
            {
                "classifier__alpha": [0.001, 0.01, 0.1, 0.5],
                "classifier__activation": ["identity", "relu"],
            }
        ],
    },
    "gradient_boosting": {
        "classifier": sklearn.ensemble.GradientBoostingClassifier(
            random_state=ml_colon.SEED
        ),
        "param_grid": [
            {
                "classifier__learning_rate": [0.05, 0.1, 0.2],
                "classifier__n_estimators": [100, 150, 200],
            }
        ],
    },
}

#### Model Pipeline

Before a machine learning model can be trained some data transformations need to be done. For that we use a `sklearn.pipeline.Pipeline` to chain the data transformations such as imputing missing values or scaling.

In [None]:
def build_model_pipeline(classifier: str, X_train: pd.DataFrame) -> sklearn.pipeline.Pipeline:
    """Build the preprocessing + classifier pipeline for one classifier.

    The pipeline consists of three steps:

    1. ``imputer``: replaces missing values by the column mean,
    2. ``scaler``: min-max scales the float columns to ``[0, 1]`` and passes all
       other columns through unchanged,
    3. ``classifier``: the estimator registered under ``classifier`` in
       ``implemented_classifiers``.

    Parameters
    ----------
    classifier : str
        Key of the estimator in ``implemented_classifiers``.
    X_train : pd.DataFrame
        Training features; only used to determine which columns are floats.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.

    Raises
    ------
    KeyError
        If ``classifier`` is not a key of ``implemented_classifiers``.
    """
    # `np.float` was only an alias of the builtin `float` and has been removed
    # from NumPy (deprecated in 1.20, removed in 1.24); the builtin selects the
    # same columns. Positional indices are used instead of column names because
    # the scaler runs after the imputer step rather than on the original frame.
    continuous_column_indices = X_train.columns.get_indexer(
        X_train.select_dtypes(include=float).columns
    )

    pipeline = sklearn.pipeline.Pipeline(
        steps=[
            (
                # this is not strictly necessary as we throw out all rows with
                # missing values in the data cleaning
                "imputer",
                sklearn.impute.SimpleImputer(missing_values=np.nan, strategy="mean"),
            ),
            (
                "scaler",
                sklearn.compose.make_column_transformer(
                    (
                        sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1)),
                        continuous_column_indices,
                    ),
                    remainder="passthrough",
                ),
            ),
            (
                "classifier",
                implemented_classifiers[classifier]["classifier"],
            ),
        ]
    )

    return pipeline

#### Grid Search

In [None]:
def build_grid_search(
    pipeline: sklearn.pipeline.Pipeline,
    classifier=None,
    scoring=None,
) -> sklearn.model_selection.GridSearchCV:
    """Wrap a model pipeline in a grid search over its registered parameter grid.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline whose last step is one of the estimators registered in
        ``implemented_classifiers``.
    classifier : str, optional
        Key in ``implemented_classifiers`` whose ``param_grid`` is searched.
        When omitted, the key is derived from the type of the pipeline's last
        step, so the function no longer depends on a module-level variable
        that happens to be called ``classifier``.
    scoring : str, optional
        Scoring metric for the grid search. Defaults to the notebook-level
        ``score_metric``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The unfitted grid search.

    Raises
    ------
    ValueError
        If ``classifier`` is omitted and the pipeline's last step does not match
        exactly one entry of ``implemented_classifiers``.
    """
    if classifier is None:
        final_estimator = pipeline.steps[-1][1]
        # Match by type so the lookup also works for a clone of the registered estimator.
        matching_names = [
            name
            for name, spec in implemented_classifiers.items()
            if type(spec["classifier"]) is type(final_estimator)
        ]
        if len(matching_names) != 1:
            raise ValueError(
                "Could not determine the parameter grid for pipeline step "
                f"{final_estimator!r}; pass `classifier` explicitly."
            )
        classifier = matching_names[0]

    if scoring is None:
        scoring = score_metric

    classifier_cv = sklearn.model_selection.GridSearchCV(
        estimator=pipeline,
        param_grid=implemented_classifiers[classifier]["param_grid"],
        scoring=scoring,
    )
    return classifier_cv

### Model Training and Evaluation 

We use [mlflow](https://www.mlflow.org/) library to track each model training and save the resulting plots.


In [None]:
# Keep all MLflow runs in the "mlruns" folder below the project's output directory.
mlflow.set_tracking_uri(f"file://{ml_colon.OUTPUT_DIR / 'mlruns'}")
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")  # where the outputs are stored


In [None]:
def run_model(
    classifier: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
):
    """Train one classifier with grid search and track everything in MLflow.

    The function builds the pipeline and the grid search for ``classifier``,
    fits it on the training data, evaluates the best model on the test data
    (classification report, confusion matrix, ROC curve) and logs parameters,
    metrics, plots and the pickled grid search to a new MLflow run. The fitted
    grid search is also appended to the module-level ``trained_models`` list.

    Parameters
    ----------
    classifier : str
        Key of the classifier in ``implemented_classifiers``.
    X_train, y_train : pd.DataFrame, pd.Series
        Training features and target.
    X_test, y_test : pd.DataFrame, pd.Series
        Test features and target used for the evaluation.

    Returns
    -------
    tuple
        ``(run_id, experiment_id)`` of the MLflow run that was created.
    """
    start_time = time.time()

    pipeline = build_model_pipeline(classifier, X_train)
    classifier_cv = build_grid_search(pipeline)

    with mlflow.start_run() as run:
        run_id = run.info.run_id
        experiment_id = run.info.experiment_id
        print(f"Starting MlFlow run {run_id} for experiment {experiment_id}", "\n")

        mlflow.log_param("classifier", classifier)
        mlflow.log_param("number_of_features", len(X_train.columns))
        mlflow.log_param("number_of_rows_train", len(X_train))
        mlflow.log_param("number_of_rows_test", len(X_test))

        # The training of the model
        print(f"Training a {classifier} classifier")
        classifier_cv.fit(X_train, y_train)

        # Best parameters
        print("Best parameters:")
        print(classifier_cv.best_params_, "\n")

        mlflow.log_params(classifier_cv.best_params_)
        mlflow.log_metric(score_metric, classifier_cv.best_score_)

        # Evaluation of results
        print("Classification Report")
        y_pred = classifier_cv.predict(X_test)
        report = sklearn.metrics.classification_report(y_test, y_pred, output_dict=True)

        IPython.display.display(pd.DataFrame(report).T)

        # Log the per-class metrics. The report keys are the string form of the
        # class labels, so derive them from the fitted model instead of assuming
        # float labels ("0.0" / "1.0"); labels missing from the report are skipped.
        for label in map(str, classifier_cv.classes_):
            for metric_name, metric_value in report.get(label, {}).items():
                mlflow.log_metric(f"label_{label}_{metric_name}", metric_value)

        print("Plotting Confusion Matrix")
        cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
        cm_fig = sklearn.metrics.ConfusionMatrixDisplay(cm).plot()
        plt.show()

        print("Plotting ROC curve")
        roc_name = f"ROC Curve {classifier}"
        # `plot_roc_curve` was removed from scikit-learn in favour of
        # `RocCurveDisplay.from_estimator`; fall back to the old function on
        # versions that do not have the new constructor yet.
        if hasattr(sklearn.metrics.RocCurveDisplay, "from_estimator"):
            roc_fig = sklearn.metrics.RocCurveDisplay.from_estimator(
                classifier_cv, X_test, y_test, name=roc_name
            )
        else:
            roc_fig = sklearn.metrics.plot_roc_curve(
                classifier_cv, X_test, y_test, name=roc_name
            )
        plt.show()

        print("Writing Artifacts to MlFlow")
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_dir_path = pathlib.Path(tmp_dir)

            # Saving plots
            cm_fig.figure_.savefig(str(tmp_dir_path / "confusion_matrix.png"))
            roc_fig.figure_.savefig(str(tmp_dir_path / "roc.png"))

            # Saving Model
            with open(tmp_dir_path / "model.pkl", "wb") as model_file:
                pickle.dump(classifier_cv, model_file)

            mlflow.log_artifacts(str(tmp_dir_path))

        # The figures are saved by now; release them so that repeated runs do
        # not accumulate open figures.
        plt.close(cm_fig.figure_)
        plt.close(roc_fig.figure_)

        # store trained model in list for later evaluation
        trained_models.append(
            {
                "name": classifier,
                "classifier": classifier_cv,
                "features": list(X_train.columns),
                "X_test": X_test,
                "y_test": y_test,
            }
        )

        # Must be logged while the run is still active: after the `with` block
        # has exited, `mlflow.log_metric` would implicitly start a new run and
        # leave it open.
        run_time = time.time() - start_time
        mlflow.log_metric("run_time_in_seconds", run_time)

    print(f"Finished in {run_time} seconds")

    return run_id, experiment_id


#### It's time for training!

We want to train each classifier twice: once with the preselected features and once with all columns as input.

In [None]:
# Metric that the grid search optimises; it has to be one the project knows about.
score_metric = "accuracy"
assert score_metric in ml_colon.SCORE_METRICS

# First experiment: train every registered classifier on the preselected features.
run_id = None
mlflow.set_experiment("ml_colon_limited_features")
for classifier in implemented_classifiers:
    run_id, experiment_id = run_model(
        classifier,
        X_train_limited_features,
        y_train,
        X_test_limited_features,
        y_test,
    )
    print("-------------------------------------------------------")

    



In [None]:
# Second experiment: train every registered classifier on all available columns.
run_id = None
mlflow.set_experiment("ml_colon_all_features")
for classifier in implemented_classifiers:
    run_id, experiment_id = run_model(
        classifier,
        X_train,
        y_train,
        X_test,
        y_test,
    )
    print("-------------------------------------------------------")



### Compare Different Models

Once we have a few models trained, we can compare their Receiver Operating Characteristic (ROC) curves to see how well they perform.



In [None]:
# Draw the ROC curves of all trained models into one shared axes object.
fig, ax = plt.subplots(figsize=(15, 12))

# `plot_roc_curve` was removed from scikit-learn in favour of
# `RocCurveDisplay.from_estimator`; fall back to the old function on versions
# that do not have the new constructor yet.
if hasattr(sklearn.metrics.RocCurveDisplay, "from_estimator"):
    plot_roc = sklearn.metrics.RocCurveDisplay.from_estimator
else:
    plot_roc = sklearn.metrics.plot_roc_curve

for model in trained_models:
    plot_roc(
        model["classifier"],
        model["X_test"],
        model["y_test"],
        # Label: the best fitted classifier plus the number of features it was trained on.
        name=f"{model['classifier'].best_estimator_.steps[-1][1]}_#features_{len(model['features'])}",
        ax=ax,
    )

# Diagonal reference line of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

# The legend was last drawn before the "Chance" line was added; redraw it so
# the reference line is listed as well.
ax.legend(loc="lower right")

 
#### MlFlow Dashboard

As explained in the `README.md` you can now run the MlFlow dashboard by executing
```
cd output/
mlflow ui
```
and then accessing the dashboard through your browser

### Loading a previously trained model

To load a previously trained model, all one needs to specify is the experiment_id and the run_id of the run in which the model was trained.


In [None]:
def load_model_from_run_id(experiment_id, run_id, output_dir=None):
    """Load the pickled model that was stored as an artifact of an MLflow run.

    The model is expected under
    ``<output_dir>/mlruns/<experiment_id>/<run_id>/artifacts/model.pkl``.

    Parameters
    ----------
    experiment_id
        Id of the MLflow experiment the run belongs to (converted with ``str``).
    run_id : str
        Id of the MLflow run in which the model was trained.
    output_dir : str or pathlib.Path, optional
        Directory that contains the ``mlruns`` folder. Defaults to
        ``ml_colon.OUTPUT_DIR``.

    Returns
    -------
    object
        The unpickled model.

    Raises
    ------
    FileNotFoundError
        If no ``model.pkl`` exists for the given experiment / run.
    """
    if output_dir is None:
        output_dir = ml_colon.OUTPUT_DIR

    model_path = (
        pathlib.Path(output_dir)
        / "mlruns"
        / str(experiment_id)
        / str(run_id)
        / "artifacts"
        / "model.pkl"
    )
    if not model_path.exists():
        # Put the path into the exception itself so that it is not lost when
        # the error is caught or logged elsewhere.
        raise FileNotFoundError(f"Could not find pickled model under {model_path}")

    # NOTE: unpickling executes arbitrary code - only load model files that
    # were written by a trusted training run.
    with open(model_path, "rb") as model_file:
        return pickle.load(model_file)

In [None]:

# Reload the model using the ids returned by the most recent `run_model` call.
reloaded_model = load_model_from_run_id(experiment_id, run_id)

# The pickled object is the grid search; `.estimator` is the pipeline it was built with.
print(reloaded_model.estimator)