# 2.1-agifford-TemplateModelSearch
This notebook tests and templates the code necessary to build a model to predict `label_group` given frequency features of the various measurement data.

In [None]:
from dotenv import find_dotenv, load_dotenv
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
import gc
from datetime import datetime

import pandas as pd
import os
load_dotenv(find_dotenv())

In [None]:
# Feature store connection settings, read from the environment.
# FEATURIZE_ID is the featurize_id value used to filter feature rows below.
FEATURE_STORE_URI = os.environ.get("FEATURE_STORE_URI", "localhost:5432")
FEATURE_STORE_PW = os.environ.get("FEATURE_STORE_PW")
FEATURIZE_ID = os.environ.get("FEATURIZE_ID")

# MLflow tracking server address (host:port) and password.
MLFLOW_DB_URI = os.environ.get("MLFLOW_DB_URI", "localhost:5000")
MLFLOW_DB_PW = os.environ.get("MLFLOW_DB_PW")

In [None]:
# Point both the fluent mlflow API and the explicit client at the same tracking server.
tracking_uri = f"http://{MLFLOW_DB_URI}"
mlflow.set_tracking_uri(tracking_uri)
client = MlflowClient(tracking_uri)

Get a sample of the training and validation datasets.

In [None]:
# Connection to the `feature_store` Postgres database.
DATABASE_URI = (
    f"postgresql+psycopg2://postgres:{FEATURE_STORE_PW}@{FEATURE_STORE_URI}/feature_store"
)
engine = sa.create_engine(
    DATABASE_URI,
    executemany_mode='values',
    executemany_values_page_size=10000,
    executemany_batch_page_size=500,
)

# Reflect the existing feature table from the database rather than declaring its columns here.
metadata = sa.schema.MetaData(bind=engine)
table = sa.Table("naive_frequency_features", metadata, autoload=True)

Session = sessionmaker(bind=engine)

# Only the query object is built inside the session; the rows are fetched by
# pandas below through the engine. Sample: first 800 "train" rows for this featurize_id.
with Session() as session:
    train_criteria = sa.and_(
        table.c.featurize_id == FEATURIZE_ID,
        table.c.dataset_group == "train",
    )
    results = session.query(table).filter(train_criteria).limit(800)

train_debug_df = pd.read_sql(
    results.statement,
    con=engine,
    parse_dates=["added_datetime"],
)

In [None]:
# Same query as for the training sample, but for the "validation" group and
# limited to 200 rows. The statement is executed by pandas through the engine.
with Session() as session:
    validation_criteria = sa.and_(
        table.c.featurize_id == FEATURIZE_ID,
        table.c.dataset_group == "validation",
    )
    results = session.query(table).filter(validation_criteria).limit(200)

val_debug_df = pd.read_sql(
    results.statement,
    con=engine,
    parse_dates=["added_datetime"],
)

MLflow stores every logged parameter as a string, so this function is needed to manually convert those strings back to their true data types before passing them to sklearn models.

In [None]:
def convert_string_params(params):
    """Convert string-valued parameters into native Python values.

    Conversion rules, applied to each string value in order:

    1. ``"None"`` -> ``None``, ``"True"`` -> ``True``, ``"False"`` -> ``False``
    2. a string accepted by ``int()`` -> ``int``
    3. a string accepted by ``float()`` -> ``float``
    4. anything else is kept as the original string

    Parameters
    ----------
    params : dict
        Mapping of parameter name to value (typically all strings).

    Returns
    -------
    dict
        New mapping with converted values. Keys containing ``"file"`` are
        omitted because they describe the dataset files, not the model.
        Values that are not strings are passed through unchanged.
    """
    # Explicit lookup instead of bool(val): bool("False") is True because any
    # non-empty string is truthy.
    special_values = {"None": None, "True": True, "False": False}

    converted_params = {}
    for key, val in params.items():
        # any parameters that specify the files used in the dataset, ignore for model
        # purposes
        if "file" in key:
            continue

        # already a native value: nothing to parse (also avoids int() truncating floats)
        if not isinstance(val, str):
            converted_params[key] = val
            continue

        if val in special_values:
            converted_params[key] = special_values[val]
            continue

        # int is the more restrictive parse, so try it first and fall back to
        # float; if both fail the value is a genuine string and is kept as is
        try:
            converted_params[key] = int(val)
        except ValueError:
            try:
                converted_params[key] = float(val)
            except ValueError:
                converted_params[key] = val

    return converted_params


Set up the data for the experiment

In [None]:
experiment_name = "exercise_prediction_debug_2"
mlflow.set_experiment(experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = int(dict(experiment)["experiment_id"])

# Columns that are identifiers, bookkeeping or targets and are therefore
# dropped from the feature matrices.
non_feature_columns = [
    'naive_frequency_features_id',
    'featurize_id',
    'file',
    'dataset_group',
    'added_datetime',
    'window_size',
    't_index',
    'label',
    'label_group',
]

X_train = train_debug_df.drop(columns=non_feature_columns)
# not sure what the issue is, but sklearn was throwing a future warning about the
# feature names not all being strings, this fixes the issue...
X_train.columns = list(map(str, X_train.columns))
y_train = train_debug_df["label_group"]

X_val = val_debug_df.drop(columns=non_feature_columns)
X_val.columns = list(map(str, X_val.columns))
y_val = val_debug_df["label_group"]


In [None]:
# Distinct source files that contributed rows to the training sample.
train_debug_df["file"].unique()

First, here is a code snippet for a simple run of a series of classifiers.

In [None]:
mlflow.sklearn.autolog()

# Fit each candidate classifier with its defaults (fixed seed) in its own run
# and record the validation accuracy.
candidate_models = (RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier)

for model_class in candidate_models:
    with mlflow.start_run(run_name="basic-default-fits"):
        mlflow.set_tag("developer", "adam gifford")
        mlflow.set_tag("model", model_class.__name__)

        # record which source files the training sample came from
        training_files = train_debug_df.file.unique()
        mlflow.log_params({f"file{n}": file for n, file in enumerate(training_files)})

        mlmodel = model_class(random_state=42)
        mlmodel.fit(X_train, y_train)

        acc = mlmodel.score(X_val, y_val)
        mlflow.log_metric("accuracy", acc)

Here are the results of the runs, with GradientBoostingClassifier the clear winner.

In [None]:
# Up to five active runs of the experiment, highest validation accuracy first.
search_kwargs = dict(
    experiment_ids=experiment_id,
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.accuracy DESC"],
)
runs = client.search_runs(**search_kwargs)

for run in runs:
    print(f"run id: {run.info.run_id}, model: {run.data.tags['model']}, accuracy: {run.data.metrics['accuracy']:.4f}")

Now, we try `GradientBoostingClassifier` with a cross-validated grid search to try to improve the fit results. Here is a code snippet for that.

In [None]:
# Base estimator: fixed seed plus early stopping settings shared by every grid point.
gbc = GradientBoostingClassifier(random_state=42, n_iter_no_change=50, tol=1e-3)

# Grid of hyperparameters to search; the commented entries are candidates
# that are currently left out of the search.
parameters = dict(
    learning_rate=[0.01, 0.1, 0.25],
    n_estimators=[100, 200, 500],
    subsample=[0.8, 0.9, 1],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
    max_depth=[3, 4, 5],
    # min_weight_fraction_leaf=[0, 0.25, 0.5],
    # min_impurity_decrease=[0, 0.01, 0.1],
    # max_features=('sqrt', 'log2', None),
    # max_leaf_nodes=[None, 5, 10],
    # ccp_alpha=[0, 0.1, 1],
)
clf = GridSearchCV(estimator=gbc, param_grid=parameters, n_jobs=10)

with mlflow.start_run(run_name="basic-gridshearch-fit"):
    mlflow.set_tag("developer", "adam gifford")
    mlflow.set_tag("model", type(gbc).__name__)

    # record which source files the training sample came from
    training_files = train_debug_df.file.unique()
    mlflow.log_params({f"file{n}": file for n, file in enumerate(training_files)})

    clf.fit(X_train, y_train)
    acc = clf.score(X_val, y_val)
    mlflow.log_metric("accuracy", acc)

This `GridSearchCV` took 82 minutes to complete on only 800 rows of data, so fitting on the whole dataset with `GridSearchCV` will require a smarter approach. Also, the `run_name` and the other manually set tags and params do not seem to be applied to the 5 best models that are logged as child runs by the grid search; they only appear on the parent run with the best model...

In [None]:
# Child runs of the grid-search parent run.
# NOTE: the parent run id is hard-coded from a previous execution; a fresh run of
# the grid-search cell produces a different id, which must be substituted here.
search_kwargs = dict(
    experiment_ids=experiment_id,
    filter_string="tags.`mlflow.parentRunId` = '05b06f98c9d34145a78f2bd57cc09e6d'",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.accuracy DESC"],
)
runs = client.search_runs(**search_kwargs)

for run in runs:
    print(f"run id: {run.info.run_id}")

Manually update the 5 best runs with the desired params, tags and metrics, and log models.

In [None]:
# Autologging is switched off so that the manual logging below is the only
# thing written to the re-opened runs.
mlflow.sklearn.autolog(disable=True)

for run in runs:
    run_id = run.info.run_id
    # re-open the existing child run so the tags/params/metric/model attach to it
    with mlflow.start_run(run_id=run_id):
        mlflow.set_tag("developer", "adam gifford")
        mlflow.set_tag("model", "GradientBoostingClassifier")
        mlflow.log_params({
            f"file{n}": file for n, file in enumerate(train_debug_df.file.unique())
        })

        # rebuild the estimator from the run's logged (string) parameters and refit
        params = convert_string_params(run.data.params)
        clf = GradientBoostingClassifier(**params)
        clf.fit(X_train, y_train)

        # Log under "model" (was "artifacts") so that the `runs:/<run_id>/model`
        # URI used when registering models later in this notebook resolves to
        # this artifact.
        mlflow.sklearn.log_model(clf, artifact_path="model")

        acc = clf.score(X_val, y_val)
        mlflow.log_metric("accuracy", acc)

Now, we try using HyperOpt to improve the fit.

In [None]:
def objective(params):
    """Hyperopt objective: fit a GradientBoostingClassifier and score it on validation data.

    Each call is recorded as a nested MLflow run holding the tags, the
    parameters used (plus the training file names) and the validation accuracy.

    Parameters
    ----------
    params : dict
        Hyperparameters sampled from the search space. The dict is not
        modified; the fixed settings are merged into a copy.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``; the accuracy is negated
        because hyperopt minimizes the loss.
    """
    # Merge into new dicts instead of params.update(...), so the caller's dict
    # is not polluted with the fixed settings and the file entries.
    model_params = {
        **params,
        "random_state": 42,
        "n_iter_no_change": 50,
        "tol": 1e-3,
    }
    logged_params = {
        **model_params,
        **{f"file{n}": file for n, file in enumerate(train_debug_df.file.unique())},
    }

    with mlflow.start_run(run_name="basic-hyperopt-fit-child", nested=True):
        mlflow.set_tag("developer", "adam gifford")
        mlflow.set_tag("model", "GradientBoostingClassifier")

        clf = GradientBoostingClassifier(**model_params)
        mlflow.log_params(logged_params)
        clf.fit(X_train, y_train)
        acc = clf.score(X_val, y_val)
        mlflow.log_metric("accuracy", acc)

        # release the fitted model before the next evaluation
        del clf
        gc.collect()

    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
# Hyperparameter distributions sampled by hyperopt. Integer-valued parameters
# are drawn with quniform and cast with scope.int.
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 10, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    n_estimators=scope.int(hp.quniform('n_estimators', 100, 1000, 100)),
    subsample=hp.quniform('subsample', 0.5, 1, 0.05),
    min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 6, 1)),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 5, 1)),
)

# Parent run; every evaluation of `objective` is recorded as a nested child run.
with mlflow.start_run(run_name="basic-hyperopt-fit"):
    hyperopt_trials = Trials()
    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=20,
        trials=hyperopt_trials,
    )

Hyperopt is a LOT faster, and it achieved a slightly better accuracy, so I will go with this on the full dataset. Below I'm performing the same manual updates to the runs to log tags, params, metrics, and models.

In [None]:
# Five most accurate child runs of the hyperopt parent run.
# NOTE: the parent run id is hard-coded from a previous execution; a fresh run of
# the hyperopt cell produces a different id, which must be substituted here.
search_kwargs = dict(
    experiment_ids=experiment_id,
    filter_string="tags.`mlflow.parentRunId` = '543846fbd0df448a946e0a5b74aa01a1'",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.accuracy DESC"],
)
runs = client.search_runs(**search_kwargs)

for run in runs:
    print(f"run id: {run.info.run_id}")

In [None]:
for run in runs:
    run_id = run.info.run_id
    # re-open the existing child run so the tags/params/metric/model attach to it
    with mlflow.start_run(run_id=run_id):
        mlflow.set_tag("developer", "adam gifford")
        mlflow.set_tag("model", "GradientBoostingClassifier")
        mlflow.log_params({
            f"file{n}": file for n, file in enumerate(train_debug_df.file.unique())
        })

        # rebuild the estimator from the run's logged (string) parameters and refit
        params = convert_string_params(run.data.params)
        clf = GradientBoostingClassifier(**params)
        clf.fit(X_train, y_train)

        # Log under "model" (was "artifacts") so that the `runs:/<run_id>/model`
        # URI used when registering models later in this notebook resolves to
        # this artifact.
        mlflow.sklearn.log_model(clf, artifact_path="model")

        acc = clf.score(X_val, y_val)
        mlflow.log_metric("accuracy", acc)

Finally, I will template out code to register models, and transition models between stages.

In [None]:
# taking just the top 3 and then reversing the order, because I will model comparing newer 
# more accurate models with previous versions and transitioning
runs = client.search_runs(
    experiment_ids=experiment_id,
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=3,
    order_by=["metrics.accuracy DESC"],
)
runs = [run for run in runs]
runs = [runs[-2], runs[-1], runs[0]]

Take "first" model and register.

In [None]:
# The registered model shares its name with the experiment.
model_name = experiment_name

# Register the model artifact stored under "model" in the first run of the list.
first_model = runs[0]
run_id = first_model.info.run_id
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(name=model_name, model_uri=model_uri)

Now transition registered model to staging, then to production.

In [None]:
def transition_model_and_log(model_version, new_stage, archive_existing_versions=False):
    """Move a version of the registered model to a new stage and note it in the description.

    Uses the notebook-level ``client`` and ``model_name``.

    Parameters
    ----------
    model_version
        Version of the registered model to transition.
    new_stage
        Target stage name, passed to ``transition_model_version_stage``.
    archive_existing_versions : bool, default False
        Forwarded unchanged to ``transition_model_version_stage``.
    """
    transition_kwargs = dict(
        name=model_name,
        version=model_version,
        stage=new_stage,
        archive_existing_versions=archive_existing_versions,
    )
    client.transition_model_version_stage(**transition_kwargs)

    # overwrite the version description with a dated note about this transition
    date = datetime.today().date()
    description = f"The model version {model_version} was transitioned to {new_stage} on {date}"
    client.update_model_version(
        name=model_name,
        version=model_version,
        description=description,
    )

In [None]:
# Walk the first registered version through Staging and then Production.
# NOTE: assumes the registration above created version 1 of the model.
model_version = 1
new_stages = ["Staging", "Production"]
for stage in new_stages:
    transition_model_and_log(model_version=model_version, new_stage=stage)

In [None]:
# Accuracy of the run behind the current Production version; this is the
# baseline that new candidates have to beat.
latest_versions = client.get_latest_versions(name=model_name, stages=["Production"])
production_version = latest_versions[0]
latest_model_run = client.get_run(run_id=production_version.run_id)
previous_best_acc = latest_model_run.data.metrics["accuracy"]

In [None]:
# Register each remaining run as a new model version, stage it, and promote it
# to Production only if it beats the accuracy of the current Production model.
new_runs = runs[1:]
for run in new_runs:
    run_id = run.info.run_id
    new_acc = run.data.metrics["accuracy"]

    model_uri = f"runs:/{run_id}/model"
    # Take the version number from the ModelVersion returned by the registration
    # itself. get_latest_versions() returns the latest version per stage, so its
    # last element is not guaranteed to be the version that was just created.
    registered_version = mlflow.register_model(model_uri=model_uri, name=model_name)
    current_version = registered_version.version

    transition_model_and_log(current_version, "Staging", archive_existing_versions=False)

    if new_acc > previous_best_acc:
        # archive the version currently in Production when replacing it
        transition_model_and_log(current_version, "Production", archive_existing_versions=True)
        previous_best_acc = new_acc