![title](../assets/problem.png)

In [None]:
import json
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import plotly.express as px
from typing import Dict, List, Union, Any
import warnings

# Silence FutureWarning messages in the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): this blanket filter ignores every warning category, which makes
# the line above redundant and can hide useful library warnings — confirm it is intended.
warnings.filterwarnings('ignore')

In [None]:
# Widen the pandas display limits so wide/long frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 5000),
    ('max_colwidth', 5000),
):
    pd.set_option(option_name, option_value)

In [None]:
import os

# Root folder for every artefact read or written by this notebook.
# The default is the original author's local folder; set the MLOPS_BASE_PATH
# environment variable to run the notebook on another machine without
# editing this cell.
BASE_PATH = os.environ.get(
    "MLOPS_BASE_PATH", "/Users/seanariel/Desktop/la-maniee/data/mlops"
)

# Input datasets and intermediate feature tables.
PATH_TO_SYNTHETIC_DATA = f"{BASE_PATH}/synthetic_data_contract.csv"
PATH_TO_EXPLODED_FEATURES = f"{BASE_PATH}/exploded_features.csv"
PATH_TO_FEATURE_STORE = f"{BASE_PATH}/feature_store.csv"

# Outputs of the model development section.
PATH_TO_DEV_TRAINING_DATA = f"{BASE_PATH}/dev_training.csv"
PATH_TO_DEV_TESTING_DATA = f"{BASE_PATH}/dev_testing.csv"
PATH_TO_AUTOML_TRAINING_DATA = f"{BASE_PATH}/automl_training.csv"
PATH_TO_PRECISION_RECALL = f"{BASE_PATH}/precision_recall.csv"

# Serialized models.
PATH_TO_OPTIMAL_MODEL = f"{BASE_PATH}/optimal_model.pickle"
PATH_TO_PRODUCTION_MODEL = f"{BASE_PATH}/production_model.pickle"

# Remaining datasets (not used in the cells shown in this section).
PATH_TO_TRAINING_DATA = f"{BASE_PATH}/training.csv"
PATH_TO_EXPERIMENTATION_DATA = f"{BASE_PATH}/experimentation.csv"

# Table of Contents:
* [Overview](#first-bullet)
* [Feature Engineering](#second-bullet)
* [Model Development](#third-bullet)
* [Model Training](#fourth-bullet)
* [Model Serving](#fifth-bullet)
* [Model Experimentation](#sixth-bullet)

# Model Development <a class="anchor" id="third-bullet"></a>

In [None]:
import pickle
from typing import Dict, List, Union
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, roc_auc_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split

### Import feature store

In [None]:
# Cap on the number of rows read from the feature store.
SAMPLE = 10000
# `nrows` reads only the first SAMPLE rows of the file (not a random sample).
feature_store = pd.read_csv(PATH_TO_FEATURE_STORE, nrows=SAMPLE)

In [None]:
# Preview the first rows of the loaded feature store.
feature_store.head()

In [None]:
# Reproducibility seed and default decision threshold for the classifiers.
RANDOM_STATE = 42
BASE_THRESHOLD = 0.5

# Column to predict and the segment columns set aside for error analysis.
TARGET = "p1_has_won"
SEGMENTS = ["reward", "contract"]

# Every feature-store column except the target (segments are still included here).
COVARIATES = [column for column in feature_store.columns if column != TARGET]

### Split the training and validation sets

In [None]:
"""
Make sure to set a holdout frame on the side.
"""
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
(
    covariates_training,
    covariates_testing,
    target_training,
    target_testing,
) = train_test_split(  # split the dataset between training and testing sets
    feature_store[COVARIATES],
    feature_store[TARGET],
    test_size=0.25,  # keep 25% of samples in the test set
    random_state=RANDOM_STATE,
)

In [None]:
# Sanity-check the split: print the shape of each of the four pieces in order.
for split_piece in (
    covariates_training,
    covariates_testing,
    target_training,
    target_testing,
):
    print(split_piece.shape)

### Preserve the segments and ids 

In [None]:
# Keep a copy of the segment columns for later error analysis ...
segment_covariates_training, segment_covariates_testing = (
    covariates_training[SEGMENTS],
    covariates_testing[SEGMENTS],
)

# ... then remove them from the frames fed to the models.
covariates_training = covariates_training.drop(columns=SEGMENTS)
covariates_testing = covariates_testing.drop(columns=SEGMENTS)

In [None]:
# Model inputs: every feature-store column that is neither the target nor a segment.
COVARIATES = [
    column
    for column in feature_store.columns
    if column != TARGET and column not in SEGMENTS
]

In [None]:
# Show the final list of model input columns.
print(COVARIATES)

### Get a first feeling of Bias vs Variance

In [None]:

# Hyperparameters shared by every candidate model in the comparison below.
base_hypers = dict(n_estimators=5000)

def generate_scoring(model, training_cov, training_tar, testing_cov, testing_tar) -> Dict[str, float]:
    """Fit ``model`` on the training split and score it on the testing split.

    The positive-class probability is turned into a 0/1 label with
    ``BASE_THRESHOLD`` before the metrics are computed.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``; it is fitted
            in place.
        training_cov: Training covariates; must contain the ``COVARIATES``
            columns (presumably a pandas DataFrame — ``.values`` is used).
        training_tar: Training target, exposing ``.values``.
        testing_cov: Testing covariates, same columns as ``training_cov``.
        testing_tar: True labels for the testing rows.

    Returns:
        A dict with the keys ``"accuracy_score"``, ``"recall_score"`` and
        ``"precision_score"``.
    """
    model.fit(  # train the model on the training set
        training_cov[COVARIATES].values, training_tar.values.ravel()
    )
    # Predict on a bare array too, so the input type matches what the model
    # was fitted on (avoids scikit-learn's feature-name mismatch warning).
    # Column 1 of predict_proba is the probability of the positive class.
    probabilities: np.ndarray = model.predict_proba(  # predict the new samples with the trained model
        testing_cov[COVARIATES].values
    )[:, 1]
    predictions: np.ndarray = np.where(probabilities > BASE_THRESHOLD, 1, 0)
    return {
        "accuracy_score": accuracy_score(testing_tar, predictions),  # generate the accuracy score
        "recall_score": recall_score(testing_tar, predictions),  # generate the recall score
        "precision_score": precision_score(testing_tar, predictions),  # generate the precision score
    }

# Tree-shape constraints shared by the gradient boosting and random forest candidates.
tree_hypers = {"max_depth": 5, "min_samples_split": 5, "min_samples_leaf": 5}

# Candidate models keyed by display name; each one starts from `base_hypers`.
pipeline = {
    "BaggingClassifier": BaggingClassifier(
        **{**base_hypers, "n_jobs": -1}
    ),
    "GradientBoostingClassifier": GradientBoostingClassifier(
        **{**base_hypers, **tree_hypers}
    ),
    "RandomForestClassifier": RandomForestClassifier(
        **{**base_hypers, **tree_hypers, "n_jobs": -1}
    ),
}


class ModelEvaluation:
    """Bundle a model with its display name and its evaluation metrics.

    Attributes:
        name: Display name of the model.
        model: The model object that was evaluated.
        metrics: Mapping from metric name to value; the properties below read
            the ``"accuracy_score"``, ``"recall_score"`` and
            ``"precision_score"`` keys.
    """

    def __init__(self, name, model, metrics):
        self.name: str = name
        self.model: Any = model
        self.metrics: Dict[str, float] = metrics

    @property
    def accuracy_score(self):
        """Value stored under ``"accuracy_score"``; raises KeyError if absent."""
        return self.metrics["accuracy_score"]

    @property
    def recall_score(self):
        """Value stored under ``"recall_score"``; raises KeyError if absent."""
        return self.metrics["recall_score"]

    @property
    def precision_score(self):
        """Value stored under ``"precision_score"``; raises KeyError if absent."""
        return self.metrics["precision_score"]


# Fit and score every candidate model, keeping one ModelEvaluation per model.
metric_accumulators = []
for name, model in pipeline.items():
    metrics = generate_scoring(  # generate the scoring
        model, covariates_training, target_training, covariates_testing, target_testing
    )
    metric_accumulators.append(
        ModelEvaluation(name, model, metrics)  # instantiate a model evaluation object
    )
    print(name, " - \n", metrics)

### Train the optimal hyperparameters

In [None]:
base_model = RandomForestClassifier()  # instantiate a new random forest classifier

# Search space: a single candidate per hyperparameter, so the search below
# evaluates exactly one configuration.
hyperparameters_grid = {
    "n_estimators": [5000],
    "max_depth": [5],
    "min_samples_split": [5],
    "min_samples_leaf": [5],
}
random_search = RandomizedSearchCV(  # instantiate a randomised grid search cv object
    base_model,
    param_distributions=hyperparameters_grid,
    n_iter=1,
    scoring=None,  # fall back to the estimator's own `score` method
    n_jobs=-1,
    cv=2,
    verbose=1,
    refit=False,  # only the best parameters are needed; the model is retrained later
)
random_search.fit(  # train the randomised grid search object
    # The target is passed as a 1-D array (like the other fits in this
    # notebook); a column vector triggers scikit-learn's DataConversionWarning.
    covariates_training[COVARIATES], target_training.values.ravel()
)
optimal_hyper_parameters, cv_results = random_search.best_params_, random_search.cv_results_

### Train the optimal model on the whole training set

In [None]:
# NOTE: this hard-coded dict replaces whatever the randomized search cell
# stored in `optimal_hyper_parameters`, so this cell can run on its own.
optimal_hyper_parameters = {
    "n_estimators": 5000,
    "max_depth": 5,
    "min_samples_split": 5,
    "min_samples_leaf": 5
}
base_model = RandomForestClassifier(
    verbose=1,
    n_jobs=-1,
)
# `set_params` updates the estimator in place and returns it, so
# `optimal_model` and `base_model` refer to the same object.
optimal_model = base_model.set_params(**optimal_hyper_parameters)  # set the optimal HP to the base model
optimal_model.fit(
    covariates_training[COVARIATES].values, target_training.values.ravel()
)

# Persist the fitted model for later use.
with open(PATH_TO_OPTIMAL_MODEL, 'wb') as handle:
    pickle.dump(optimal_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the training covariates (the target column is not part of this frame).
covariates_training.to_csv(PATH_TO_DEV_TRAINING_DATA, index=False)

### Generate the predictions

In [None]:
# The model was fitted on a bare array, so predict on one as well to avoid
# scikit-learn's feature-name mismatch warning. Column 1 of predict_proba is
# the probability of the positive class.
predictions = optimal_model.predict_proba(covariates_testing[COVARIATES].values)[:, 1]
covariates_testing["predictions"] = predictions
covariates_testing["target"] = target_testing
# Vectorized thresholding: 1 when the probability is strictly above BASE_THRESHOLD.
covariates_testing["predicted"] = np.where(predictions > BASE_THRESHOLD, 1, 0)

In [None]:
# Preview the testing frame with the added prediction columns.
covariates_testing.head()

### Generate the precision and recall metrics

In [None]:
precision_arr, recall_arr, threshold_arr = precision_recall_curve(
    target_testing, predictions
)
# precision_recall_curve returns one more precision/recall value than
# thresholds: element i belongs to thresholds[i], and the final element is a
# sentinel (precision 1, recall 0) with no threshold. Dropping the LAST
# element keeps each threshold aligned with its own precision and recall.
metrics_df = pd.DataFrame(
    {
        "precision": precision_arr[:-1],
        "recall": recall_arr[:-1],
        "threshold": threshold_arr,
    }
)
# Bucket thresholds to two decimals and keep the first row of each bucket.
metrics_df["threshold"] = metrics_df["threshold"].round(2)
metrics_df = metrics_df.drop_duplicates(subset=["threshold"], keep="first")
metrics_df.to_csv(PATH_TO_PRECISION_RECALL, index=False)

In [None]:
# Preview the precision/recall/threshold table.
metrics_df.head()

In [None]:
# Headline metrics: ROC AUC on the probabilities, accuracy on the thresholded labels.
print("ROC_AUC", roc_auc_score(target_testing, predictions))
print("Accuracy", accuracy_score(target_testing, covariates_testing["predicted"]))

# Precision-recall curve built from the thresholds table.
fig, ax = plt.subplots()
ax.plot(
    metrics_df["recall"].values,
    metrics_df["precision"].values,
    color='purple',
)
ax.set(title='Precision-Recall Curve', ylabel='Precision', xlabel='Recall')
plt.show()

### Interpret the model

In [None]:
# Horizontal bar chart of the 15 largest feature importances (ascending order).
importances = optimal_model.feature_importances_
indices = np.argsort(importances)[-15:]
bar_positions = range(len(indices))
bar_labels = [COVARIATES[i] for i in indices]

plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, bar_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.show()

### Study the errors

In [None]:
# Precision and recall at the row whose rounded threshold equals BASE_THRESHOLD.
metrics_df[metrics_df.threshold == BASE_THRESHOLD]

In [None]:
# First row of the table whose precision is above 0.9.
metrics_df[metrics_df.precision > 0.9].head(1)

In [None]:
# Stricter decision threshold used for the "business" prediction column.
BUSINESS_THRESHOLD = 0.65
# Backward-compatible alias for the original, misspelled constant name.
BUSINESS_THRESDHOLD = BUSINESS_THRESHOLD

# Vectorized thresholding: 1 when the probability is strictly above the threshold.
covariates_testing["business_predicted"] = (
    covariates_testing["predictions"] > BUSINESS_THRESHOLD
).astype(int)

# Re-attach the segment columns that were set aside before training. Only
# columns that are not already present are added, so re-running this cell
# does not create duplicate columns.
missing_segments = [
    column
    for column in segment_covariates_testing.columns
    if column not in covariates_testing.columns
]
if missing_segments:
    covariates_testing = pd.concat(
        [covariates_testing, segment_covariates_testing[missing_segments]], axis=1
    )
covariates_testing.to_csv(PATH_TO_DEV_TESTING_DATA, index=False)

In [None]:
# Rows where the thresholded prediction disagrees with the true label.
is_misclassified = covariates_testing["target"] != covariates_testing["predicted"]
errors = covariates_testing[is_misclassified]

In [None]:
# Summary statistics over the whole testing frame (baseline for the error comparison).
covariates_testing.describe()

In [None]:
# Relative frequency of each `contract` value across the whole testing frame.
covariates_testing.contract.value_counts(normalize=True)

In [None]:
# Summary statistics restricted to the misclassified rows.
errors.describe()

In [None]:
# Relative frequency of each `contract` value among the misclassified rows.
errors.contract.value_counts(normalize=True)

### [Optional] Assignment 3 - Google Cloud Engine Lab

Let's head over to Google Cloud Engine to set up a larger VM.
This will allow us to run larger development pipelines such as:

- Large scale grid search (Bayesian, Random or Grid)
- Tuning over subset of models (Boosters, Trees and NN)

First, make sure to set up your <a> Google Cloud Storage </a> bucket, which we will use throughout this course.

Then, follow this lab to set up your <a> Google Cloud VM </a> server and launch a first E2E model development run.

### [Optional] Assignment 4 - Google AutoML (Vertex) Lab

Let's head over to <a> Google Cloud Vertex AI </a> to launch automated training pipelines at scale.

We will essentially replicate the workflow we have set up here - but most of it will be abstracted from us through this complete, no-code solution.

In [None]:
# Reload the feature store without a row limit and export the model inputs
# plus the target as the AutoML training file.
automl_columns = COVARIATES + [TARGET]
feature_store = pd.read_csv(PATH_TO_FEATURE_STORE)
automl_frame = feature_store[automl_columns]
automl_frame.to_csv(PATH_TO_AUTOML_TRAINING_DATA, index=False)

#### Credit

Note:
This content has been developed by Sean Ariel for educational purposes. 
It is a practical training that cannot be copied, reproduced, or distributed without the explicit consent of the author. © Sean Ariel