![title](../assets/problem.png)

In [1]:
import json
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import plotly.express as px
from typing import Dict, List, Union, Any
import warnings

# Keep notebook output quiet: first ignore FutureWarning explicitly, then
# install a blanket "ignore" filter. The blanket filter matches every warning
# category, so it also covers the FutureWarning case registered just before.
warnings.simplefilter('ignore', FutureWarning)
warnings.filterwarnings(action='ignore')

In [3]:
# Raise pandas' display limits (rows, columns, total width, per-column width)
# in a single call; arguments are read as consecutive (option, value) pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 5000,
    'max_colwidth', 5000,
)

In [2]:
import os

# Root directory that holds every CSV / pickle artifact referenced below.
# It defaults to the original author's local checkout. Set the
# BELOTE_DATA_PATH environment variable to point the notebook at another
# location without editing this cell (an empty value falls back to the default).
BASE_PATH = (
    os.environ.get("BELOTE_DATA_PATH")
    or "/Users/zhaocong/Desktop/belote-mlops-course/data"
)

# File locations, all derived from BASE_PATH.
PATH_TO_SYNTHETIC_DATA = f"{BASE_PATH}/synthetic_data_contract.csv"
PATH_TO_EXPLODED_FEATURES = f"{BASE_PATH}/exploded_features.csv"
PATH_TO_FEATURE_STORE = f"{BASE_PATH}/feature_store.csv"
PATH_TO_DEV_TRAINING_DATA = f"{BASE_PATH}/dev_training.csv"
PATH_TO_DEV_TESTING_DATA = f"{BASE_PATH}/dev_testing.csv"
PATH_TO_AUTOML_TRAINING_DATA = f"{BASE_PATH}/automl_training.csv"
PATH_TO_PRECISION_RECALL = f"{BASE_PATH}/precision_recall.csv"
PATH_TO_OPTIMAL_MODEL = f"{BASE_PATH}/optimal_model.pickle"
PATH_TO_PRODUCTION_MODEL = f"{BASE_PATH}/production_model.pickle"
PATH_TO_TRAINING_DATA = f"{BASE_PATH}/training.csv"
PATH_TO_EXPERIMENTATION_DATA = f"{BASE_PATH}/experimentation.csv"

# Table of Contents:
* [Overview](#first-bullet)
* [Feature Engineering](#second-bullet)
* [Model Development](#third-bullet)
* [Model Training](#fourth-bullet)
* [Model Serving](#fifth-bullet)
* [Model Experimentation](#sixth-bullet)

# Model Training <a class="anchor" id="fourth-bullet"></a>

In [4]:
import pickle
import pandas as pd

In [7]:
# Cap on the number of rows read from the feature store CSV (ten billion).
# NOTE(review): presumably chosen large enough to load the whole file; lower
# it to work on a subset.
SAMPLE = 10_000_000_000
feature_store = pd.read_csv(
    filepath_or_buffer=PATH_TO_FEATURE_STORE,
    nrows=SAMPLE,
)

In [None]:
# Restore the object pickled at PATH_TO_OPTIMAL_MODEL.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(PATH_TO_OPTIMAL_MODEL, mode='rb') as model_file:
    optimal_model = pickle.Unpickler(model_file).load()

In [None]:
# Column roles within the feature store.
POTENTIAL_TARGETS = ["reward", "p1_has_won"]
TARGET = "p1_has_won"  # the column the model is fitted against
SEGMENTS = ["reward", "contract"]

# Every feature-store column that is neither a potential target nor a
# segment is a covariate; the original column order is preserved.
COVARIATES = [
    column
    for column in feature_store.columns
    if column not in (POTENTIAL_TARGETS + SEGMENTS)
]

In [None]:
# Training frame: the covariate columns followed by the target column.
training_df = feature_store[[*COVARIATES, TARGET]]

In [None]:
# Fit the model on all covariates; the target is handed over as a
# single-column 2-D array (one row per training example).
optimal_model.fit(
    training_df.loc[:, COVARIATES],
    training_df[TARGET].values.reshape((-1, 1)),
)

# Persist the frame the model was fitted on, without the index column.
training_df.to_csv(PATH_TO_TRAINING_DATA, index=False)

In [None]:
# Serialize the fitted model to PATH_TO_PRODUCTION_MODEL using the highest
# pickle protocol available in the running interpreter.
with open(PATH_TO_PRODUCTION_MODEL, mode='wb') as model_file:
    pickle.Pickler(model_file, protocol=pickle.HIGHEST_PROTOCOL).dump(optimal_model)

### [Optional] Assignment 5 - MLFlow Lab

Let's head over to <a> MLFLOW </a> to manage the full lifecycle of our ML Solution and handle its key artifacts.

This will enable collaboration between different Data Scientists, and it will ensure we implement some of the recommended MLOps best practices, such as thorough governance of the system.



### [Optional]  Assignment 6 - SPARK

There's only so much we can do with Vertical Scaling. Running this training code on our GVM is already much more powerful than training it locally, as this remote server's cores and memory are significantly larger. But we won't be able to train our model on a much larger sample within a reasonable amount of time.

Time to shift our mindset and step into the world of Distributed Computing (through Horizontal Scaling). We can achieve true scale here, and see what Big Data Processing really means!

Let's head over to <a> Spark on Dataproc </a>, one of the most promising technologies developed over the last decade. Its computational engine and large-scale models should allow us to build quickly on top of our Feature Store.

#### Credit

Note:
This content has been developed by Sean Ariel for educational purposes. 
It is a practical training that cannot be copied, reproduced, or distributed without the explicit consent of the author. © Sean Ariel