In [1]:
import os
import urllib
import tarfile

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

import mlflow
import mlflow.sklearn

In [2]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()

In [3]:
housing = pd.read_csv(os.path.join(HOUSING_PATH, 'housing.csv'))

In [4]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [5]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [6]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self  # nothing else to do
        
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]

        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [7]:
num_pipeline = Pipeline([
    ("Imputer", SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ('std_scaler', StandardScaler()),
])

In [8]:
cat_attrs = ["ocean_proximity"]
num_attrs = list(set(housing.columns) - set(cat_attrs))
data_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs)
])

In [9]:
housing_prepared = data_pipeline.fit_transform(housing)

In [10]:
housing_test = strat_test_set.drop("median_house_value", axis=1)
housing_test_labels = strat_test_set["median_house_value"].copy()
housing_test_prepared = data_pipeline.transform(housing_test)

In [11]:
remote_server_uri = "http://0.0.0.0:5000"
mlflow.set_tracking_uri(remote_server_uri)

In [12]:
exp_name = "housing_price"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='mlruns/2', experiment_id='2', lifecycle_stage='active', name='housing_price', tags={}>

In [18]:
def calculate_metrics(y_true, y_hat):
    mape = mean_absolute_percentage_error(y_true, y_hat)
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)
    return mape, rmse, r2

In [21]:
def spawn_run(model, run_name, params):
    with mlflow.start_run(run_name=run_name, nested=True):
        model.fit(housing_prepared, housing_labels)
        housing_test_predictions = model.predict(housing_test_prepared)
        mape, rmse, r2 = calculate_metrics(housing_test_labels, housing_test_predictions)
        
        print(f'MAPE: {mape}')
        print(f'RMSE: {rmse}')
        print(f'R2: {r2}')

        mlflow.log_params(params)
        mlflow.log_metrics({"MAPE": mape, "RMSE": rmse, "R2": r2})
        mlflow.log_artifact(os.path.join(HOUSING_PATH, 'housing.csv'))
        mlflow.sklearn.log_model(model, run_name)
        mlflow.sklearn.log_model(data_pipeline, "Data pipeline")

In [22]:
with mlflow.start_run(run_name="housing_price"):
    svr = SVR(kernel='rbf', gamma=0.1, C=1e5)
    spawn_run(svr, "SVR", {"kernel": "rbf", "gamma":0.1, "C":10000})

    rf = RandomForestRegressor(max_features=6, n_estimators=30)
    spawn_run(rf, "Random Forest", {"max_features": 6, "n_estimators": 30})

MAPE: 0.1954034400932563
RMSE: 53954.69452548624
R2: 0.7766381091219969
MAPE: 0.1822768866990455
RMSE: 47729.2018148898
R2: 0.8252090492418814
