[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/arangoml/arangopipe/blob/mlspec_ap_example/examples/MLSpec_AP_Example.ipynb)


In [None]:
# Install the packages this notebook imports; arangopipe and PyYAML are pinned
# to exact versions.
!pip install python-arango
!pip install arangopipe==0.0.6.9.3
# NOTE(review): `sklearn2` is an unusual distribution name, while later cells
# import `sklearn` -- confirm this is the package that is meant to be installed.
!pip install pandas PyYAML==5.1.1 sklearn2
!pip install jsonpickle
# Clone mlspec-lib from its master branch and install its requirements from
# inside the checkout.
!git clone -b master https://github.com/mlspec/mlspec-lib.git
import os
# The process working directory becomes 'mlspec-lib/' from here on, so relative
# paths used in later cells resolve inside the checkout.
os.chdir('mlspec-lib/')
!pip install -r requirements.txt

In [None]:
import sys
# Add the parent of the current working directory to the import search path.
# NOTE(review): an earlier cell chdir'd into 'mlspec-lib/', so '..' presumably
# refers to the directory the notebook started in -- confirm this entry is needed.
sys.path.append('..')

import yaml

from mlspeclib.mlobject import MLObject
from mlspeclib.mlschemaenums import MLSchemaTypes
from mlspeclib.mlschema import MLSchema
import os
import uuid
import datetime
from pathlib import Path

In [None]:
from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe
from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin
from arangopipe.arangopipe_storage.arangopipe_config import ArangoPipeConfig
from arangopipe.arangopipe_storage.managed_service_conn_parameters import ManagedServiceConnParam
# Connection parameters for the ArangoDB service, keyed by the constants
# exposed on ManagedServiceConnParam.
msc = ManagedServiceConnParam()
conn_params = {
    msc.DB_SERVICE_HOST: "arangoml.arangodb.cloud",
    msc.DB_SERVICE_END_POINT: "createDB",
    msc.DB_SERVICE_NAME: "createDB",
    msc.DB_SERVICE_PORT: 8529,
    msc.DB_CONN_PROTOCOL: 'https',
}

# Turn the raw parameters into a connection config and open a fresh
# (non-reused) admin connection with it.
mdb_config = ArangoPipeConfig().create_connection_config(conn_params)
admin = ArangoPipeAdmin(reuse_connection=False, config=mdb_config)

# The ArangoPipe client is built from the config the admin object reports back.
ap_config = admin.get_config()
ap = ArangoPipe(config=ap_config)

# Register the project that later model registrations and run logs refer to by name.
proj_info = {"name": "Housing_Price_Estimation_Project"}
proj_reg = admin.register_project(proj_info)

In [None]:
import pandas as pd

# Raw California housing CSV hosted in the arangopipe examples branch.
data_url = "https://raw.githubusercontent.com/arangoml/arangopipe/arangopipe_examples/examples/data/cal_housing.csv"

# Malformed rows are dropped with a warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`. pandas is installed unpinned, so try the
# current keyword first and fall back for older releases that reject it.
try:
    df = pd.read_csv(data_url, on_bad_lines="warn")
except TypeError:
    # pandas < 1.3: `on_bad_lines` is an unknown keyword there.
    df = pd.read_csv(data_url, error_bad_lines=False)

In [None]:
# A single random UUID identifies this run across every record created below.
run_id = uuid.uuid4()

# Per-run scratch directory (tmp_dir/<run_id>, relative to the current working
# directory); created together with any missing parents, and tolerated if it
# already exists.
working_dir = Path('.').joinpath('tmp_dir', str(run_id))
working_dir.mkdir(parents=True, exist_ok=True)
print(working_dir)

In [None]:
# Describe the raw dataset's location as an MLSpec DATAPATH object
# (schema version '0.0.1').
datapath_object = MLObject()
datapath_object.set_type('0.0.1', MLSchemaTypes.DATAPATH)
datapath_object.data_store = 'Git Repo for Arangopipe'
datapath_object.storage_connection_type = 'CUSTOM'
# The endpoint is the same URL the CSV was loaded from.
datapath_object.connection.endpoint = data_url
# run_id ties this record to the current run; step_id is a fresh UUID for this step.
datapath_object.run_id = str(run_id)
datapath_object.step_id = str(uuid.uuid4())
datapath_object.run_date = str(datetime.datetime.now())
# save() is handed the run's working directory and returns a (response, errors)
# pair; the errors are printed so problems are visible in the cell output.
response, errors = datapath_object.save(working_dir)
print(errors)

In [None]:
# Show the DATAPATH object's JSON form as the cell output.
datapath_object.to_json()

In [None]:
# Preview the first rows of the loaded dataset as the cell output.
df.head()

In [None]:
# Dataset metadata for ArangoPipe. The name carries a random UUID suffix, and
# the MLSpec DATAPATH object is embedded in its JSON form.
ds_info = {
    "name": "california-housing-dataset-ml-spec-mapped- " + str(uuid.uuid4()),
    "description": "This dataset lists median house prices in Califoria. Various house features are provided",
    "source": "UCI ML Repository",
    'ML_Spec_Representation': datapath_object.to_json(),
}
ds_reg = ap.register_dataset(ds_info)

In [None]:
import numpy as np

# Log-transform the target column in place. Calling the ufunc on the whole
# Series is vectorized, unlike a per-element `apply` with a Python lambda.
df["medianHouseValue"] = np.log(df["medianHouseValue"])

# The featureset description maps each column name to its dtype rendered as a
# string, plus a "name" entry with a random UUID suffix.
featureset = {column: str(dtype) for column, dtype in df.dtypes.to_dict().items()}
featureset["name"] = "log_transformed_median_house_value-" + str(uuid.uuid4())


In [None]:
# Build a second MLSpec DATAPATH object (schema version '0.0.1'), this time
# describing the featureset; it rebinds `datapath_object`.
# NOTE(review): in the visible cells this object is saved but, unlike the
# dataset's DATAPATH object, is not passed along when the featureset is
# registered -- confirm whether that is intended.
datapath_object = MLObject()
datapath_object.set_type('0.0.1', MLSchemaTypes.DATAPATH)
datapath_object.data_store = 'Featureset for California Housing with log transformed house values'
datapath_object.storage_connection_type = 'CUSTOM'
# The endpoint still points at the URL of the raw CSV.
datapath_object.connection.endpoint = data_url
# Same run_id as the other records of this run; step_id is a fresh UUID.
datapath_object.run_id = str(run_id)
datapath_object.step_id = str(uuid.uuid4())
datapath_object.run_date = str(datetime.datetime.now())
# save() is handed the run's working directory and returns a (response, errors)
# pair; the errors are printed so problems are visible in the cell output.
response, errors = datapath_object.save(working_dir)
print(errors)

In [None]:
# Register the featureset with ArangoPipe, passing the "_key" of the dataset
# registration so the two records are associated.
fs_reg = ap.register_featureset(featureset, ds_reg["_key"]) # note that the dataset and featureset are linked here.

In [None]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the target, 'medianHouseValue'.
preds = list(df.columns)
preds.remove('medianHouseValue')

# Plain NumPy arrays for scikit-learn.
X = df[preds].to_numpy()
Y = df['medianHouseValue'].to_numpy()

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Fit a Lasso regression (alpha=0.001) on the training split.
clf = linear_model.Lasso(alpha=0.001)
clf.fit(X_train, y_train)

# Predict on both splits so training and held-out error can be compared.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

# mean_squared_error is documented as (y_true, y_pred); pass the arguments in
# that order. The value is the same either way because MSE is symmetric.
train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

In [None]:
import uuid
import datetime
import jsonpickle
# Encode the estimator's parameters with jsonpickle and pair the encoded
# string with the id of the current run.
mp = jsonpickle.encode(clf.get_params())
model_params = {
    'run_id': str(run_id),
    'model_params': mp,
}

In [None]:
# Describe the trained model as an MLSpec MODEL object (schema version '0.0.1').
model_object = MLObject()
model_object.set_type('0.0.1', MLSchemaTypes.MODEL)
# Same run_id as the other records of this run; step_id is a fresh UUID.
model_object.run_id = str(run_id)
model_object.step_id = str(uuid.uuid4())
model_object.run_date = str(datetime.datetime.now())
# created_by is filled with a random UUID rather than a user identity.
model_object.created_by = str(uuid.uuid4())
model_object.version = "0.0.1"
model_object.time_created = str(datetime.datetime.now())
model_object.description = "Baseline model for House Price Regression"
model_object.name = "California Housing Regression Model-" + str(uuid.uuid4())
# `mp` is the jsonpickle-encoded estimator parameters from the previous cell.
model_object.references = mp
# save() is handed the run's working directory and returns a (response, errors)
# pair; the errors are printed so problems are visible in the cell output.
response, errors = model_object.save(working_dir)
print(errors)

In [None]:
# Record the training outcome as an MLSpec TRAIN_RESULTS object
# (schema version '0.0.1').
metrics_object = MLObject()
metrics_object.set_type('0.0.1', MLSchemaTypes.TRAIN_RESULTS)
# NOTE(review): `accuracy` is filled with the training MSE and `loss` with the
# fitted Lasso's `dual_gap_` attribute -- confirm these are the intended
# mappings onto the schema fields.
metrics_object.accuracy = train_mse
metrics_object.global_step = 1
metrics_object.loss = clf.dual_gap_
# Same run_id as the other records of this run; step_id is a fresh UUID.
metrics_object.run_id = str(run_id)
metrics_object.step_id = str(uuid.uuid4())
metrics_object.run_date = str(datetime.datetime.now())
#metrics_object.training_execution_id = str(uuid.uuid4())
# save() is handed the run's working directory and returns a (response, errors)
# pair; the errors are printed so problems are visible in the cell output.
response, errors = metrics_object.save(working_dir)
print(errors)

In [None]:
# Model metadata for ArangoPipe: a name with a random UUID suffix, the task
# type, and the MLSpec MODEL object in its JSON form.
model_info = {
    "name": "Lasso Model for Housing Dataset" + str(uuid.uuid4()),
    "task": "Regression",
    "ML_Spec_Model_Representation": model_object.to_json(),
}
# Register the model under the project that was registered earlier.
model_reg = ap.register_model(
    model_info,
    project="Housing_Price_Estimation_Project",
)

In [None]:
# Performance record for this run: both MSE values, the run id, a timestamp,
# and the MLSpec TRAIN_RESULTS object in its JSON form.
model_perf = {
    'training_mse': train_mse,
    'test_mse': test_mse,
    'run_id': str(run_id),
    "timestamp": str(datetime.datetime.now()),
    'ML_Spec_Metrics_Representation': metrics_object.to_json(),
}

In [None]:


# ruuid = str(uuid.uuid4().int)
# model_perf = {'training_mse': train_mse, 'test_mse': test_mse, 'run_id': ruuid, "timestamp": str(datetime.datetime.now())}

# mp = clf.get_params()
# mp = jsonpickle.encode(mp)
# model_params = {'run_id': ruuid, 'model_params': mp}

# Gather the keys of the registered dataset, featureset and model together
# with the run's parameters and performance record, then log the run.
run_info = {
    "dataset": ds_reg["_key"],
    "featureset": fs_reg["_key"],
    "run_id": str(run_id),
    "model": model_reg["_key"],
    "model-params": model_params,
    "model-perf": model_perf,
    "tag": "Housing_Price_Estimation_Project",
    "project": "Housing_Price_Estimation_Project",
}
ap.log_run(run_info)