In [1]:
import os
from datetime import datetime, timezone

# Export the object-store credentials through the standard AWS environment variables.
# NOTE(review): these look like stock local-development credentials — confirm this
# notebook is never pointed at a shared or production store with them hardcoded.
os.environ['AWS_ACCESS_KEY_ID']='minioadmin'
os.environ['AWS_SECRET_ACCESS_KEY']='minioadmin'

# Per-run identifier derived from the current UTC time, down to microseconds.
# datetime.now(timezone.utc) replaces the deprecated datetime.utcnow(); the format
# has no UTC-offset directive, so the generated text is unchanged.
# NOTE(review): field order is year_day_month (%y_%d_%m), so IDs do not sort
# chronologically — confirm that is intended before relying on lexical ordering.
model_id = datetime.now(timezone.utc).strftime("%y_%d_%m_%H_%M_%S_%f")

# S3 URIs always use '/' as the separator; os.path.join would insert a backslash
# on Windows and produce an invalid key prefix, so build the prefix explicitly.
model_dir = 's3://models/' + model_id

In [2]:
#Fetch the California housing dataset and upload it to S3 as a CSV file
from s3fs import S3FileSystem
from sklearn.datasets import fetch_california_housing

# as_frame=True makes the returned bunch expose the data as a DataFrame via `.frame`
housing_bunch = fetch_california_housing(download_if_missing=True, as_frame=True)
housing_df = housing_bunch.frame

# Filesystem handle bound to the S3-compatible endpoint on localhost:9000
fs = S3FileSystem(
    key='minioadmin',
    secret='minioadmin',
    client_kwargs={'endpoint_url': "http://localhost:9000/"},
)

# Serialise without the index column and write the encoded bytes in one go
with fs.open('s3://data/housing_df.csv', 'wb') as out_file:
    csv_text = housing_df.to_csv(index=False)
    out_file.write(csv_text.encode())

In [3]:
#Read the uploaded CSV back from S3 to confirm the round trip works
from s3fs import S3FileSystem
import pandas as pd

# Re-create the filesystem handle so this cell can run without the upload cell
fs = S3FileSystem(
    key='minioadmin',
    secret='minioadmin',
    client_kwargs={'endpoint_url': "http://localhost:9000/"},
)

# From here on housing_df is the copy parsed back from the stored CSV
with fs.open('s3://data/housing_df.csv', 'rb') as csv_file:
    housing_df = pd.read_csv(csv_file)

In [4]:
#Feature engineering: standardise the features and persist the fitted scaler
from sklearn.preprocessing import StandardScaler
from joblib import dump

# Split the frame into the feature matrix and the regression target
target = 'MedHouseVal'
X = housing_df.drop(target, axis=1)
y = housing_df[target]

scaler = StandardScaler()
# fit_transform hands back an array by default, so re-attach both the column names
# and the original row index. Without index=X.index the scaled frame is renumbered
# 0..n-1 and could silently misalign with y whenever housing_df is not range-indexed.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Per-feature means learned by the scaler, keyed by column name.
# Despite the name, metrics_df is a plain dict, not a DataFrame.
metrics_df = pd.Series(scaler.mean_, index=X.columns).to_dict()

#Save scaler for later monitoring and eval
# The stored file keeps the 'scalar.joblib' spelling so existing readers of that key still find it.
with fs.open(model_dir+'/scalar.joblib', 'wb') as f:
    dump([metrics_df, scaler], f)

In [5]:
#Fit a RidgeCV regressor on the scaled features and store it on S3
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_validate
import numpy as np
import pandas as pd
from joblib import dump

target = 'MedHouseVal'

# Candidate regularisation strengths: 30 log-spaced values from 1e-3 to 1e1
alpha_grid = np.logspace(-3, 1, num=30)
model = RidgeCV(alphas=alpha_grid)
reg = model.fit(X, y)

# Persist the fitted estimator under this run's model directory
model_path = model_dir+'/ridgecv.joblib'
with fs.open(model_path, 'wb') as model_file:
    dump(model, model_file)

In [6]:
#Score the data with the model reloaded from S3
from joblib import load

# Reload the estimator from storage so scoring exercises the persisted artifact
with fs.open(model_dir+'/ridgecv.joblib', 'rb') as f:
    loaded_model = load(f)

# Predict from the feature columns only. This cell appends 'pred' and the target
# to X in place, so on a re-run X already contains them; passing the whole frame
# would then hand the model extra columns (including the target itself).
# On a first run feature_cols is every column of X, in the original order.
feature_cols = [col for col in X.columns if col not in ('pred', target)]
X['pred'] = loaded_model.predict(X[feature_cols])
# Attach the true target next to the prediction for downstream comparison
X[target] = y


In [7]:
#Upload the scored frame (X as it stands, including any added columns) to S3 as CSV
from joblib import dump, load

# Serialise without the index column, then write the encoded bytes
with fs.open('s3://data/housing_pred.csv', 'wb') as out_file:
    pred_csv = X.to_csv(index=False)
    out_file.write(pred_csv.encode())