# ADS-508-01-SP23 Team 8: Final Project

# Train model

Much of the code is modified from `Fregly, C., & Barth, A. (2021). Data science on AWS: Implementing end-to-end, continuous AI and machine learning pipelines. O’Reilly.`

## Install missing dependencies

[PyAthena](https://pypi.org/project/PyAthena/) is a Python DB API 2.0 (PEP 249) compliant client for Amazon Athena.

In [2]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
!pip install --disable-pip-version-check -q sagemaker-experiments==0.1.26
!pip install missingno
!pip install scikit-optimize

[0mCollecting missingno
  Downloading missingno-0.5.2-py3-none-any.whl (8.7 kB)
Installing collected packages: missingno
Successfully installed missingno-0.5.2
[0mCollecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m222.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0
[0m

## Globally import libraries

In [3]:
import boto3
from botocore.client import ClientError
import pandas as pd
import numpy as np
from pyathena import connect
from IPython.core.display import display, HTML
import missingno as msno
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Lasso
import datetime as dt
import time
import sagemaker
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
import joblib
import os
from io import BytesIO

%matplotlib inline

## Instantiate AWS SageMaker and S3 sessions

In [4]:
session = boto3.session.Session()
role = sagemaker.get_execution_role()
region = session.region_name
sagemaker_session = sagemaker.Session()
def_bucket = sagemaker_session.default_bucket()
bucket = 'sagemaker-us-east-ads508-sp23-t8'

s3 = boto3.Session().client(service_name="s3",
                            region_name=region)

sm = boto3.Session().client(service_name="sagemaker",
                            region_name=region)

In [5]:
print(f"Default bucket: {def_bucket}")
print(f"Public T8 bucket: {bucket}")

Default bucket: sagemaker-us-east-1-657724983756
Public T8 bucket: sagemaker-us-east-ads508-sp23-t8


## Pass in train and test X from CSV

In [6]:
s3_train_x01_csv_path = f"s3://{def_bucket}/team_8_data/modeling_data/training/train_x01.csv"
train_x01 = pd.read_csv(s3_train_x01_csv_path)
s3_test_x01_csv_path = f"s3://{def_bucket}/team_8_data/modeling_data/testing/test_x01.csv"
test_x01 = pd.read_csv(s3_test_x01_csv_path)

print(f'{train_x01.shape}')
print(f'\n{test_x01.shape}')

(25284, 48)

(6321, 48)


## Pass in train and test y from np array

In [7]:
# Define the S3 object key
train_y01_s3_key = 'team_8_data/modeling_data/training/train_y01.npy'

# Load the numpy array from S3
with BytesIO() as data:
    s3.download_fileobj(def_bucket, train_y01_s3_key, data)
    data.seek(0)
    train_y01 = np.load(data)

# Define the S3 object key
test_y01_s3_key = 'team_8_data/modeling_data/testing/test_y01.npy'

# Load the numpy array from S3
with BytesIO() as data:
    s3.download_fileobj(def_bucket, test_y01_s3_key, data)
    data.seek(0)
    test_y01 = np.load(data)

train_y01 = train_y01.ravel()
test_y01 = test_y01.ravel()

# Confirm that the numpy array was loaded from S3
print(f'{train_y01.shape}')
print(f'{test_y01.shape}')

(25284,)
(6321,)


## Model Training using Grid search with 5-fold cross-validation

### Neural Network

In [8]:
# Start timer script
start_time = dt.datetime.today()

# Citation: Hochberg, 2018; Shanmukh, 2021
m1v1_nn_pip = Pipeline([('si', SimpleImputer(strategy='median')),
                     ('ss', StandardScaler()),
                     ('nn', MLPRegressor(random_state=1699))])

nodes_h = 3
predictors_p = 49

hidden_layer_sizes_hparam = [[100,],
                             [(nodes_h*(predictors_p+1))+nodes_h+1,],
                             [50, 50]
                            ]
activation_hparam = ['logistic', 'relu']
solver_hparam = ['adam']
alpha_hparam = [.0001, .0005, .001]
learn_rate_hparam = ['constant', 'invscaling']

#hidden_layer_sizes_hparam = [[100,]]
#activation_hparam = ['relu']
#solver_hparam = ['adam']
#alpha_hparam = [.0001]
#learn_rate_hparam = ['invscaling']

m1v1_nn_grd = {'nn__hidden_layer_sizes': hidden_layer_sizes_hparam,
            'nn__activation': activation_hparam,
            'nn__solver': solver_hparam,
            'nn__alpha': alpha_hparam,
            'nn__learning_rate': learn_rate_hparam
           }

m1v1_nn = GridSearchCV(m1v1_nn_pip,
                       m1v1_nn_grd,
                       scoring='neg_root_mean_squared_error',
                       n_jobs=2,
                       refit=True,
                       verbose=2)

m1v1_nn.fit(train_x01, train_y01)

print(f'Best Estimator:\n{m1v1_nn.best_estimator_}')

print(pd.DataFrame(m1v1_nn.cv_results_))

train_m1v1_nn_y01_pred = m1v1_nn.predict(train_x01)
print(train_m1v1_nn_y01_pred)

test_m1v1_nn_y01_pred = m1v1_nn.predict(test_x01)
print(test_m1v1_nn_y01_pred)

# Display evaluation metrics
# R-sq
train_m1v1_nn_r2 = r2_score(train_y01, train_m1v1_nn_y01_pred)
test_m1v1_nn_r2 = r2_score(test_y01, test_m1v1_nn_y01_pred)

print(f'Train R-sq:\n{train_m1v1_nn_r2}')
print(f'Test R-sq:\n{test_m1v1_nn_r2}')

# RMSE
train_m1v1_nn_rmse = mean_squared_error(train_y01, train_m1v1_nn_y01_pred, squared=False)
test_m1v1_nn_rmse = mean_squared_error(test_y01, test_m1v1_nn_y01_pred, squared=False)

print(f'Train RMSE:\n{train_m1v1_nn_rmse}')
print(f'Test RMSE:\n{test_m1v1_nn_rmse}')

# End timer script
end_time = dt.datetime.today()
time_elapse = end_time - start_time
print(f'End Time = {end_time}')
print(f'Script Time = {time_elapse}')

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 12.0min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 45.9min
[Parallel(n_jobs=2)]: Done 180 out of 180 | elapsed: 51.7min finished


Best Estimator:
Pipeline(memory=None,
         steps=[('si',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0)),
                ('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('nn',
                 MLPRegressor(activation='relu', alpha=0.0005,
                              batch_size='auto', beta_1=0.9, beta_2=0.999,
                              early_stopping=False, epsilon=1e-08,
                              hidden_layer_sizes=[50, 50],
                              learning_rate='constant',
                              learning_rate_init=0.001, max_fun=15000,
                              max_iter=200, momentum=0.9, n_iter_no_change=10,
                              nesterovs_momentum=True, power_t=0.5,
                              random_state=1699, shuffle=True, solver='adam',
       



In [9]:
s3_m1v1_nn_pqt_base_path = f"../models"

if not os.path.exists(s3_m1v1_nn_pqt_base_path):
    os.makedirs(s3_m1v1_nn_pqt_base_path)

s3_m1v1_nn_pqt_path = os.path.join(s3_m1v1_nn_pqt_base_path,
                                   'm1v1_nn.parquet')

# save the model to disk using joblib
joblib.dump(m1v1_nn,
            s3_m1v1_nn_pqt_path)

# load the saved model from disk using joblib
m1v1_nn_fitted = joblib.load(s3_m1v1_nn_pqt_path)

In [10]:
# specify the S3 bucket and key where you want to save the model
m1v1_nn_key_name = 'team_8_data/models/m1v1_nn.parquet'

# save the model to an in-memory buffer
buffer = BytesIO()
joblib.dump(m1v1_nn, buffer)

# upload the buffer to S3
buffer.seek(0)
s3.upload_fileobj(buffer, def_bucket, m1v1_nn_key_name)

# load the saved model from S3
#buffer = BytesIO()
#s3.download_fileobj(def_bucket, m1v1_nn_key_name, buffer)
#buffer.seek(0)
#m1v1_nn_fitted = joblib.load(buffer)

### Lasso - Using `GridSearchCV`

In [11]:
# Start timer script
start_time = dt.datetime.today()

# Citation: Hochberg, 2018; Shanmukh, 2021
m2v1_ls_pip = Pipeline([('si', SimpleImputer(strategy='median')),
                        ('ss', StandardScaler()),
                        ('ls', Lasso(random_state=1699))])

alpha_hparam = [.01, .05, .1, .5, 1, 2]
selection_hparam = ['cyclic', 'random']


m2v1_ls_grd = {'ls__alpha': alpha_hparam,
               'ls__selection': selection_hparam
           }

m2v1_ls = GridSearchCV(m2v1_ls_pip,
                       m2v1_ls_grd,
                       scoring='neg_root_mean_squared_error',
                       n_jobs=2,
                       refit=True,
                       verbose=2)

m2v1_ls.fit(train_x01, train_y01)

print(f'Best Estimator:\n{m2v1_ls.best_estimator_}')
print(f'Coefficients:\n{m2v1_ls.best_estimator_.named_steps["ls"].coef_}')

print(pd.DataFrame(m2v1_ls.cv_results_))

train_m2v1_ls_y01_pred = m2v1_ls.predict(train_x01)
print(train_m2v1_ls_y01_pred)

test_m2v1_ls_y01_pred = m2v1_ls.predict(test_x01)
print(test_m2v1_ls_y01_pred)

# Display evaluation metrics
# R-sq
train_m2v1_ls_r2 = r2_score(train_y01, train_m2v1_ls_y01_pred)
test_m2v1_ls_r2 = r2_score(test_y01, test_m2v1_ls_y01_pred)

print(f'Train R-sq:\n{train_m2v1_ls_r2}')
print(f'Test R-sq:\n{test_m2v1_ls_r2}')

# RMSE
train_m2v1_ls_rmse = mean_squared_error(train_y01, train_m2v1_ls_y01_pred, squared=False)
test_m2v1_ls_rmse = mean_squared_error(test_y01, test_m2v1_ls_y01_pred, squared=False)

print(f'Train RMSE:\n{train_m2v1_ls_rmse}')
print(f'Test RMSE:\n{test_m2v1_ls_rmse}')

# End timer script
end_time = dt.datetime.today()
time_elapse = end_time - start_time
print(f'End Time = {end_time}')
print(f'Script Time = {time_elapse}')

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   15.4s
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:   18.6s finished


Best Estimator:
Pipeline(memory=None,
         steps=[('si',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0)),
                ('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ls',
                 Lasso(alpha=0.01, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=1699, selection='random',
                       tol=0.0001, warm_start=False))],
         verbose=False)
Coefficients:
[-0.04698528  1.1655262   0.         -1.02395897  1.13308637 -0.
 -0.          0.          0.02814286 -0.         -0.0307876   0.
  0.         -0.         -0.01445511  0.         -0.          2.92859099
  2.50110937  5.60822761  1.40168756 -2.34296164 -1.18943262 -0.33822063
 -1.3657179  -3.4737577  -6.793536

In [12]:
coef_intercept = np.hstack((m2v1_ls.best_estimator_.named_steps["ls"].coef_,
                            m2v1_ls.best_estimator_.named_steps["ls"].intercept_))
#print(coef_intercept)

coef_intercept_df01 = pd.DataFrame(coef_intercept)
#display(coef_intercept_df01)

train_x01_col_names = list(train_x01.columns)
train_x01_col_names.append('intercept')

train_x01_col_names_df01 = pd.DataFrame(train_x01_col_names)
#display(train_x01_col_names_df01)

model_params = pd.concat([train_x01_col_names_df01, coef_intercept_df01], axis=1)
display(model_params)

Unnamed: 0,0,0.1
0,borough_bronx,-0.046985
1,borough_brooklyn,1.165526
2,borough_manhattan,0.0
3,borough_queens,-1.023959
4,borough_staten island,1.133086
5,relative_data_year_-4,-0.0
6,relative_data_year_-3,-0.0
7,relative_data_year_-2,0.0
8,relative_data_year_-1,0.028143
9,relative_data_year_0,-0.0


In [13]:
# specify the S3 bucket and key where you want to save the model
m2v1_ls_key_name = 'team_8_data/models/m2v1_ls.parquet'

# save the model to an in-memory buffer
buffer = BytesIO()
joblib.dump(m2v1_ls, buffer)

# upload the buffer to S3
buffer.seek(0)
s3.upload_fileobj(buffer, def_bucket, m2v1_ls_key_name)

# load the saved model from S3
#buffer = BytesIO()
#s3.download_fileobj(def_bucket, m2v1_ls_key_name, buffer)
#buffer.seek(0)
#m2v1_ls_fitted = joblib.load(buffer)

### Lasso - Using `BayesSearchCV`

In [14]:
# Start timer script
start_time = dt.datetime.today()

# Citation: Hochberg, 2018; Shanmukh, 2021
m2v2_ls_pip = Pipeline([('si', SimpleImputer(strategy='median')),
                        ('ss', StandardScaler()),
                        ('ls', Lasso(random_state=1699))])

alpha_hparam = Real(1e-3, 1e3, prior='log-uniform')
selection_hparam = Categorical(['cyclic', 'random'])
max_iter_hparam = Integer(100, 5000, prior='log-uniform')
warm_start_hparam = Categorical([False, True])


m2v2_ls_grd = {'ls__alpha': alpha_hparam,
               'ls__selection': selection_hparam,
               'ls__max_iter': max_iter_hparam,
               'ls__warm_start': warm_start_hparam
           }

m2v2_ls = BayesSearchCV(m2v2_ls_pip,
                       m2v2_ls_grd,
                       scoring='neg_root_mean_squared_error',
                        cv=5,
                       n_jobs=2,
                       refit=True,
                       verbose=2)

m2v2_ls.fit(train_x01, train_y01)

print(f'Best Estimator:\n{m2v2_ls.best_estimator_}')
print(f'Coefficients:\n{m2v2_ls.best_estimator_.named_steps["ls"].coef_}')

print(pd.DataFrame(m2v2_ls.cv_results_))

train_m2v2_ls_y01_pred = m2v2_ls.predict(train_x01)
print(train_m2v2_ls_y01_pred)

test_m2v2_ls_y01_pred = m2v2_ls.predict(test_x01)
print(test_m2v2_ls_y01_pred)

# Display evaluation metrics
# R-sq
train_m2v2_ls_r2 = r2_score(train_y01, train_m2v2_ls_y01_pred)
test_m2v2_ls_r2 = r2_score(test_y01, test_m2v2_ls_y01_pred)

print(f'Train R-sq:\n{train_m2v2_ls_r2}')
print(f'Test R-sq:\n{test_m2v2_ls_r2}')

# RMSE
train_m2v2_ls_rmse = mean_squared_error(train_y01, train_m2v2_ls_y01_pred, squared=False)
test_m2v2_ls_rmse = mean_squared_error(test_y01, test_m2v2_ls_y01_pred, squared=False)

print(f'Train RMSE:\n{train_m2v2_ls_rmse}')
print(f'Test RMSE:\n{test_m2v2_ls_rmse}')

# End timer script
end_time = dt.datetime.today()
time_elapse = end_time - start_time
print(f'End Time = {end_time}')
print(f'Script Time = {time_elapse}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.6s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.6s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   14.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   14.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.9s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    7.5s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    7.5s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    4.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    4.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    8.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    8.4s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    6.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    6.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   26.6s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   26.6s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   10.9s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   10.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    5.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    5.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   16.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   16.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    4.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.5s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    8.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    8.4s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.4s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.9s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.8s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   14.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   14.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    2.0s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.1s finished


Best Estimator:
Pipeline(memory=None,
         steps=[('si',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0)),
                ('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ls',
                 Lasso(alpha=0.00312004499292689, copy_X=True,
                       fit_intercept=True, max_iter=5000, normalize=False,
                       positive=False, precompute=False, random_state=1699,
                       selection='cyclic', tol=0.0001, warm_start=False))],
         verbose=False)
Coefficients:
[-5.90772722e-02  1.20356958e+00  0.00000000e+00 -1.03729787e+00
  1.13319125e+00 -0.00000000e+00 -0.00000000e+00  2.94787994e-03
  3.38749310e-02 -2.05060234e-04 -2.82296287e-02  2.06826899e-02
  0.00000000e+00 -0.00000000e+00 -4.34594017e-02  0.00000000e+00
 -0.00000000e+00  8.48098006e+00  0

In [15]:
coef_intercept = np.hstack((m2v2_ls.best_estimator_.named_steps["ls"].coef_,
                            m2v2_ls.best_estimator_.named_steps["ls"].intercept_))
#print(coef_intercept)

coef_intercept_df01 = pd.DataFrame(coef_intercept)
#display(coef_intercept_df01)

train_x01_col_names = list(train_x01.columns)
train_x01_col_names.append('intercept')

train_x01_col_names_df01 = pd.DataFrame(train_x01_col_names)
#display(train_x01_col_names_df01)

model_params = pd.concat([train_x01_col_names_df01, coef_intercept_df01], axis=1)
display(model_params)

Unnamed: 0,0,0.1
0,borough_bronx,-0.059077
1,borough_brooklyn,1.20357
2,borough_manhattan,0.0
3,borough_queens,-1.037298
4,borough_staten island,1.133191
5,relative_data_year_-4,-0.0
6,relative_data_year_-3,-0.0
7,relative_data_year_-2,0.002948
8,relative_data_year_-1,0.033875
9,relative_data_year_0,-0.000205


In [16]:
# specify the S3 bucket and key where you want to save the model
m2v2_ls_key_name = 'team_8_data/models/m2v2_ls.parquet'

# save the model to an in-memory buffer
buffer = BytesIO()
joblib.dump(m2v2_ls, buffer)

# upload the buffer to S3
buffer.seek(0)
s3.upload_fileobj(buffer, def_bucket, m2v2_ls_key_name)

# load the saved model from S3
#buffer = BytesIO()
#s3.download_fileobj(def_bucket, m2v2_ls_key_name, buffer)
#buffer.seek(0)
#m2v2_ls_fitted = joblib.load(buffer)

## Release Resources

In [17]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [18]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>