In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import pipelines
from s3_client import s3_client
import columns as c
import common

# Render floats in fixed-point notation ('{:f}') rather than scientific notation.
pd.set_option('float_format', '{:f}'.format)
# autoreload mode 2: re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# NOTE: this rebinds the imported name `s3_client` to the object returned by
# calling it; every later cell uses `s3_client` as that returned object.
s3_client = s3_client() 

credentials.py 12:19:51 INFO: Found credentials in shared credentials file: ~/.aws/credentials


In [2]:
# Fetch the newest 'sale' frame found under common.FINAL_DATA_PATH; the log
# output below shows a parquet file being downloaded from the flats-data bucket.
# Kept as `_df` so later cells can derive working frames from it.
_df = s3_client.read_newest_df_from_s3(common.FINAL_DATA_PATH, dtype='sale')

s3_client.py 12:19:53 INFO: Downloading sale/final/sale_final_2019_12_31T00_36_41.parquet from flats-data bucket ...
s3_client.py 12:20:00 INFO: Successfully downloaded sale/final/sale_final_2019_12_31T00_36_41.parquet from flats-data bucket.


In [3]:
# Build the working frame from the raw download: drop the heating column,
# keep the first 10000 rows for a quick experiment, and discard incomplete rows.
df = _df.drop(c.HEATING, axis=1)
print(df.shape)
# .copy() makes `df` an independent frame, so the column assignments below
# write to it unambiguously instead of to a slice of `_df`
# (avoids pandas' SettingWithCopyWarning).
df = df[:10000].dropna().copy()
print(df.shape)

# Encode both date columns as YYYYMMDD integers so the model can consume them.
for date_col in (c.DATE_ADDED, c.DATE_REFRESHED):
    df[date_col] = pd.to_datetime(df[date_col]).dt.strftime("%Y%m%d").astype(int)

(240238, 41)
(8834, 41)


In [4]:
# Feature lists per offer type. Only `sale_cols` is consumed in this cell;
# `rent_cols` is kept defined alongside it.
rent_cols = [
    c.CLUSTER_COORDS_FACTOR,
    c.BUILDING_HEIGHT,
    c.SIZE,
    c.FLOOR,
    c.BUILDING_YEAR,
    c.VIEW_COUNT,
    c.DESC_LEN,
    c.FLOOR_N,
    c.LAT,
    c.LON,
]
sale_cols = [
    c.CLUSTER_COORDS_FACTOR,
    c.SIZE,
    c.CLUSTER_ID,
    c.CLUSTER_MEAN_PRICE_M2,
    c.LAT,
    c.BUILDING_YEAR,
    c.LON,
    c.CLUSTER_CENTER_DIST_KM,
    c.DESC_LEN,
    c.VIEW_COUNT,
]

# Remove the price and identifier columns first, then keep only the sale features.
X = df.drop(columns=[c.PRICE, c.PRICE_M2, c.OFFER_ID])[sale_cols]
# Regression target: the price-per-m2 column.
y = df[c.PRICE_M2]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Candidate hyper-parameter values the randomized search samples from.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_depth = [int(x) for x in np.linspace(4, 110, num=11)]
max_depth.append(None)  # None = no depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

random_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

# Seed the forest itself: the search's random_state only fixes which
# candidates are sampled, not the randomness inside each fitted forest.
rf = RandomForestRegressor(random_state=42)

# Randomized search over `random_grid`: n_iter sampled parameter combinations,
# each scored with cv-fold cross-validation, using all available cores.
# Both values are deliberately tiny here (2 candidates x 2 folds = 4 fits) to
# keep the run short; raise n_iter / cv for a more thorough search.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=2,
    cv=2,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
rf_random.fit(X_train, y_train)

def evaluate(model, X_test, y_test):
    """Report a fitted model's hold-out error and return its MAPE.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : features passed straight to ``model.predict``
    y_test : true target values, element-wise compatible with the predictions

    Returns
    -------
    The mean absolute percentage error, expressed in percent.

    Side effects: prints a three-line summary (header, mean absolute error, MAPE).
    """
    abs_err = abs(model.predict(X_test) - y_test)
    mean_abs_err = np.mean(abs_err)
    # Relative error per sample, averaged and scaled to percent.
    mape_pct = 100 * np.mean(abs_err / y_test)

    summary = (
        'Model Performance',
        'Average Error: {:0.4f}'.format(mean_abs_err),
        'MAPE = {:0.2f}%.'.format(mape_pct),
    )
    for line in summary:
        print(line)
    return mape_pct

# Take the best estimator found by the randomized search and score it on the
# held-out test split.
best_rf = rf_random.best_estimator_
mape = evaluate(best_rf, X_test, y_test)

# One row per training feature, ordered from most to least important.
feature_importances = pd.DataFrame(
    {'importance': best_rf.feature_importances_},
    index=X_train.columns,
).sort_values('importance', ascending=False)
feature_importances

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.3min finished


Model Performance
Average Error: 805.8518
MAPE = 11.61%.


Unnamed: 0,importance
cluster_coords_factor__feature,0.503717
cluster_id__feature,0.114612
cluster_mean_price_m2__feature,0.098042
size__offer,0.085443
lat__offer,0.039779
building_year__offer,0.036698
view_count__offer,0.034601
lon__offer,0.034504
cluster_center_dist_km__feature,0.029299
desc_len__offer,0.023305


In [6]:
# Persist the tuned forest under common.MODELS_PATH, tagging the upload with
# the hold-out MAPE formatted to three decimals.
s3_client.upload_model_to_s3_with_timestamp(
    best_rf,
    common.MODELS_PATH,
    dtype='sale',
    keyword='rf',
    metadata={'MAPE': '{:0.3f}%.'.format(mape)},
)

s3_client.py 12:21:45 INFO: Sending sale/models/sale_rf_2019_12_31T12_21_45.joblib to flats-models bucket...
s3_client.py 12:22:03 INFO: Successfully uploaded sale/models/sale_rf_2019_12_31T12_21_45.joblib to flats-models bucket.


True

In [10]:
# Load one specific model by its full bucket/key path.
# NOTE(review): the timestamped path is hard-coded, so this cell keeps pointing
# at that exact upload even after newer models are stored.
model = s3_client.read_model_from_s3('flats-models/sale/models/sale_rf_2019_12_31T12_21_45.joblib')

s3_client.py 12:25:03 INFO: Downloading sale/models/sale_rf_2019_12_31T12_21_45.joblib from flats-models bucket ...
s3_client.py 12:25:20 INFO: Successfully downloaded sale/models/sale_rf_2019_12_31T12_21_45.joblib from flats-models bucket.


In [11]:
# Smoke-test the downloaded model on a handful of rows.
# `.loc[4:10]` slices by index *label* (inclusive), so labels removed by the
# earlier dropna() are simply absent from the selection.
pred = model.predict(df.loc[4:10, sale_cols])

In [12]:
# Show the sample predictions. A bare expression is only rendered when it is
# the final statement of a cell, so nothing may follow it here. `model` is
# already loaded by an earlier cell, so no second download is needed.
pred

array([3243.22693343, 4206.17902351, 8147.23622611, 8284.82632176,
       7473.95792847, 5116.14065348])

In [13]:
# Fetch the newest 'sale' model under common.MODELS_PATH instead of naming a
# file explicitly; the log output below shows which file was resolved.
model = s3_client.read_newest_model_from_s3(common.MODELS_PATH, 'sale')

s3_client.py 12:32:23 INFO: Downloading sale/models/sale_rf_2019_12_31T12_21_45.joblib from flats-models bucket ...
s3_client.py 12:33:50 INFO: Successfully downloaded sale/models/sale_rf_2019_12_31T12_21_45.joblib from flats-models bucket.
