In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

import pipelines
from s3_client import s3_client
import columns as c
import common

# S3 locations of the data-pipeline stages.
# Each template keeps a literal "{data_type}" placeholder (hence the doubled
# braces inside the f-strings); presumably it is filled in with one of
# DATA_TYPES by the code that consumes these paths.
S3_DATA_BUCKET = "flats-data"
DATA_TYPES = ("sale", "rent")
RAW_DATA_PATH = f"{S3_DATA_BUCKET}/{{data_type}}/raw"
CONCATED_DATA_PATH = f"{S3_DATA_BUCKET}/{{data_type}}/concated"
CLEAN_DATA_PATH = f"{S3_DATA_BUCKET}/{{data_type}}/clean"
FINAL_DATA_PATH = f"{S3_DATA_BUCKET}/{{data_type}}/final"
PREDICTED_DATA_PATH = f"{S3_DATA_BUCKET}/{{data_type}}/predicted"
TO_UPLOAD_DATA_PATH = f"{S3_DATA_BUCKET}/{{data_type}}/to_upload"

# S3 locations of the model artifacts, templated the same way.
S3_MODELS_BUCKET = "flats-models"
COORDS_MAP_MODELS_PATH = f"{S3_MODELS_BUCKET}/{{data_type}}/coords_encoding"
MODELS_PATH = f"{S3_MODELS_BUCKET}/{{data_type}}/models"

pd.set_option('float_format', '{:f}'.format)
%load_ext autoreload
%autoreload 2

s3_client = s3_client() 

DTYPE = 'sale'

credentials.py 15:51:56 INFO: Found credentials in shared credentials file: ~/.aws/credentials


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Load the newest dataset stored under the "final" path for the selected DTYPE.
# Kept under a separate name (`_df`) so later cells can rebuild `df` from it.
# NOTE(review): `dtype` here looks like the offer type (sale/rent), not a
# pandas dtype - confirm against s3_client.
_df = s3_client.read_newest_df_from_s3(
    FINAL_DATA_PATH,
    dtype=DTYPE,
)

s3_client.py 15:51:58 INFO: Downloading sale/final/sale_final_2020_12_18T16_12_19.csv from flats-data bucket ...
s3_client.py 15:52:41 INFO: Successfully downloaded sale/final/sale_final_2020_12_18T16_12_19.csv from flats-data bucket.


In [11]:
# Rebuild `df` from the untouched download: remove the heating column, then
# discard rows with any missing value, printing the shape after each step.
df = _df.drop(columns=c.HEATING)
print(df.shape)

df = df.dropna()
print(df.shape)

# Re-encode both date columns as YYYYMMDD integers (e.g. 20201218).
df = df.assign(**{
    date_column: pd.to_datetime(df[date_column]).dt.strftime("%Y%m%d").astype(int)
    for date_column in (c.DATE_ADDED, c.DATE_REFRESHED)
})

(514459, 46)
(514458, 46)


In [12]:
# Candidate features: everything except the two price columns and the offer id.
X = df.drop([c.PRICE, c.PRICE_M2, c.OFFER_ID], axis=1)

# Narrow the features down to the model inputs configured for the offer type.
if DTYPE == 'sale':
    X = X[common.SALE_MODEL_INPUTS]
elif DTYPE == 'rent':
    X = X[common.RENT_MODEL_INPUTS]
else:
    # Fail fast: without this branch an unexpected DTYPE would silently train
    # on every remaining column instead of the configured model inputs.
    raise ValueError(
        "Unsupported DTYPE {!r}; expected 'sale' or 'rent'.".format(DTYPE)
    )

# Target: the c.PRICE_M2 column (presumably price per square metre).
y = df[c.PRICE_M2]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Candidate hyperparameter values the randomized search samples from.
n_estimators = [int(x) for x in np.linspace(start=20, stop=200, num=10)]
max_depth = [int(x) for x in np.linspace(6, 50, num=11)]
max_depth.append(None)  # None = no explicit depth limit
min_samples_split = [2, 5, 10, 100]
min_samples_leaf = [1, 2, 4]

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Seed the forest itself: the search's random_state only fixes which parameter
# combinations are drawn, not the randomness inside each fitted forest, so an
# unseeded estimator makes the selected model differ between runs.
rf = RandomForestRegressor(random_state=42)

# Random search of parameters: 10 sampled combinations, each scored with
# 2-fold cross validation, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=2,
    verbose=3,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
rf_random.fit(X_train, y_train)

def evaluate(model, X_test, y_test):
    """Print and return the hold-out error of a fitted regressor.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    X_test : array-like
        Features passed unchanged to ``model.predict``.
    y_test : array-like
        True target values, aligned with the rows of ``X_test``.

    Returns
    -------
    float
        Mean absolute percentage error (MAPE), expressed in percent.

    Notes
    -----
    Each absolute error is divided by the magnitude of its target, so negative
    targets cannot produce negative percentages that cancel other errors.
    Targets equal to zero are not special-cased.
    """
    predictions = model.predict(X_test)
    errors = np.abs(predictions - y_test)
    # Divide by |y|, not y: MAPE is defined on the magnitude of the target.
    mape = 100 * np.mean(errors / np.abs(y_test))
    print('Model Performance')
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print('MAPE = {:0.2f}%.'.format(mape))
    return mape

# Take the best estimator found by the random search (nothing is fitted here)
# and score it on the held-out test set.
best_rf = rf_random.best_estimator_
mape = evaluate(best_rf, X_test, y_test)

# Feature importances of the selected forest, most important first.
feature_importances = pd.DataFrame(
    {'importance': best_rf.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)
feature_importances

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Model Performance
Average Error: 687.3527
MAPE = 9.56%.


Unnamed: 0,importance
cluster_coords_factor__feature,0.592084
lat__offer,0.090092
lon__offer,0.087457
size__offer,0.066362
building_year__offer,0.054468
desc_len__offer,0.036349
building_height__offer,0.02275
floor__offer,0.020998
view_count__offer,0.018677
floor_number__clean,0.010763


In [14]:
# Upload the selected forest, attaching the hold-out MAPE as metadata.
# NOTE(review): the destination comes from common.MODELS_PATH while this
# notebook also defines its own MODELS_PATH - confirm the two agree.
s3_client.upload_model_to_s3_with_timestamp(
    best_rf,
    common.MODELS_PATH,
    dtype=DTYPE,
    keyword='rf',
    metadata={'MAPE': '{:0.3f}%.'.format(mape)},
)

s3_client.py 16:42:37 INFO: Sending sale/models/sale_rf_2021_01_14T16_42_06.joblib to flats-models bucket...
s3_client.py 16:45:17 INFO: Successfully uploaded sale/models/sale_rf_2021_01_14T16_42_06.joblib to flats-models bucket.


True