<div class="alert alert-block alert-success">
<b>Kernel Author:</b>  <br>
<a href="https://bhishanpdl.github.io/" , target="_blank">Bhishan Poudel,  Data Scientist, Ph.D Astrophysics</a> .
</div>

# Data Description

This dataset contains house sale prices for King County,
which includes Seattle.
It includes homes sold between May 2014 and May 2015.

- Dependent features: 1 (price)
- Features : 19 home features
- Id:  1 house ID

Task: Estimate the price based on given features.

![](images/data_description.png)


## Model Introduction
Here we will use deep learning method using keras and tensorflow for the regression problem of house price prediction.

# Imports

In [1]:
import time
time_start_notebook = time.time()

In [2]:
%%capture
import os
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:
    ## install modules
    !pip install scikit-plot
    !pip install lrcurve
    !pip install watermark
    !pip install -U scikit-learn

    ## print
    print('Environment: Google Colaboratory.')

In [3]:
import numpy as np
import pandas as pd

# visualization
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# random state
SEED = 100
np.random.seed(SEED)

# sklearn
import sklearn
from sklearn import model_selection
from sklearn import preprocessing

# deep learning
import tensorflow
import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor

# model evaluation
import scikitplot
from scikitplot import metrics as skmetrics
import lrcurve
from lrcurve import KerasLearningCurve

# versions
import watermark
%load_ext watermark
%watermark -a "Bhishan Poudel" -d -v -m
print()
%watermark -iv

Bhishan Poudel 2020-11-04 

CPython 3.6.9
IPython 5.5.0

compiler   : GCC 8.4.0
system     : Linux
release    : 4.19.112+
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit

numpy      1.18.5
scikitplot 0.3.7
sklearn    0.23.2
tensorflow 2.3.0
watermark  2.0.2
matplotlib 3.2.2
keras      2.4.3
seaborn    0.11.0
pandas     1.1.3



# Important Scripts

In [4]:
def set_random_seed(seed):
    import os
    import random
    import numpy as np
    import tensorflow as tf
    
    os.environ['PYTHONHASHSEED']=str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [5]:
def show_methods(obj, ncols=7,start=None, inside=None):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """

    print(f'Object Type: {type(obj)}\n')
    lst = [elem for elem in dir(obj) if elem[0]!='_' ]
    lst = [elem for elem in lst 
           if elem not in 'os np pd sys time psycopg2'.split() ]

    if isinstance(start,str):
        lst = [elem for elem in lst if elem.startswith(start)]
        
    if isinstance(start,tuple) or isinstance(start,list):
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
        
    if isinstance(inside,str):
        lst = [elem for elem in lst if inside in elem]
        
    if isinstance(inside,tuple) or isinstance(inside,list):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]

    return pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')

In [6]:
def adjustedR2(rsquared,nrows,kcols):
    """
    Adjusted r-squared depends on number of rows and columns of Test data.

    It reduces the value of original r-squared value.
    """
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)

In [7]:
def get_model(params,metrics,n_feats):

    # num of layers
    n_layers = len([i for i in list(params.keys()) if i.endswith('_units')])

    # layers
    model = keras.Sequential(name='Sequential')

    # layer 1
    model.add(keras.layers.Dense(
        params['L1_units'],
        activation=params['L1_act'],
        input_shape=(n_feats,),
        name='Layer_1'
        ))
    model.add(keras.layers.Dropout(params['L1_dropout'],
                                seed=SEED,
                                name='Dropout_1'))

    # middle layers
    for i in range(2,n_layers+1): # 2,3, etc
        model.add(keras.layers.Dense(
            params[f'L{i}_units'],
            activation=params[f'L{i}_act'],
            name=f'Layer_{i}'),)
        model.add(keras.layers.Dropout(
            params[f'L{i}_dropout'],
            seed=SEED,
            name=f"Dropout_{i}"))

    # last layer is dense 1 with activation sigmoid
    model.add(keras.layers.Dense(
        1,
        activation=None, # activation = None or linear does nothing
        name=f'Layer_{n_layers+1}'
        ))

    #=================================================== compile
    model.compile(
        optimizer=keras.optimizers.Adam(lr=params['adam_lr']),
        loss='mse',
        metrics=metrics)

    return model

In [8]:
def print_reg_metrics(yt,yp,ncols):
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(yt,yp))
    r2 = sklearn.metrics.r2_score(yt, yp)
    ar2 = adjustedR2(r2, len(yt), ncols)

    out = f"""
    RMSE     : {rmse:,.2f}
    R-squared: {r2:,.6f}
    Adj R2   : {ar2:,.6f}
    """
    print(out)

# Parameters

In [9]:
if ENV_COLAB:
    path_raw = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/'
    proj = 'Projects/King_County_Seattle_House_Price_Kaggle/'
    data_path_parent = path_raw + proj

else:
    data_path_parent = '../data/'

target = 'price'
cols_drop = ['id', 'date', 'zipcode_top10']
cols_sq = ['bedrooms','bathrooms','floors','waterfront','view',
    'age','age_after_renovation','log1p_sqft_living','log1p_sqft_lot',
    'log1p_sqft_above','log1p_sqft_basement',
    'log1p_sqft_living15','log1p_sqft_lot15']

train_size = 0.8

# Load the data

In [10]:
data_path_clean = data_path_parent + 'processed/data_cleaned_encoded.csv'
df_clean = pd.read_csv(data_path_clean)

print(f"df shape : {df_clean.shape}")
display(df_clean.head(2).append(df_clean.tail(2)))

df shape : (21613, 92)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,yr_sales,age,yr_renovated2,age_after_renovation,zipcode_top10,zipcode_houses,basement_bool,renovation_bool,age_cat,age_after_renovation_cat,waterfront_0,waterfront_1,view_0,view_1,view_2,view_3,view_4,condition_1,condition_2,...,grade_7,grade_8,grade_9,zipcode_top10_98004,zipcode_top10_98006,zipcode_top10_98033,zipcode_top10_98039,zipcode_top10_98040,zipcode_top10_98102,zipcode_top10_98105,zipcode_top10_98155,zipcode_top10_98177,zipcode_top10_others,age_cat_0,age_cat_1,age_cat_2,age_cat_3,age_cat_4,age_cat_5,age_cat_6,age_cat_7,age_cat_8,age_cat_9,age_after_renovation_cat_0,age_after_renovation_cat_1,age_after_renovation_cat_2,age_after_renovation_cat_3,age_after_renovation_cat_4,age_after_renovation_cat_5,age_after_renovation_cat_6,age_after_renovation_cat_7,age_after_renovation_cat_8,age_after_renovation_cat_9,log1p_price,log1p_sqft_living,log1p_sqft_lot,log1p_sqft_above,log1p_sqft_basement,log1p_sqft_living15,log1p_sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014,59,1955,59,others,262,0,0,5,5,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,12.309987,7.074117,8.639588,7.074117,0.0,7.201171,8.639588
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,2014,63,1991,23,others,410,1,1,5,2,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,13.195616,7.85205,8.887791,7.682943,5.993961,7.433075,8.941153
21611,291310100,2015-01-16,400000.0,3,2.5,1600,2388,2.0,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287,2015,11,2004,11,others,412,0,0,1,1,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,12.899222,7.378384,7.77863,7.378384,0.0,7.252054,7.160846
21612,1523300157,2014-10-15,325000.0,2,0.75,1020,1076,2.0,0,0,3,7,1020,0,2008,0,98144,47.5941,-122.299,1020,1357,2014,6,2008,6,others,343,0,0,0,0,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,12.691584,6.928538,6.981935,6.928538,0.0,6.928538,7.213768


# Data Processing

In [18]:
df = df_clean.drop(cols_drop, axis=1)

# for col in cols_sq:
#     df[col + '_sq'] = df[col]**2

df_Xtrain,df_Xtest,ser_ytrain,ser_ytest = model_selection.train_test_split(
    df.drop([target],axis=1),
    df[target],
    train_size=train_size,
    random_state=SEED)

ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()


# scaler = preprocessing.StandardScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(df_Xtrain)
Xtrain = scaler.transform(df_Xtrain)
Xtest  = scaler.transform(df_Xtest)

n_feats = Xtrain.shape[-1]

#============================================================
PARAMS_MODEL = {
    # layer 1
    'L1_units': 100,
    'L1_act': 'relu',
    'L1_dropout': 0,

    # layer
    'L2_units': 200,
    'L2_act': 'relu',
    'L2_dropout': 0,

    # layer
    'L3_units': 50,
    'L3_act': 'relu',
    'L3_dropout': 0,

    # optimizer
    'adam_lr': 1e-3,
}

#============================================================
METRICS = ['mse' ]

#============================================================
PARAMS_FIT = {'epochs': 1000,
          'batch_size': 1024,
          'patience': 20,
          'shuffle': True,
          'validation_split': 0.2
          }

#============================================================
# callbacks
cb_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_mse', # val_auc for classification 
    verbose=1,
    patience=PARAMS_FIT['patience'],
    mode='max',
    restore_best_weights=True)

#cb_checkpt = keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")
cb_lr = lrcurve.KerasLearningCurve()
callbacks = [ cb_lr]
#==============================================================================


model = get_model(PARAMS_MODEL,METRICS,n_feats)
print(model.summary())
history = model.fit(
    Xtrain,
    ytrain,
    batch_size=PARAMS_FIT['batch_size'],
    epochs=PARAMS_FIT['epochs'],
    verbose=0,
    callbacks=[cb_early],
    validation_split = PARAMS_FIT['validation_split'],
)


ypreds = model.predict(Xtest).flatten()

print_reg_metrics(ytest,ypreds,Xtest.shape[1])

Model: "Sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Layer_1 (Dense)              (None, 100)               8900      
_________________________________________________________________
Dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
Layer_2 (Dense)              (None, 200)               20200     
_________________________________________________________________
Dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
Layer_3 (Dense)              (None, 50)                10050     
_________________________________________________________________
Dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
Layer_4 (Dense)              (None, 1)                 5