In [2]:
import numpy as np
import pandas as pd
import os

In [19]:
# Data preparation
ROOTDIR = 'datasets'
FILENAME = 'housing.csv'
FILEPATH = os.path.join(ROOTDIR, FILENAME)

df = pd.read_csv(FILEPATH)
df = df.dropna()
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,20433.0,-119.570689,2.003578,-124.35,-121.8,-118.49,-118.01,-114.31
latitude,20433.0,35.633221,2.136348,32.54,33.93,34.26,37.72,41.95
housing_median_age,20433.0,28.633094,12.591805,1.0,18.0,29.0,37.0,52.0
total_rooms,20433.0,2636.504233,2185.269567,2.0,1450.0,2127.0,3143.0,39320.0
total_bedrooms,20433.0,537.870553,421.38507,1.0,296.0,435.0,647.0,6445.0
population,20433.0,1424.946949,1133.20849,3.0,787.0,1166.0,1722.0,35682.0
households,20433.0,499.433465,382.299226,1.0,280.0,409.0,604.0,6082.0
median_income,20433.0,3.871162,1.899291,0.4999,2.5637,3.5365,4.744,15.0001
median_house_value,20433.0,206864.413155,115435.667099,14999.0,119500.0,179700.0,264700.0,500001.0


In [27]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Encoding for ocean_proximity
le = LabelEncoder()
df['ocean_proximity_encoded'] = le.fit_transform(df.ocean_proximity.values)

In [29]:
# Create datasets
y_col = ['median_house_value']
not_included_cols = ['longitude', 'latitude', 'ocean_proximity']
X_cols = df.columns.difference(y_col + not_included_cols)

X = df[X_cols]
y = df[y_col]

scale_features = True

# Scaler
scaler = StandardScaler()
if scale_features:
    X = scaler.fit_transform(X)
    y = scaler.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((13690, 7), (13690, 1), (6743, 7), (6743, 1))

In [64]:
# Gridsearch for the best regression model
from sklearn.model_selection import GridSearchCV
parameters = {
    'n_estimators':[2,5,10,20,50],
    'criterion':['mse'],
    'max_depth':[2,4,6,8,10],
    'min_samples_split':[2,3],
    'min_samples_leaf':[1,2,3],
    'bootstrap': [False, True],
    'n_jobs':-1,
    }

clf = GridSearchCV(RandomForestRegressor(), 
    parameters, 
    verbose=1,
    scoring='neg_mean_squared_error')
clf.fit(X_train, y_train.reshape(-1))
best_model = clf.best_estimator_

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [61]:
# Evaluation with test set
from sklearn.metrics import mean_squared_error

y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
inversed_mse = scaler.inverse_transform([mse])
print("Error +-:", np.sqrt(inversed_mse), "USD")

Error +-: [491.4650148] USD


In [62]:
# Check feature importance from the datasets
features_importance = zip(X_cols, best_model.feature_importances_)
list(sorted(features_importance, key=lambda x:x[1], reverse=True))

[('median_income', 0.6209427846163105),
 ('ocean_proximity_encoded', 0.13378168559696907),
 ('housing_median_age', 0.07177020499640131),
 ('population', 0.0661787746304712),
 ('total_bedrooms', 0.050338252197811836),
 ('households', 0.032145451286201686),
 ('total_rooms', 0.024842846675834467)]