In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import metrics
from mlxtend.plotting import plot_decision_regions, plot_confusion_matrix
from IPython.display import Image

In [2]:
house = pd.read_csv('../../datasets/kc_house_data.csv')
house.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
X = house.loc[:, ['bedrooms', 'sqft_living', 'grade', 'waterfront', 'yr_built', 'sqft_basement']]
y = house.loc[:, 'price']

### Train - Test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3, 
                                                   random_state = 42)

In [5]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

### Visualization
If graphviz is not installed then paste the generated graph in http://webgraphviz.com/ to visualize it.  

In [6]:
dotfile = open("D:/regressor.dot", 'w')
export_graphviz(model, out_file = dotfile, feature_names = X.columns)
dotfile.close()
# Image(filename='D:/regressor_image.PNG') # Tree takes time to build

### Prediction and RMSE

In [7]:
# Train RMSE
est = model.predict(X_train)
rmse = np.sqrt(metrics.mean_squared_error(y_train, est))
print('Train RMSE: ', rmse)

Train RMSE:  28860.57996223972


In [8]:
# Test RMSE
est = model.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, est))
print('Test RMSE: ', rmse)

Test RMSE:  286675.2055023114


### Feature importance

In [9]:
pd.DataFrame(model.feature_importances_*100, index=X.columns, columns=['which feature plays more imp. role?'])

Unnamed: 0,which feature plays more imp. role?
bedrooms,2.857266
sqft_living,33.732786
grade,39.850454
waterfront,2.882267
yr_built,15.228335
sqft_basement,5.448891


## Changing the model parameters 

In [10]:
model_n = DecisionTreeRegressor(max_depth=6, # If None then internal nodes are splitted until 
                                             # leaf nodes have less than min_samples_split samples. 
                                min_samples_split=3, # The minimum number of samples required to split an internal node
                                min_samples_leaf=3,  # The minimum number of samples required to be at a leaf node
                                max_features='auto', # Number of features to be cosidered while splitting 
                                                     # auto = sqrt(n_features)
                               )
model_n.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=6, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=3,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [11]:
# Train RMSE
est = model_n.predict(X_train)
rmse = np.sqrt(metrics.mean_squared_error(y_train, est))
print('Train RMSE: ', rmse)
# Test RMSE
est = model_n.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, est))
print('Test RMSE: ', rmse)
# Feature importance
pd.DataFrame(model_n.feature_importances_*100, index=X.columns, columns=['which feature plays more imp. role?'])

Train RMSE:  191606.86524988583
Test RMSE:  229863.73440058585


Unnamed: 0,which feature plays more imp. role?
bedrooms,0.134542
sqft_living,33.189
grade,53.779681
waterfront,2.873081
yr_built,9.461005
sqft_basement,0.562691


## Finding the best paramters

In [12]:
from sklearn.model_selection import GridSearchCV, KFold
param_grid = dict(max_depth=range(1, 11, 2), 
                  min_samples_split=range(2,5), 
                  min_samples_leaf=range(2,5))
Kcv = KFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, param_grid, verbose=1, cv=Kcv)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:    1.5s finished


GridSearchCV(cv=KFold(n_splits=3, random_state=42, shuffle=True),
       error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(1, 11, 2), 'min_samples_split': range(2, 5), 'min_samples_leaf': range(2, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [13]:
# Train RMSE
est = grid_search.predict(X_train)
rmse = np.sqrt(metrics.mean_squared_error(y_train, est))
print('Train RMSE: ', rmse)
# Test RMSE
est = grid_search.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, est))
print('Test RMSE: ', rmse)

# Best parameters
grid_search.best_params_

Train RMSE:  184199.15828133587
Test RMSE:  231424.3994334205


{'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}