In [1]:
import pandas as pd
import numpy as np

In [15]:
# Fetch the housing dataset (a CSV hosted on GitHub) into a DataFrame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/aastha12/Machine-Learning/master/Regression/Linear%20Regression/housing.csv"
)

In [16]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Missing values

In [18]:
# Count the missing entries in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [19]:
import plotly.express as px
import plotly.graph_objects as go

# Box plot of total_bedrooms to inspect its spread and outliers before choosing
# how to impute the missing values. The frame and column are passed explicitly
# (rather than a bare Series) so the axis is labelled with the column name, and
# a title is set so the figure can be read on its own.
px.box(df, y="total_bedrooms", title="Distribution of total_bedrooms")

Since `total_bedrooms` has a lot of outliers, we will fill its missing values with the median, which is less sensitive to extreme values than the mean.

In [20]:
# Impute the missing total_bedrooms values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['total_bedrooms']: that form works on an
# intermediate Series, raises a chained-assignment warning in recent pandas
# and leaves df unchanged when copy-on-write is enabled.
# NOTE(review): the median is computed over all rows, including rows that
# later land in the test split - consider imputing after the train/test split.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

# Confirm that no missing values remain.
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

## One hot encoding

In [21]:
# One-hot encode the categorical ocean_proximity column.
# dtype="uint8" pins the indicator columns to 0/1 integers. Newer pandas
# versions default to booleans, which would not match the 0/1 values shown in
# the output below and would change the dtype of the feature matrix built later.
one_hot_df = pd.get_dummies(df["ocean_proximity"], dtype="uint8")

# Swap the original column for its indicator columns. A new frame is built
# (instead of drop(..., inplace=True)) so df is not mutated in place.
df = pd.concat([df.drop(columns=["ocean_proximity"]), one_hot_df], axis=1)
df.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0


We do not need to scale the data (source:https://stats.stackexchange.com/questions/72231/decision-trees-variable-feature-scaling-and-variable-feature-normalization#:~:text=1%20Answer&text=For%201%2C%20decision%20trees%20in,or%20other%20methods%20like%20SVM.) or detect and treat outliers(source:https://datascience.stackexchange.com/questions/37394/are-decision-trees-robust-to-outliers#:~:text=Yes.,is%20a%20point%20from%20lines.&text=Most%20likely%20outliers%20will%20have,not%20on%20their%20absolute%20values).) for decision trees.

## Train/Test split

In the interest of preventing information about the distribution of the test set leaking into your model, any preprocessing step that is fitted to the data (for example a scaler) should be fitted on the training data only and then applied to both the training and test sets. For this reason, we perform the train/test split before any such step. As noted above, decision trees do not need feature scaling, so no scaler is applied in this notebook.

In [22]:
from sklearn.model_selection import train_test_split

# Features: every column except the target.
X = df.drop(columns=['median_house_value'])

# Target, selected as a single-column frame so y is 2-D at this point
# (see the (n, 1) label shapes printed below).
y = df[['median_house_value']]

X = np.array(X)
y = np.array(y)

# Hold out 25% of the rows for testing. random_state is fixed so the split,
# and therefore every score reported further down, is the same on each
# Restart & Run All; without it each run draws a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
print("Shape of training set features:", X_train.shape)
print("Shape of training set labels:", y_train.shape)
print("Shape of testing set features:", X_test.shape)
print("Shape of testing set labels:", y_test.shape)

Shape of training set features: (15480, 13)
Shape of training set labels: (15480, 1)
Shape of testing set features: (5160, 13)
Shape of testing set labels: (5160, 1)


In [23]:
# Flatten the (n, 1) label arrays to 1-D.
y_train, y_test = y_train.ravel(), y_test.ravel()

print("Shape of training set labels:", y_train.shape)
print("Shape of testing set labels:", y_test.shape)

Shape of training set labels: (15480,)
Shape of testing set labels: (5160,)


## Fit the model

In [35]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Fit a tree with default hyperparameters. random_state is the seed value,
# fixed so that repeated runs build the same tree.
tree = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# Score the tree on the same data it was trained on.
predicted_train = tree.predict(X_train)
train_mse = mean_squared_error(y_train, predicted_train)
print("RMSE is:", np.sqrt(train_mse))

RMSE is: 0.0


Since the RMSE on the training set is 0, the decision tree has clearly overfit the training set.

## Hyperparameter Optimization 

In [25]:
# Display the fitted estimator to see the hyperparameter values it currently uses.
tree

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

Increasing the `min_*` hyperparameters or reducing the `max_*` hyperparameters will regularize the model.


In [39]:
from sklearn.model_selection import GridSearchCV

# Search space used to regularise the tree: 3 * 98 * 3 * 3 * 3 = 7,938
# candidates, each fitted 5 times by the cross-validation below.
# NOTE(review): with max_depth <= 5 a binary tree has at most 2**5 = 32 leaves,
# so max_leaf_nodes values above 32 should not change the fitted tree; likewise
# min_samples_split values below 2 * min_samples_leaf look redundant. Confirm
# against the scikit-learn documentation before trimming the grid.
param_dist = {"max_depth": list(range(3, 6)),  # currently, it is None
              "max_leaf_nodes": list(range(2, 100)),
              "min_samples_split": [2, 3, 4],  # currently it is 2
              "max_features": list(range(3, 6)),  # currently it is None. Take from square root of number of features to 30-40% of total number of features
              "min_samples_leaf": [2, 3, 4]}  # currently it is 1

# Exhaustive 5-fold cross-validated search scored by negative MSE.
# n_jobs=-1 spreads the fits over all available CPU cores; the candidates and
# their scores are unchanged, only the wall-clock time of the cell drops.
tree_tuned = GridSearchCV(tree, param_dist, cv=5,
                          scoring='neg_mean_squared_error', n_jobs=-1)
tree_tuned.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=0,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5], 'max_features': [3, 4, 5],
                         'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16,

In [40]:
# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_tuned.best_params_))
print("Best MSE score is {}".format(-tree_tuned.best_score_))
print("Best RMSE score is {}".format(np.sqrt(-tree_tuned.best_score_)))

Tuned Decision Tree Parameters: {'max_depth': 5, 'max_features': 3, 'max_leaf_nodes': 32, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best MSE score is 6133662519.940824
Best RMSE score is 78317.7024684766


You can see that the cross-validated RMSE (computed with 5-fold cross-validation on the training set) is about 78318 after hyperparameter tuning.

## Test Set

In [41]:
# Predict on the held-out test set with the tuned search object and report the RMSE.
yhat = tree_tuned.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, yhat))
print("RMSE on test set is:", test_rmse)

RMSE on test set is: 79487.66856906176


Surprisingly, the fine-tuned `LinearRegression()` did better than the decision tree. Generally, you should use a random forest over a single decision tree, since averaging many trees reduces overfitting.