In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

from sklearn import metrics

In [3]:
df = pd.read_csv('BostonHousing.csv')

In [4]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [6]:
x = df.iloc[:, 0:13]
y = df.iloc[:, -1]

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [20]:
clf = DecisionTreeRegressor(criterion='poisson', max_depth=3)

In [21]:
clf.fit(x_train, y_train)

In [22]:
y_pred = clf.predict(x_test)

In [23]:
r2_score(y_test, y_pred)

0.8577972143811856

# Hyperparameter Tuning

In [24]:
param_grid = {
    'max_depth' : [3, 5, 7, 9],
    'criterion' : ['poisson', 'friedman_mse', 'absolute_error'],
    'max_features' : [0.25, 0.5, 1.0],
    'min_samples_split' : [0.25, 0.5, 1.0]
}

In [26]:
reg = GridSearchCV(DecisionTreeRegressor(), param_grid)

In [27]:
reg.fit(x_train, y_train)

In [28]:
reg.best_score_

0.7201826312636849

In [29]:
reg.best_params_

{'criterion': 'poisson',
 'max_depth': 7,
 'max_features': 1.0,
 'min_samples_split': 0.25}

# Feature Importance

In [30]:
for importance, name in sorted(zip(clf.feature_importances_, x.columns), reverse=True):
    print(name, "=", importance)

lstat = 0.700207188676701
rm = 0.2250167011042414
dis = 0.04922910383558282
nox = 0.02554700638347491
zn = 0.0
tax = 0.0
rad = 0.0
ptratio = 0.0
indus = 0.0
crim = 0.0
chas = 0.0
b = 0.0
age = 0.0
