In [105]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [106]:
# Assemble the frame from the arrays inside the Bunch returned by
# load_boston() instead of re-reading the bundled CSV via its 'filename' entry.
# NOTE(review): 'filename' appears to be a bare file name rather than a full
# path in some scikit-learn releases, which would break pd.read_csv here —
# confirm against the installed version.
boston = load_boston()
data = (
    pd.DataFrame(boston['data'], columns=boston['feature_names'])
    .assign(MEDV=boston['target'])
    # CHAS, RAD and TAX hold whole numbers; keep them as integer columns so
    # the frame matches what parsing the CSV produced.
    .astype({'CHAS': 'int64', 'RAD': 'int64', 'TAX': 'int64'})
)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [107]:
# MEDV is the regression target; every remaining column is a predictor.
y = data['MEDV']
X = data.drop('MEDV', axis='columns')

# Keep 20% of the rows aside for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Линейная регрессия

In [108]:
# Baseline model: ordinary least squares fitted on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train);  # trailing ';' keeps the estimator repr out of the cell output

In [109]:
# Score of the linear baseline on the held-out split (shown as the cell output).
lin_reg.score(X=X_test, y=y_test)

0.6687594935356279

# Дерево решений

In [110]:
# Depth reached by a tree fitted with default settings on the training split;
# the depth search further down uses it as its upper bound.
unconstrained_tree = DecisionTreeRegressor()
unconstrained_tree.fit(X_train, y_train)
default_depth = unconstrained_tree.get_depth()

def get_dtr_score(depth, leaves, n_repeats=10):
    """Return the mean test-split score of repeatedly fitted decision trees.

    Each repetition fits a fresh ``DecisionTreeRegressor`` on the notebook-level
    ``X_train`` / ``y_train`` and scores it on ``X_test`` / ``y_test``. No
    ``random_state`` is passed to the estimator, so repetitions are not
    guaranteed to give identical scores; the result is their average.

    Parameters
    ----------
    depth : int
        Forwarded to the estimator as ``max_depth``.
    leaves : int
        Forwarded to the estimator as ``min_samples_leaf``.
    n_repeats : int, default 10
        Number of fit/score repetitions to average over.

    Returns
    -------
    float
        Sum of the individual ``score`` results divided by ``n_repeats``.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError('n_repeats must be at least 1, got {!r}'.format(n_repeats))

    sum_score = 0
    for _ in range(n_repeats):
        dtr = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=leaves)
        dtr.fit(X_train, y_train)
        sum_score += dtr.score(X_test, y_test)

    return sum_score / n_repeats

In [111]:
# Grid of averaged scores: row i holds max_depth = i + 1, column j holds
# min_samples_leaf = j + 1 (1..20).
# NOTE(review): get_dtr_score evaluates on the test split, so picking the best
# cell of this grid tunes hyperparameters on test data — consider a separate
# validation split or cross-validation.
score = np.array([
    [get_dtr_score(max_depth, min_leaf) for min_leaf in range(1, 21)]
    for max_depth in range(1, default_depth + 1)
])

In [112]:
# Position of the highest value in the score grid; argmax returns the first
# maximum in row-major order when several cells tie.
coords = np.unravel_index(np.argmax(score), score.shape)
# Grid indices are 0-based while the searched parameter values start at 1.
best_depth, best_leaves = coords[0] + 1, coords[1] + 1

print('Optimum')
# Report the value stored in the grid: calling get_dtr_score again would fit
# new, unseeded trees and could print a score that differs from the maximum
# that was actually found.
print('Depth: {}, Min number of samples in leaves: {}, \nScore: {}'.format(
    best_depth, best_leaves, score[coords]))
print('-------------------\nLinear Regression score:', lin_reg.score(X_test, y_test))

Optimum
Depth: 7, Min number of samples in leaves: 3, 
Score: 0.8847578253578587
-------------------
Linear Regression score: 0.6687594935356279


In [113]:
# Plain-text dump of the score grid: a header listing the min_samples_leaf
# values, followed by one line per max_depth value.
print('DEPTH\t|LEAVES\n---------------')
leaf_values = list(range(1, score.shape[1] + 1))
print('     \t|{}'.format(leaf_values))
for depth_value, depth_scores in enumerate(score, start=1):
    print('{}    \t|{}'.format(depth_value, depth_scores))

DEPTH	|LEAVES
---------------
     	|[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
1    	|[0.3602157 0.3602157 0.3602157 0.3602157 0.3602157 0.3602157 0.3602157
 0.3602157 0.3602157 0.3602157 0.3602157 0.3602157 0.3602157 0.3602157
 0.3602157 0.3602157 0.3602157 0.3602157 0.3602157 0.3602157]
2    	|[0.64554957 0.64554957 0.64554957 0.64554957 0.64554957 0.64554957
 0.64554957 0.64554957 0.64554957 0.64554957 0.64554957 0.64554957
 0.64554957 0.64554957 0.64554957 0.64554957 0.64554957 0.64554957
 0.64554957 0.64554957]
3    	|[0.80709666 0.79501241 0.81461802 0.80547311 0.71759757 0.71616719
 0.71506919 0.70595595 0.70595595 0.70595595 0.70595595 0.70595595
 0.70589529 0.69995525 0.69995525 0.69995525 0.69995525 0.69995525
 0.69995525 0.69995525]
4    	|[0.81733225 0.77011727 0.85156974 0.84225613 0.72074948 0.71942503
 0.71629093 0.73178022 0.72881799 0.72817192 0.71698547 0.70856518
 0.71051005 0.7084466  0.7084466  0.7084466  0.7084466  0.7084466
 0.708446