# Decision trees vs linear models

In [1]:
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.datasets import make_friedman1
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

`make_regression` — генерирует случайную линейную зависимость между признаками и таргетом.


`make_friedman1` — нелинейная зависимость: `y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1)`.

## Линейная зависимость

In [2]:
# Synthetic regression problem with a linear ground truth plus Gaussian noise.
# The seed is fixed so that every score computed from this data is reproducible
# after a kernel restart.
X_data, y_data = make_regression(
    n_samples=1000,
    n_features=10,
    noise=100,
    random_state=42,
)

Меняем максимальную глубину дерева

In [3]:
# Mean negated MSE over 5 CV folds for a tree limited to depth 1.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=1),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-40954.514785903775

In [4]:
# Mean negated MSE over 5 CV folds for a tree limited to depth 5.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=5),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-31533.293532954773

In [5]:
# Mean negated MSE over 5 CV folds for a tree limited to depth 10.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-36092.273998457524

Меняем минимальное количество примеров в листе

In [6]:
# Mean negated MSE over 5 CV folds for a tree with min_samples_leaf=2.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=2),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-36510.420180183304

In [7]:
# Mean negated MSE over 5 CV folds for a tree with min_samples_leaf=10.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-30334.863645455207

In [8]:
# Mean negated MSE over 5 CV folds for a tree with min_samples_leaf=20.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=20),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-30657.362108256166

Подберем оптимальные параметры

In [9]:
%%time
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid ={
        'criterion': ['mse', 'mae'],
        'max_depth': range(1, 21, 3),
        'min_samples_leaf': range(1, 21, 3),
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

print(gs.best_params_)
print(gs.best_score_)



{'criterion': 'mae', 'max_depth': 10, 'min_samples_leaf': 10}
-29978.8485487
Wall time: 8.1 s




Сравним с линейной регрессией

In [10]:
# Baseline: mean negated MSE over 5 CV folds for ordinary least squares.
cross_val_score(
    estimator=LinearRegression(),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-10554.024236726773

При линейной зависимости между признаками и таргетом LinearRegression показал себя заметно лучше, чем DT (что неудивительно).

## Нелинейная зависимость

In [11]:
# Synthetic regression problem with a non-linear ground truth (Friedman #1).
# The seed is fixed so that every score computed from this data is reproducible
# after a kernel restart.
X_data, y_data = make_friedman1(
    n_samples=1000,
    n_features=10,
    noise=10,
    random_state=42,
)

Меняем максимальную глубину дерева

In [12]:
# Mean negated MSE over 5 CV folds for a tree limited to depth 1.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=1),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-123.45743926150215

In [13]:
# Mean negated MSE over 5 CV folds for a tree limited to depth 5.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=5),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-135.41769807336149

In [14]:
# Mean negated MSE over 5 CV folds for a tree limited to depth 10.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-202.54709354032602

Меняем минимальное количество примеров в листе

In [15]:
# Mean negated MSE over 5 CV folds for a tree with min_samples_leaf=2.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=2),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-210.79926624624576

In [16]:
# Mean negated MSE over 5 CV folds for a tree with min_samples_leaf=10.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=10),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-154.53933935637258

In [17]:
# Mean negated MSE over 5 CV folds for a tree with min_samples_leaf=20.
cross_val_score(
    estimator=DecisionTreeRegressor(min_samples_leaf=20),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-134.21699179910081

Подберем оптимальные параметры

In [18]:
%%time
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid ={
        'criterion': ['mse', 'mae'],
        'max_depth': range(1, 21, 3),
        'min_samples_leaf': range(1, 21, 3),
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

print(gs.best_params_)
print(gs.best_score_)



{'criterion': 'mse', 'max_depth': 4, 'min_samples_leaf': 19}
-118.525260733
Wall time: 8.55 s




Сравним с линейной регрессией

In [19]:
# Baseline: mean negated MSE over 5 CV folds for ordinary least squares.
cross_val_score(
    estimator=LinearRegression(),
    X=X_data,
    y=y_data,
    scoring='neg_mean_squared_error',
    cv=5,
).mean()

-113.38650270146995

При нелинейной зависимости между признаками и таргетом качество LinearRegression сравнимо с качеством DT.

## Оценка времени работы

In [20]:
# Larger seeded synthetic problem (100000 samples, 30 features) for the
# fit-time measurements.
X_data, y_data = make_regression(
    n_samples=100000,
    n_features=30,
    noise=1000,
    random_state=42,
)

In [21]:
%%time
DecisionTreeRegressor(max_depth=1).fit(X_data, y_data)

Wall time: 705 ms


DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [22]:
%%time
DecisionTreeRegressor(max_depth=2).fit(X_data, y_data)

Wall time: 1.49 s


DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [23]:
%%time
DecisionTreeRegressor(max_depth=4).fit(X_data, y_data)

Wall time: 2.76 s


DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [24]:
%%time
DecisionTreeRegressor(max_depth=10).fit(X_data, y_data)

Wall time: 5.46 s


DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [25]:
%%time
LinearRegression().fit(X_data, y_data)

Wall time: 231 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Преимущества и недостатки решающих деревьев

**Преимущества**
 * хорошо интерпретируются
 * легко обобщаются для регрессии и классификации
 * допускаются разнотипные данные
 
**Недостатки**
 * На линейно разделимой выборке (или при линейной зависимости таргета от признаков) заметно проигрывают линейным алгоритмам
 * Переобучение
 * Неустойчивость к шуму, составу выборки, критерию
 
**Способы устранения недостатков**
 * прунинг (усечение)
 * композиции (леса) деревьев

#### Pruning

<img src='img/pruning.png' width='800' alt='Pruning'>