# [Wrap-up quiz 5](https://inria.github.io/scikit-learn-mooc/trees/trees_wrap_up_quiz.html)

In [1]:
import pandas as pd

ames_housing = pd.read_csv('./datasets/ames_housing_no_missing.csv')
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
target = ames_housing[target_name]

In [2]:
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

data_numerical = data[numerical_features]

In [12]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [7]:
model_linear = make_pipeline(StandardScaler(), LinearRegression())
model_tree = DecisionTreeRegressor(random_state=0)

In [11]:
cv_linear = cross_validate(model_linear, data_numerical, target, cv=10, return_estimator=True)
cv_tree = cross_validate(model_tree, data_numerical, target, cv=10, return_estimator=True)

In [13]:
np.count_nonzero(cv_linear['test_score'] > cv_tree['test_score'])

9

By comparing the cross-validation test scores for both models fold-to-fold, one can see that the linear model is substantially better than the decision tree

---

In [16]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

In [52]:
param_grid = {'max_depth': np.arange(1, 16)}

inner_cv = KFold(n_splits=10, shuffle=True, random_state=0)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=0)

model_tree = DecisionTreeRegressor(random_state=0)
model_tree_opt = GridSearchCV(
    estimator=model_tree, param_grid=param_grid, cv=inner_cv, n_jobs=4
)

In [53]:
cv_tree_opt = cross_validate(
    model_tree_opt, data_numerical, target, cv=outer_cv, return_estimator=True
)
cv_linear = cross_validate(model_linear, data_numerical, target, cv=outer_cv, return_estimator=True)

In [54]:
[est.best_params_['max_depth'] for est in cv_tree_opt['estimator']]

[8, 5, 8, 8, 5, 7, 5, 6, 7, 5]

The optimal depth is ranging from 5 to 8

---

In [56]:
for r in range(4):
    model_tree = DecisionTreeRegressor(random_state=r)
    
    
    model_tree_opt = GridSearchCV(
        estimator=model_tree, param_grid=param_grid, cv=inner_cv, n_jobs=4
    )
    
    cv_tree_opt = cross_validate(
        model_tree_opt, data_numerical, target, 
        cv=outer_cv, return_estimator=True
    )
    cv_linear = cross_validate(
        model_linear, data_numerical, target, 
        cv=outer_cv, return_estimator=True)
    print("Number of times the linear model is better than the tree with optimal depth: ")
    print(np.count_nonzero(cv_linear['test_score'] > cv_tree_opt['test_score']))

Number of times the linear model is better than the tree with optimal depth: 
7
Number of times the linear model is better than the tree with optimal depth: 
7
Number of times the linear model is better than the tree with optimal depth: 
8
Number of times the linear model is better than the tree with optimal depth: 
7


A tree with tuned depth is often but not always worse than the linear model

---

In [65]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector as selector

In [71]:
numerical_columns = selector(dtype_exclude=object)(data)
categorical_columns = selector(dtype_include=object)(data)

categorical_preprcessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('ordinalencoder', categorical_preprcessor, categorical_columns),
    ('numerical', numerical_preprocessor, numerical_columns)
])
for r in range(5):
    model = make_pipeline(
        preprocessor,
        DecisionTreeRegressor(max_depth=7, random_state=r)
    )
    
    cv_tree = cross_validate(
        model, data, target, cv=10, return_estimator=True
    )
    
    cv_linear = cross_validate(model_linear, data_numerical, target, 
                             cv=10, return_estimator=True)
    print("Number of times the linear model is better than the tree with optimal depth: ")
    print(np.count_nonzero(cv_linear['test_score'] > cv_tree_opt['test_score']))

Number of times the linear model is better than the tree with optimal depth: 
7
Number of times the linear model is better than the tree with optimal depth: 
7
Number of times the linear model is better than the tree with optimal depth: 
7
Number of times the linear model is better than the tree with optimal depth: 
7
Number of times the linear model is better than the tree with optimal depth: 
7


A tree model trained with both numerical and categorical features is most often worse than the tree model using only the numerical features