In [1]:
import pandas as pd

# Read the Ames housing table, then split it into the regression target
# (the "SalePrice" column) and the remaining feature columns.
target_name = "SalePrice"
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
target = ames_housing[target_name]
data = ames_housing.drop(columns=target_name)

In [2]:
# Columns used as numerical features by the models in this notebook.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

# Feature table restricted to the columns listed above.
data_numerical = data.loc[:, numerical_features]

# Question 1: linear regression vs. decision tree on the numerical features

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import cross_validate

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Two baseline regressors evaluated on the numerical features only.
# The linear model is preceded by a standardization step.
linear_regression = make_pipeline(StandardScaler(), LinearRegression())

# Fix the seed of the tree: scikit-learn permutes the features at each split,
# so ties between equally good splits are otherwise broken differently from
# one run to the next and the scores below would not be reproducible.
# `tree` is also reused as the base estimator of the grid-search further down.
tree = DecisionTreeRegressor(random_state=0)

# 10-fold cross-validation of each model on the same data; the fitted
# estimator of every fold is kept alongside the test scores.
cv_results_linear_regression = cross_validate(
    linear_regression, data_numerical, target, cv=10, return_estimator=True
)
cv_results_tree = cross_validate(
    tree, data_numerical, target, cv=10, return_estimator=True
)

In [5]:
# Number of cross-validation folds on which the linear model obtains a higher
# test score than the decision tree.
linear_beats_tree = (
    cv_results_linear_regression["test_score"] > cv_results_tree["test_score"]
)
linear_beats_tree.sum()

9

# Question 2: tuning the tree's `max_depth` with a nested cross-validation

In [41]:
import numpy as np
from sklearn.model_selection import GridSearchCV

# Candidate tree depths: 1, 2, ..., 15.
params = {"max_depth": np.arange(1, 16)}

# Inner loop: a 10-fold grid-search picks the depth of the tree.
search = GridSearchCV(estimator=tree, param_grid=params, cv=10)

# Outer loop: 10-fold cross-validation of the whole search procedure. The
# fitted search of every outer fold is kept so that the selected depth can be
# inspected afterwards.
cv_results_tree_optimal_depth = cross_validate(
    estimator=search,
    X=data_numerical,
    y=target,
    cv=10,
    n_jobs=2,
    return_estimator=True,
)

In [42]:
# Show the hyperparameters selected by the grid-search in each outer fold.
fitted_searches = cv_results_tree_optimal_depth["estimator"]
for fitted_search in fitted_searches:
    print(fitted_search.best_params_)

{'max_depth': 8}
{'max_depth': 7}
{'max_depth': 9}
{'max_depth': 7}
{'max_depth': 9}
{'max_depth': 10}
{'max_depth': 6}
{'max_depth': 6}
{'max_depth': 5}
{'max_depth': 7}


# Question 3: depth-tuned decision tree vs. linear regression

In [47]:
# Mean outer-fold test score of the depth-tuned tree.
# The nested cross-validation was already run for Question 2 with the very
# same search, data and folds, so its results are reused here instead of
# refitting every model a second time. Reusing them also guarantees that this
# score comes from the same fits as the `max_depth` values printed for
# Question 2.
cv_results_tree_optimal_depth["test_score"].mean()

0.6796029814705333

In [48]:
# Number of cross-validation folds on which the depth-tuned tree obtains a
# higher test score than the linear model.
tuned_tree_beats_linear = (
    cv_results_tree_optimal_depth["test_score"]
    > cv_results_linear_regression["test_score"]
)
tuned_tree_beats_linear.sum()

2

# Question 4: decision tree on the numerical features only vs. on all the features

In [51]:
from sklearn.compose import make_column_selector as selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

# Split the columns of the full feature table by dtype: object columns are
# handled as categorical, every other column as numerical.
categorical_columns = selector(dtype_include=object)(data)
numerical_columns = selector(dtype_exclude=object)(data)

# Integer-encode the categorical columns (categories not seen during `fit`
# are encoded as -1) and standardize the numerical ones.
preprocessor = make_column_transformer(
    (
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        categorical_columns,
    ),
    (StandardScaler(), numerical_columns),
)

# NOTE: the variable names are easy to misread.
#   * `tree_without_numerical_data` is fitted on `data_numerical`, i.e. on the
#     numerical features ONLY (no categorical data);
#   * `tree_with_numerical_data` is fitted on every column of `data`
#     (categorical + numerical) through `preprocessor`.
# Both trees use the same depth and a fixed seed, so the comparison is
# reproducible and not affected by random tie-breaking between splits.
tree_without_numerical_data = DecisionTreeRegressor(max_depth=7, random_state=0)
tree_with_numerical_data = make_pipeline(
    preprocessor, DecisionTreeRegressor(max_depth=7, random_state=0)
)

# 10-fold cross-validation of each variant.
cv_results_tree_without_numerical_data = cross_validate(
    tree_without_numerical_data, data_numerical, target, cv=10, return_estimator=True
)
cv_results_tree_with_numerical_data = cross_validate(
    tree_with_numerical_data, data, target, cv=10, return_estimator=True
)

In [52]:
# Number of cross-validation folds on which the tree fitted on `data_numerical`
# obtains a higher test score than the tree fitted on all columns of `data`.
numerical_only_beats_all_features = (
    cv_results_tree_without_numerical_data["test_score"]
    > cv_results_tree_with_numerical_data["test_score"]
)
numerical_only_beats_all_features.sum()

1