In [1]:
from ingestion import make_cleaned, make_train_test
from metadata_helpers import save_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.regressor import ResidualsPlot
from sklearn.metrics import mean_squared_error


In [3]:
# Load the train/test split produced by the project's ingestion helper.
X_train, X_test, y_train, y_test = make_train_test()

# One-hot encoding is restricted (via `variables`) to the object-dtype columns.
categoric_cols = X_train.select_dtypes(include=object).columns.tolist()
OH_encoder = SklearnTransformerWrapper(
    transformer=OneHotEncoder(
        sparse_output=False,
        drop='if_binary',
        min_frequency=0.1,
        handle_unknown='ignore',
    ),
    variables=categoric_cols,
)

# Baseline pipeline: encoder followed by a seeded decision tree regressor.
pipe = make_pipeline(OH_encoder, DecisionTreeRegressor(random_state=42))

# Search grid: 7 evenly spaced ccp_alpha values in [0, 0.08] and tree depths 1..14.
ccp_range = np.linspace(0, 0.08, num=7)
depth_range = np.arange(1, 15)

# 10-fold cross-validated grid search, scored by (negated) RMSE.
gs = GridSearchCV(
    estimator=pipe,
    param_grid={
        'decisiontreeregressor__max_depth': depth_range,
        'decisiontreeregressor__ccp_alpha': ccp_range,
    },
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=3,
)


In [4]:
# Run the cross-validated grid search on the training split.
gs.fit(X=X_train, y=y_train)




In [5]:
# Display the hyper-parameter combination selected by the grid search.
gs.best_params_


{'decisiontreeregressor__ccp_alpha': 0.0,
 'decisiontreeregressor__max_depth': 8}

In [None]:
# Validation curve over tree depth, using a separate pipeline with the same
# preprocessing and the same depth range as the grid search.
pipe_validation = make_pipeline(
    OH_encoder, DecisionTreeRegressor(random_state=42)
)
viz = ValidationCurve(
    pipe_validation,
    param_name='decisiontreeregressor__max_depth',
    param_range=depth_range,
    cv=10,
    # Score with the same metric the grid search optimises. Without this the
    # visualizer falls back to the estimator's default scorer (R^2 for
    # regressors), so the curve would not be comparable with the grid search.
    scoring='neg_root_mean_squared_error',
)
# Fit and show the visualizer
viz.fit(X_train, y_train)
viz.show()


In [None]:
# Final model: same encoder, tree depth fixed at 8 (the depth reported by the
# grid search output above).
pipe_final = make_pipeline(
    OH_encoder,
    DecisionTreeRegressor(max_depth=8, random_state=42),
)

# Residuals diagnostic with a Q-Q plot in place of the histogram: the pipeline
# is fitted on the training split and then scored on the held-out split.
visualizer = ResidualsPlot(pipe_final, qqplot=True, hist=False)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()


In [None]:
# Root-mean-squared error of the final pipeline on the held-out test split.
# The square root is taken explicitly because the `squared=False` flag of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6.
# Arguments follow the documented (y_true, y_pred) order.
# NOTE(review): assumes a single regression target — for multi-output targets
# sqrt(mean MSE) differs from the averaged per-output RMSE; confirm.
y_pred_test = pipe_final.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f"rmse = {rmse_test}")


In [None]:
# Record the final pipeline together with its test RMSE via the project helper.
# NOTE(review): what exactly gets persisted is defined in metadata_helpers,
# which is not visible here — confirm before relying on it.
save_score(pipe_final, rmse_test)
  
# Individual decision trees can perform better when their complexity is limited below what a validation parameter scan suggests.
# They generalize better if we prefer a parsimonious tree.

# Plot first few levels of tree

In [None]:
from sklearn.tree import plot_tree

# Fitted tree at the end of the pipeline, reached through the public indexing
# API instead of the private `_final_estimator` attribute.
fitted_tree = pipe_final[-1]

# Create the figure at its final size and resolution *before* drawing:
# plot_tree scales the node text to the axes it draws on, so resizing the
# figure afterwards leaves the labels sized for the default canvas.
fig, ax = plt.subplots(figsize=(9, 10.5), dpi=220)

# Only the top levels (max_depth=2) are drawn to keep the plot readable.
plot_tree(
    decision_tree=fitted_tree,
    max_depth=2,
    proportion=True,
    # Passed as a plain list: some scikit-learn releases reject a NumPy array here.
    feature_names=list(fitted_tree.feature_names_in_),
    ax=ax,
);
