In [1]:
# Start a stopwatch for the whole notebook; `start` is handed back to
# `stopwatch.stop()` in the last cell to print the execution time.
from tools import Stopwatch
stopwatch = Stopwatch()
start = stopwatch.start()

### Load libraries, functions, palette, theme

In [2]:
%run _libraries.ipynb

In [3]:
%run _functions.ipynb

In [4]:
# directory name handed to `savefig` and `save_session` further down
dir_save = 'Section8-Final-predictions'

# Section VIII. Final predictions

## Load Saved Section if exists

## Load Data

In [5]:
# Artifacts saved by the earlier sections.
# NOTE(review): the output recorded for this cell is
# "ModuleNotFoundError: No module named '_classes'" - presumably the saved
# objects need the project's `_classes` module to be importable; confirm
# before relying on Restart & Run All.
datasets_dict = loadit('datasets_dict',
                       dir='Section3-Feature-selection-and-Preprocessing')
datasets_dict_raw = loadit('datasets_dict',
                           dir='Section1-Overview-and-Base-model')
features_dict = loadit('features_dict',
                       dir='Section4-Linear-models-research')
groups_dict = loadit('groups_dict',
                     dir='Section2-Explore-and-Clean')
transform_dict = loadit('transform_dict',
                        dir='Section3-Feature-selection-and-Preprocessing')
simulation_datasets_dict = loadit('simulation_datasets_dict',
                                  dir='Section7-ML-models')
estimators_dict = loadit('estimators_dict',
                         dir='Section7-ML-models')
evaluation_dict = loadit('evaluation_dict',
                         dir='Section7-ML-models')

# hold-out data and the name of the target column
test = datasets_dict['test']
target = 'price'

# overview / imputation / label transformers
(transformer_overview,
 transformer_imputer_median,
 transformer_imputer_frequent,
 transformer_label) = (
    transform_dict[key_] for key_ in (
        'transformer_overview',
        'transformer_imputer_median',
        'transformer_imputer_frequent',
        'transformer_label'))

# feature-engineering transformers
transformer_features_creator, transformer_features_logger = (
    transform_dict[key_] for key_ in (
        'transformer_features_creator', 'transformer_features_logger'))

# final preprocessing steps
encoder, scaler = (transform_dict[key_] for key_ in ('encoder', 'scaler'))

# additional feature lists
features_na, features_log = (
    transform_dict[key_] for key_ in ('features_na', 'features_log'))

# estimators; note the local names differ from the stored keys
# ('lgb_iter1' -> lgb_clean, 'lgb_iter2' -> lgb_nf, '*_ml' -> stack / svr)
lgb_base, lgb_clean, lgb_nf, enet, stack, svr, hpp = (
    estimators_dict[key_] for key_ in (
        'lgb_base', 'lgb_iter1', 'lgb_iter2', 'enet',
        'stack_ml', 'svr_ml', 'hpp'))

ModuleNotFoundError: No module named '_classes'

## Test Datasets for Sections

In [None]:
test.shape

In [None]:
test.head()

### Log Target Variable

In [None]:
# Move the target to the log scale. The frame is copied first so that the
# object still referenced by `datasets_dict['test']` is not modified in place.
# NOTE: the cell is still not idempotent - re-running it without re-running
# the loading cell takes the logarithm a second time.
test = test.copy()
test['price'] = np.log(test['price'])

In [None]:
# log-scale target, kept as the reference for the test RMSE computed later
y_test = test['price'].copy()

### Overview Transformation and Discrepancy Check

#### Transformation

In [None]:
# apply the overview transformer loaded from `transform_dict`
# NOTE(review): `test` is overwritten, so re-running this cell feeds already
# transformed data back in - confirm the transformer tolerates that.
test = transformer_overview.transform(test)

#### Discrepancy

##### Garage Discrepancy

In [None]:
# garage feature group stored in `groups_dict`
features_garage = groups_dict['features_garage']

In [None]:
# run the garage discrepancy check on the test data; the helper is not
# defined in this notebook (presumably it comes from `_functions.ipynb`)
check_garage_discrepancy(test, features_garage)

##### Basement Discrepancy

In [None]:
# basement feature group stored in `groups_dict`
features_bsmt = groups_dict['features_bsmt']

In [None]:
# run the basement discrepancy check on the test data
check_bsmt_discrepancy(test, features_bsmt)

##### Square Feet Features

In [None]:
# square-feet feature group; copied so the stored list is not shared
features_square = groups_dict['features_square'].copy()

In [None]:
# run the square-feet discrepancy check on the test data
square_feet_check_discrepancy(test, features_square)

##### Year Features

In [None]:
# year feature group stored in `groups_dict`
# (unlike `features_square`, this one is not copied)
features_year = groups_dict['features_year']

In [None]:
# run the year discrepancy check on the test data
year_check_discrepancy(test, features_year)

### Dataset Base (Section I)

#### Transformation Pipeline

In [None]:
# Section I preprocessing chain: numeric imputation -> categorical
# imputation -> label transformer -> encoder. All steps are the objects
# loaded from `transform_dict`; only `.transform` is called on the pipeline.
pipeline_s1 = Pipeline(steps=[
    ('transformer_imputer_num', transformer_imputer_median),
    ('transformer_imputer_cat', transformer_imputer_frequent),
    ('transformer_label', transformer_label),
    ('encoder', encoder)
])

#### Transform Test Dataset

In [None]:
# test data in the Section I representation
test_s1 = pipeline_s1.transform(test)

In [None]:
test_s1.head(3)

### Dataset after Cleaning with Original Features (Section II)

#### Transformation Pipeline

In [None]:
# Section II preprocessing chain.
# NOTE(review): the steps are identical to `pipeline_s1`, so `test_s2` will
# equal `test_s1` - confirm that no Section II specific step is missing.
pipeline_s2 = Pipeline(steps=[
    ('transformer_imputer_num', transformer_imputer_median),
    ('transformer_imputer_cat', transformer_imputer_frequent),
    ('transformer_label', transformer_label),
    ('encoder', encoder)
])

#### Transform Test Dataset

In [None]:
# test data in the Section II representation
test_s2 = pipeline_s2.transform(test)

In [None]:
test_s2.head(3)

### Dataset with added New Features and no log-transform (Section III)

#### Transformation Pipeline

In [None]:
# Section III preprocessing chain: imputation -> label transformer ->
# feature creation -> feature logger -> encoder.
# NOTE(review): the section heading says "no log-transform", yet the chain
# contains a 'transformer_logarithm' step - confirm which one is intended.
pipeline_s3 = Pipeline(steps=[
    ('transformer_numeric', transformer_imputer_median),
    ('transformer_categorical', transformer_imputer_frequent),
    ('transformer_label', transformer_label),
    ('transformer_create', transformer_features_creator),
    ('transformer_logarithm', transformer_features_logger),
    ('encoder', encoder)
])

#### Transform Test Dataset

In [None]:
# test data in the Section III representation
test_s3 = pipeline_s3.transform(test)

In [None]:
# reorder the columns via `order_X_y` with 'price' as the target column
# (presumably features first, target last - confirm against the helper)
test_s3 = order_X_y(test_s3, 'price')

In [None]:
test_s3.head(3)

### Dataset with added New Features and no log-transform (Section IV, VI, VII)

#### Transformation Pipeline

In [None]:
# Preprocessing chain for Sections IV, VI and VII: the Section III steps plus
# a leading overview transformer and a trailing scaler.
# NOTE(review): `test` has already been passed through `transformer_overview`
# earlier in the notebook, so that transformer is applied a second time here -
# confirm it is safe to apply twice.
# NOTE(review): the section heading repeats "no log-transform" although a
# 'transformer_logarithm' step is part of the chain.
pipeline_s467 = Pipeline(steps=[
    ('transformer_overview', transformer_overview),
    ('transformer_numeric', transformer_imputer_median),
    ('transformer_categorical', transformer_imputer_frequent),
    ('transformer_label', transformer_label),
    ('transformer_create', transformer_features_creator),
    ('transformer_logarithm', transformer_features_logger),
    ('encoder', encoder),
    ('scaler', scaler)

])

#### Transform Test Dataset

In [None]:
# test data in the representation shared by Sections IV, VI and VII
test_s467 = pipeline_s467.transform(test)

In [None]:
# reorder the columns via `order_X_y` with 'price' as the target column
test_s467 = order_X_y(test_s467, 'price')

In [None]:
test_s467.head(3)

## Features for Sections

In [None]:
# Feature lists saved together with the estimators. The local names differ
# from the stored keys: 'features_iter1' -> features_orig,
# 'features_iter2' -> features_new, 'features_ml_*' -> features_svr / _stack.
(features_base, features_orig, features_new, features_enet,
 features_svr, features_stack, features_hpp) = (
    simulation_datasets_dict[key_] for key_ in (
        'features_base', 'features_iter1', 'features_iter2', 'features_enet',
        'features_ml_svr', 'features_ml_stack', 'features_hpp'))

In [None]:
# The base and the "orig" estimators are expected to use the same columns;
# this compares the two lists as sets (column order is NOT compared).
set(features_base) == set(features_orig)

In [None]:
# HPP is expected to use the same columns as `features_new`;
# again a set comparison, so column order is not checked.
set(features_new) == set(features_hpp)

In [None]:
# features used by the linear (Elastic Net) estimator
print(features_enet)

In [None]:
# columns fed to the Section I estimator (`lgb_base`)
features_s1 = features_base.copy()

In [None]:
# Columns fed to the Section II estimator (`lgb_clean`).
# Use the list saved for that estimator rather than `features_base`: the two
# were only compared as sets, so their column order may differ, and the
# estimator presumably expects the order it was fitted with.
features_s2 = features_orig.copy()

In [None]:
# columns fed to the Section III estimator (`lgb_nf`)
features_s3 = features_new.copy()

In [None]:
# columns fed to the Section IV estimator (Elastic Net)
features_s4 = features_enet.copy()

In [None]:
# Columns fed to the HPP estimator (the only consumer of this list).
# Use the list saved for HPP rather than `features_new`: the two were only
# compared as sets, so their column order may differ, and the estimator
# presumably expects the order it was fitted with.
features_s67 = features_hpp.copy()

## Check if Estimators are fitted

In [None]:
# estimators passed to `check_fit`; the order must match the names given
# in the `check_fit` call
estimators_check_list = [
    lgb_base, lgb_clean, lgb_nf, svr, stack, hpp
]

In [None]:
# Verify that every estimator used for the predictions below is fitted.
# Elastic Net is predicted with as well, so it is appended to the checked
# estimators (names and estimators are kept in the same order).
check_fit(
    names_list=[
        'LGB Base', 'LGB Orig', 'LGB New', 'SVR', 'Stacking', 'HPP',
        'Elastic Net'],
    estimators_list=[*estimators_check_list, enet]
)

## Predictions on Test Dataset

In [None]:
# Display names, one per estimator. The estimator, dataset and feature
# lists below are zipped with this list, so all four must stay aligned.
# Later cells also rely on the last two names being the estimators whose
# CV results are stored in a different format ('Stacking', 'HPP').
estimators_names = [
    'Base', 'Base (Clean)', 'Base (FE)',
    'Elastic Net', 'SVM', 'Stacking', 'HPP'
]

In [None]:
# estimators, in the same order as `estimators_names`
estimators_list = [
    lgb_base, lgb_clean, lgb_nf,
    enet, svr, stack, hpp
]

In [None]:
# test-set representation each estimator predicts on, in the same order as
# `estimators_names`; the last four share the Section IV/VI/VII data
datasets_list = [
    test_s1, test_s2, test_s3,
    test_s467, test_s467, test_s467, test_s467
]

In [None]:
# columns selected from the matching dataset for each estimator,
# in the same order as `estimators_names`
features_list = [
    features_s1, features_s2, features_s3,
    features_s4, features_svr, features_stack, features_s67
]

In [None]:
# (name, estimator, dataset, features) tuples for the prediction loop.
# Materialised as a list: a bare `zip` object is exhausted after one pass,
# so re-running the prediction cell alone would silently yield empty results.
zip_ = list(zip(
    estimators_names, estimators_list, datasets_list, features_list))

In [None]:
# Predict on the test set with every estimator and score against `y_test`.
#   scores_test_price_dict: name -> predictions (log scale, like `y_test`)
#   scores_test_mean_dict:  name -> RMSE of those predictions
scores_test_price_dict = {}
scores_test_mean_dict = {}
for name_, estimator_, data_, features_ in zip_:
    y_pred_ = estimator_.predict(data_[features_])
    scores_test_price_dict[name_] = y_pred_
    # RMSE as sqrt(MSE): the `squared=False` flag of `mean_squared_error`
    # is deprecated and removed in recent scikit-learn releases
    rmse_ = np.sqrt(mean_squared_error(y_test, y_pred_))
    scores_test_mean_dict[name_] = rmse_

In [None]:
# predictions of all estimators side by side, one column per estimator
scores_test_price = pd.DataFrame(scores_test_price_dict)

In [None]:
# single-row frame (row label 0) with one RMSE column per estimator;
# the next cell turns it into one row per estimator
scores_test_mean = pd.DataFrame(scores_test_mean_dict, index=[0])

In [None]:
# One row per estimator with a single 'rmse_mean_test' column.
# Built directly from the dict instead of transposing the previous frame in
# place, so re-running this cell always yields the same result.
scores_test_mean = pd.Series(
    scores_test_mean_dict, name='rmse_mean_test').to_frame()

## CV results from previous Sections

In [None]:
# Cross-validation results saved by the earlier sections. The local names
# differ from the stored keys ('cv_iter1' -> cv_clean, 'cv_iter2' -> cv_nf,
# 'cv_*_ml' -> cv_svr / cv_stack).
cv_base, cv_clean, cv_nf, cv_enet, cv_svr, cv_stack, cv_hpp = (
    evaluation_dict[key_] for key_ in (
        'cv_base', 'cv_iter1', 'cv_iter2', 'cv_enet',
        'cv_svr_ml', 'cv_stack_ml', 'cv_hpp'))

In [None]:
# CV objects read through `.cv_results_` below; ordered like the first five
# entries of `estimators_names`
cv_list = [
    cv_base, cv_clean, cv_nf,
    cv_enet, cv_svr
]

In [None]:
# CV results read through the keys 'test_score' / 'fit_time' below; ordered
# like the last two entries of `estimators_names`
cv_list2 = [
    cv_stack, cv_hpp
]

In [None]:
estimators_names

In [None]:
# Collect, for every estimator with a `cv_results_` table, the best-ranked
# parameter set:
#   scores_cv_folds:     its per-split test scores (one column per estimator)
#   scores_cv_mean_dict: [mean score, score std, mean fit time, fit time std]
scores_cv_mean_dict = {}
scores_cv_folds = pd.DataFrame(columns=estimators_names[:-2])
summary_columns_ = [
    'mean_test_score', 'std_test_score', 'mean_fit_time', 'std_fit_time']
for name_, cv_ in zip(estimators_names[:-2], cv_list):
    # best-ranked row first
    cv_df_ = pd.DataFrame(cv_.cv_results_).sort_values('rank_test_score')
    # keep only the 'split<k>_test_score' columns of that row
    scores_cv_folds[name_] = \
        cv_df_.filter(regex=r"split\d*_test_score").iloc[0]
    scores_cv_mean_dict[name_] = cv_df_[summary_columns_].iloc[0].tolist()

In [None]:
# summary table: after the transpose there is one row per estimator and one
# column per statistic collected in the loop above
scores_cv_mean = pd.DataFrame(
    data=scores_cv_mean_dict,
    index=[
        'rmse_mean_cv', 'rmse_std',
        'fit_time', 'fit_time_std'
]).T

In [None]:
# Add the two estimators whose CV results are plain 'test_score' /
# 'fit_time' arrays rather than a `cv_results_` table.
for name_, cv_ in zip(estimators_names[-2:], cv_list2):

    # per-fold scores as a new column
    scores_cv_folds[name_] = cv_['test_score']

    # same four statistics as for the other estimators (sample std, ddof=1)
    rmse_mean_ = cv_['test_score'].mean()
    rmse_std_ = cv_['test_score'].std(ddof=1)
    fit_time_ = cv_['fit_time'].mean()
    fit_time_std_ = cv_['fit_time'].std(ddof=1)
    row = [rmse_mean_, rmse_std_, fit_time_, fit_time_std_]
    # Label the row with the estimator name instead of a running integer:
    # the existing rows are already labelled by name, and re-running the
    # cell then overwrites the same row instead of appending a duplicate.
    scores_cv_mean.loc[name_, scores_cv_mean.columns] = row

In [None]:
# NOTE: this cell is not safe to re-run on its own - the sign flips below
# would be applied twice and the relabelling assumes the unsorted row order.
scores_cv_mean.index = estimators_names
# Flip the sign of the HPP mean score so that it follows the same convention
# as the other estimators.
# NOTE(review): the original note read "change minus in HPP results to plus",
# but the plots below use negative y-limits - confirm the intended sign.
scores_cv_mean.loc['HPP', 'rmse_mean_cv'] = \
    -scores_cv_mean.loc['HPP', 'rmse_mean_cv']

scores_cv_mean = scores_cv_mean.sort_values('rmse_mean_cv', ascending=False)

# same sign flip for the per-fold HPP scores
scores_cv_folds['HPP'] = -scores_cv_folds['HPP']
# 'split<k>_test_score' -> '<k>'. The previous pattern was a character class
# that deleted every listed letter individually and only gave the right
# result because the remaining characters happened to be digits.
scores_cv_folds.index = [
    re.sub(r'^split(\d+)_test_score$', r'\1', i)
    for i in scores_cv_folds.index]

In [None]:
# test RMSE joined with the CV summary (rows follow `scores_test_mean`)
scores_mean = scores_test_mean.join(scores_cv_mean)
# store the test RMSE negated; presumably to match the sign of the CV
# scores - the plotting cells negate it back where a positive value is needed
scores_mean['rmse_mean_test'] = -scores_mean['rmse_mean_test']

In [None]:
# bootstrap-based interval data for the per-fold CV scores, one row per
# estimator; `ci_bootstrap` is not defined in this notebook
# NOTE(review): assumes its result has one entry per column of
# `scores_cv_folds`, in `estimators_names` order, and a 'proxi_margin' field
scores_cv_ci = pd.DataFrame(
    data=ci_bootstrap(scores_cv_folds),
    index=estimators_names
)
scores_mean['proxi_margin'] = scores_cv_ci['proxi_margin']

## Visualization

In [None]:
scores_mean

In [None]:
scores_cv_folds.head()

In [None]:
# x positions and labels: one slot per estimator; the test markers are
# shifted slightly to the right of the CV markers
xtickslabels = scores_mean.index.tolist()
xticks = np.arange(len(xtickslabels))
xticks_delta = 0.1
xticks_cv = xticks
xticks_test = xticks + xticks_delta
# x-axis limits: a small margin on both sides of the first/last slot
xlim_delta = 0.35
xlim_min = -xlim_delta
xlim_max = len(xtickslabels) - 1 + xlim_delta
# error-bar half-widths: two standard deviations of the CV scores, and the
# 'proxi_margin' column
pi_error = scores_mean['rmse_std'] * 2
ci_error = scores_mean['proxi_margin']
# colours
plot_palette = [palette[0] for _ in scores_mean.index]
lgd_alpha = 1

In [None]:
# Bar chart of the test RMSE, one bar per estimator.
fig = plt.figure(figsize=(7, 2.5))
sns.barplot(
    x=xticks,
    # 'rmse_mean_test' is stored negated in `scores_mean`; negate it back
    y=-scores_mean['rmse_mean_test'],
    width=0.4,
    errorbar=('ci', 95),
    err_kws={'color': palette[-1]},
    color=palette[0]
)
# NOTE(review): the x-limits are hard-coded for seven bars
plt.xlim(-0.5, 6.5)
plt.ylim(0.089, 0.131)
plt.yticks(np.arange(0.090, 0.1301, 0.01))
plt.ylabel('RMSE')
plt.title('RMSE on Test Dataset')
# dotted reference line at RMSE = 0.100
plt.axhline(
    y=0.100, xmin=0.02, xmax=0.98,
    lw=0.75, linestyle=':', color=palette[-1])

# NOTE(review): `axis_rstyle` receives y_ticks=[0.08, 0.14, 0.02] after the
# limits and ticks were already set above - confirm which setting wins
axis_rstyle(y_ticks=[0.08, 0.14, 0.02], grid=False)
plt.gca().spines['bottom'].set_visible(False)
plt.tick_params(axis='x', bottom=False, pad=5)
# estimator names as x tick labels
plt.xticks(
    ticks=xticks, labels=xtickslabels,
    weight='bold', fontsize=9, color='0.3')

plt.show()

savefig('final_bar', dir_save)

In [None]:
# Two stacked panels sharing the same x slots (one per estimator):
#   ax[0] - mean CV score with two error bars, plus the test score
#   ax[1] - per-fold CV scores as a swarm, plus the test score
fig, ax = plt.subplots(2, 1, figsize=(9, 4), height_ratios=[1, 1])

# ax 0
# markers: CV mean (palette[0]) and test score (palette[1], shifted right)
ax[0].scatter(
    x=xticks_cv, y=scores_mean['rmse_mean_cv'], s=5, ec='face', color=palette[0])
ax[0].scatter(
    x=xticks_test, y=scores_mean['rmse_mean_test'], s=14, ec='none', color=palette[1])
# faint dotted lines connecting the markers of each series
ax[0].plot(
    xticks_cv, scores_mean['rmse_mean_cv'], lw=0.5, ls=':', color=palette[0], alpha=0.45)
ax[0].plot(
    xticks_test, scores_mean['rmse_mean_test'], lw=0.5, ls=':', color=palette[1], alpha=0.45)
# thin capped bars: `pi_error` (labelled "Prediction intervals" in the legend)
ax[0].errorbar(
    x=xticks_cv, y=scores_mean['rmse_mean_cv'], yerr=pi_error,
    linestyle='none', capsize=1.5, lw=0.5, color=palette[0], alpha=1)
# thick uncapped bars: `ci_error` (labelled "Confidence intervals")
ax[0].errorbar(
        x=xticks_cv, y=scores_mean['rmse_mean_cv'], yerr=ci_error,
        linestyle='none', capsize=0, lw=2, color=palette[0])

# project helper for axis styling; the y range is negative because the
# plotted scores are negative
axis_rstyle(
    y_ticks=[-0.200, -0.050, 0.05],
    y_lim=[-0.20, -0.05],
    ax=ax[0])
ax[0].spines['bottom'].set_visible(False)

# labels (no x label, no title)
ax[0].set_xlabel(None)
ax[0].set_ylabel('RMSE')
# axes limits
ax[0].set_xlim(xlim_min, xlim_max)

# ticks (estimator names)
ax[0].set_xticks(
    ticks=xticks, labels=xtickslabels, weight='bold',
    fontsize=9)
# estimator names: coloured, no tick marks, pulled towards the axis
ax[0].tick_params(axis='x', bottom=False, labelcolor=palette[0], pad=-3)
# grid
ax[0].grid(False)

# legend handles (proxy artists, nothing is drawn by them)
handle_cv = Line2D(
    [], [], label='Train RMSE (20-fold CV)', marker='o',
    markersize=3, color=palette[0], linestyle='None', alpha=lgd_alpha)
handle_test = Line2D(
    [], [], label='Test RMSE', marker='o',
    markersize=3, color=palette[1], linestyle='None', alpha=lgd_alpha)
handle_pi = Line2D(
    [], [], label='Prediction intervals (95%)', lw=1,
    color=palette[0], alpha=0.75)
handle_ci = Line2D(
    [], [], label='Confidence intervals (95%)', lw=1.75,
    color=palette[0], alpha=0.95)
# legend: single row placed above the upper panel
ax[0].legend(
    handles=[handle_cv, handle_test, handle_ci, handle_pi], ncols=4,
    fontsize=8, alignment='left', markerscale=1,
    handletextpad=0.75, handlelength=0.75,
    bbox_to_anchor=(-0.025, 1.1), loc='lower left', frameon=False)

# ax 1
# test score markers drawn above the swarm (zorder=5 vs zorder=1)
ax[1].scatter(
    x=xticks_test, y=scores_mean['rmse_mean_test'], 
    s=14, ec='none', lw=0.5, color=palette[1], zorder=5)
# per-fold CV scores, one swarm per column of `scores_cv_folds`
# NOTE(review): the palette is repeated a hard-coded 7 times
sns.swarmplot(
    scores_cv_folds, size=3, palette=palette[:1]*7,
    zorder=1, alpha=0.75, ax=ax[1])

axis_rstyle(
    y_ticks=[-0.200, -0.050, 0.050],
    y_lim=[-0.20, -0.05],
    ax=ax[1])
ax[1].spines['bottom'].set_visible(False)

# hide tick marks and tick labels on both the top and the bottom side
ax[1].tick_params(
    top=False, labeltop=False, bottom=False,
    labelbottom=False, pad=5)
# ylabel
ax[1].set_ylabel('RMSE')
# axes limits (same as the upper panel)
ax[1].set_xlim(xlim_min, xlim_max)

# grid
ax[1].grid(False)

# subplots adjust
plt.subplots_adjust(hspace=0.2)
plt.show()

savefig('final_plot', dir_save, dpi=100, transparent=True)

In [None]:
# Fold-by-fold comparison of the Stacking and HPP CV scores using the
# project helper `plot_estimators_comparing` (not defined in this notebook).
# The second y label ('Delta') and `ax1_y_ticks` suggest a second panel with
# the difference between the two - TODO confirm against the helper.
fig = plot_estimators_comparing(
    data=scores_cv_folds[['Stacking', 'HPP']],
    labels=['Stacking', 'HPP'],
    ylabels=['RMSE', 'Delta'],
    kind='bar',
    width=0.15,
    palette=[alpha_color(palette[0], 0.85), palette[2]],
    ax0_y_ticks=[-0.20, -0.05, 0.05],
    ax1_y_ticks=[-0.010, 0.010, 0.005],
    grid=True
)
savefig('stack_hpp_compare', dir_save)

## Comparing Predicted and Actual values

In [None]:
# Section I test frame joined (on the index) with the HPP predictions.
# NOTE(review): `.to_frame()` requires the stored HPP predictions to be a
# pandas Series whose index matches `test_s1` - a plain NumPy array would
# fail here; confirm what `hpp.predict` returns.
data_pred_actual = \
    test_s1.join(scores_test_price_dict['HPP'].to_frame())

In [None]:
# 'price' holds the log-scale target; the column labelled 0 is presumably
# the unnamed prediction Series joined in the previous cell
data_pred_actual = \
    data_pred_actual.rename(columns={'price': 'actual_log', 0: 'predicted_log'})

In [None]:
# back from the log scale: exp() of both log columns, stored as new columns
data_pred_actual[['actual', 'predicted']] = np.exp(
    data_pred_actual[['actual_log', 'predicted_log']])

In [None]:
# round the predicted value to a whole number (done before the residuals
# are computed, so they are based on the rounded value)
data_pred_actual['predicted'] = data_pred_actual['predicted'].round()

In [None]:
# residual = actual - predicted, on the original scale ('residual') and on
# the log scale ('residual_log')
for suffix_ in ('', '_log'):
    data_pred_actual['residual' + suffix_] = (
        data_pred_actual['actual' + suffix_]
        - data_pred_actual['predicted' + suffix_])

In [None]:
# residual expressed as a fraction of the actual value
data_pred_actual['residual_frac'] = \
    data_pred_actual['residual'].div(data_pred_actual['actual'])

In [None]:
data_pred_actual.shape

In [None]:
data_pred_actual.head()

In [None]:
# Predicted vs actual values, rows ordered by 'grlivarea', drawn with the
# project helper `plot_estimators_comparing` (not defined in this notebook).
# `ax1_y_ticks` suggests a second panel with the difference between the two
# series - TODO confirm against the helper.
fig = plot_estimators_comparing(
    data=data_pred_actual.sort_values('grlivarea')[['predicted', 'actual']],
    labels=['Predicted', 'Actual'],
    kind='line',
    figsize=(10, 4),
    markersize=1,
    linewidth=0.25,
    palette=[alpha_color(palette[0], 0.85), palette[3]],
    ax0_y_ticks=[0, 800000, 200000],
    ax1_y_ticks=[-100000, 150000, 50000],
    x_ticks_weight='medium',
    ticks_step=5
)

savefig('predicted_actual_residuals', dir_save, dpi=100, format='png')

#### Residuals

In [None]:
# histogram of the residuals (original scale), 50 bins
fig, ax = plt.subplots(figsize=(8, 2.5))
ax.hist(x=data_pred_actual['residual'], bins=50)
ax.grid(False)
# no `ax` is passed, so the helper presumably styles the current axes
axis_rstyle(x_ticks=(-100000, 150000, 50000), y_ticks=(0, 30, 5))
plt.show()

savefig('hpp_residuals', dir_save)

In [None]:
# normality check of the residuals via the project helper `test_normality`
test_normality(data_pred_actual['residual'])

In [None]:
# boolean masks: residual strictly inside (-50000, 50000); used together
# as `cond1 & cond2` in the following cells
cond1 = (data_pred_actual['residual'] > -50000)
cond2 = (data_pred_actual['residual'] < 50000)

In [None]:
# rows that fall outside the (-50000, 50000) residual band, i.e. the ones
# removed from the trimmed data, shown with the key columns only
columns_for_cutted = [
    'id',
    'actual_log', 'predicted_log',
    'actual', 'predicted',
    'residual',
]
data_pred_actual.loc[~cond1 | ~cond2, columns_for_cutted]

In [None]:
# rows whose residual lies strictly between -50000 and 50000; the residual
# column is renamed so the two variants can be told apart
data_pred_actual_cutted = (
    data_pred_actual
    .loc[(cond1 & cond2), :]
    .copy()
    .rename(columns={'residual': 'residual cutted'})
)

In [None]:
# same normality check on the trimmed residuals
test_normality(data_pred_actual_cutted['residual cutted'])

In [None]:
# histogram of the trimmed residuals, 50 bins (this figure is not saved)
f = plt.figure(figsize=(8, 2.5))
plt.hist(x=data_pred_actual_cutted['residual cutted'], bins=50)
axis_rstyle(x_ticks=(-50000, 50000, 10000), y_ticks=(0, 12, 2))
plt.grid(False)
plt.show()

In [None]:
# Reference sample: zero-mean normal with the sample standard deviation of
# the trimmed residuals. A seeded generator keeps the sample - and the KDE
# curve drawn from it - identical between runs.
# NOTE(review): size=140 is kept as is; it presumably mirrors the number of
# trimmed rows - confirm, or derive it from the frame.
normal_dist = np.random.default_rng(42).normal(
    loc=0,
    scale=data_pred_actual_cutted['residual cutted'].std(ddof=1),
    size=140)

In [None]:
# Density curves of the reference normal sample, the full residuals and the
# trimmed residuals on one set of axes.
fig = plt.figure(figsize=(10, 2.5))

# filled area: reference normal sample
sns.kdeplot(
    normal_dist, lw=0.01, fill=True, color=alpha_color(palette[0], 0.75),
    label='Normal distribution')
# line: all residuals
sns.kdeplot(
    data_pred_actual['residual'], color=palette[1],
    label='Residuals (original)')
# line: residuals inside the (-50000, 50000) band
sns.kdeplot(
    data_pred_actual_cutted['residual cutted'], color=palette[2],
    label='Residuals (trimmed)')

# legend in one row above the axes, title above the legend
plt.legend(loc='lower left', bbox_to_anchor=(0, 1.05), frameon=False, ncols=3)
plt.title(
    'Comparison of Residual distributions with and without outliers',
    loc='left', pad=32)
axis_rstyle(
    x_ticks=(-150000, 200000, 50000), y_ticks=(0, 3.5*10**-5, 0.5*10**-5))
plt.grid(False)
plt.xlabel(None)
plt.show()

savefig('residuals_distributions_compare', dir_save)

In [None]:
# Predicted vs actual values with the y = x reference line.
fig = plt.figure(figsize=(8, 2.5))

# Columns are resolved by name from `data`. Passing the columns as Series
# next to a sorted copy of the frame made seaborn use the Series and ignore
# the sorted frame, so the sort had no effect; point positions do not depend
# on row order anyway.
sns.scatterplot(data=data_pred_actual, x='actual', y='predicted')

# dashed diagonal: perfect prediction
plt.axline([0, 0], [1, 1], lw=1, ls='--', color=palette[1])
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.xlim(left=0)
plt.ylim(bottom=0, top=750000)
plt.grid(False)
axis_rstyle(
    x_ticks=[0, 800000, 100000],
    y_ticks=[0, 700000, 100000])
plt.show()

savefig('actual_predicted', dir_save)

In [None]:
# Residuals against the actual value with a zero reference line
# (this figure is not saved).
f = plt.figure(figsize=(8, 2.5))

# Columns are resolved by name from `data`; the sorted copy passed before
# was ignored by seaborn because x/y were given as Series.
sns.scatterplot(data=data_pred_actual, x='actual', y='residual')
# dashed horizontal line at residual = 0
plt.axline([0, 0], [1, 0], lw=1, ls='--', color=palette[1])
plt.xlim(left=0)
plt.ylim(bottom=-180000, top=180000)
plt.grid(False)
plt.show()

In [None]:
# Residuals against 'grlivarea' with a vertical marker at 1875.
fig = plt.figure(figsize=(8, 2.5))
# Columns are resolved by name from `data`; the sorted copy passed before
# was ignored by seaborn because x/y were given as Series.
sns.scatterplot(
    data=data_pred_actual, x='grlivarea', y='residual', zorder=10)
# dashed vertical line drawn below the points (zorder 9 vs 10)
plt.axvline(1875, 0.1, 0.9, lw=1, ls='--', color=palette[1], zorder=9)
plt.xlabel('Grlivarea')
plt.ylabel('Residuals')
plt.grid(False)
axis_rstyle(
    x_ticks=[0, 4500, 500],
    y_ticks=[-150000, 150000, 50000])
plt.show()

savefig('grlivarea_residuals', dir_save)

In [None]:
# log-scale RMSE for rows with 'grlivarea' below 1800.
# Computed as sqrt(MSE): the `squared=False` flag of `mean_squared_error` is
# deprecated and removed in recent scikit-learn releases.
# NOTE(review): the plots mark 1875 and the LOWESS fits split at 1900 -
# confirm that 1800 is the intended threshold here.
mask_small_ = data_pred_actual['grlivarea'] < 1800
np.sqrt(mean_squared_error(
    data_pred_actual.loc[mask_small_, 'actual_log'],
    data_pred_actual.loc[mask_small_, 'predicted_log']
))

In [None]:
# log-scale RMSE for rows with 'grlivarea' above 1800.
# Computed as sqrt(MSE): the `squared=False` flag of `mean_squared_error` is
# deprecated and removed in recent scikit-learn releases.
# NOTE(review): rows with 'grlivarea' exactly 1800 belong to neither this
# group nor the "< 1800" one - confirm that is intended.
mask_large_ = data_pred_actual['grlivarea'] > 1800
np.sqrt(mean_squared_error(
    data_pred_actual.loc[mask_large_, 'actual_log'],
    data_pred_actual.loc[mask_large_, 'predicted_log']
))

In [None]:
# histogram of the fractional residuals, 100 bins (this figure is not saved)
f = plt.figure(figsize=(11, 2.5))
data_pred_actual['residual_frac'].hist(bins=100)

In [None]:
# normality check of the fractional residuals
test_normality(data_pred_actual['residual_frac'])

In [None]:
# copy of the comparison frame ordered by 'grlivarea', used by the two
# scatter plots of absolute residuals below
data_resid_frac = data_pred_actual.sort_values('grlivarea').copy()

In [None]:
data_resid_frac.head()

In [None]:
# Absolute residuals (original scale) against 'grlivarea'.
fig, ax = plt.subplots(figsize=(8, 2.5))

# `scatter` is a dict of keyword arguments defined outside this notebook
plt.scatter(
    x=data_resid_frac['grlivarea'],
    y=data_resid_frac['residual'].abs(),
    **scatter
)

plt.xlabel('Grlivarea')
plt.ylabel('Residuals (abs)')
# dashed vertical marker at grlivarea = 1875
plt.axvline(1875, 0.05, 0.9, lw=1, ls='--', color=palette[1], zorder=9)
plt.ylim(top=145000)
plt.grid(False)
axis_rstyle(
    x_ticks=[0, 4500, 500],
    y_ticks=[0, 140000, 20000])
plt.show()

savefig('residuals_grlivarea_scatter', dir_save)

In [None]:
# Absolute fractional residuals against 'grlivarea'.
fig, ax = plt.subplots(figsize=(8, 2.5))

# `scatter` is a dict of keyword arguments defined outside this notebook
plt.scatter(
    x=data_resid_frac['grlivarea'],
    y=data_resid_frac['residual_frac'].abs(),
    **scatter
)

plt.xlabel('Grlivarea')
plt.ylabel('Residuals_frac (abs)')
# dashed vertical marker at grlivarea = 1875
plt.axvline(1875, 0.05, 0.9, lw=1, ls='--', color=palette[1], zorder=9)
plt.ylim(bottom=-0.025, top=0.45)
plt.grid(False)
axis_rstyle(
    x_ticks=[0, 4500, 500],
    y_ticks=[0.0, 0.5, 0.1])
plt.show()

# NOTE(review): the file name ends in '_sorted' although the difference from
# the previous figure is the fractional residual, not the ordering
savefig('residuals_grlivarea_scatter_sorted', dir_save)

In [None]:
# Fractional residuals against the actual value with a zero reference line.
fig = plt.figure(figsize=(8, 2.5))

# Columns are resolved by name from `data`; the sorted copy passed before
# was ignored by seaborn because x/y were given as Series.
sns.scatterplot(data=data_pred_actual, x='actual', y='residual_frac')
# dashed horizontal line at residual_frac = 0
plt.axline([0, 0], [1, 0], lw=1, ls='--', color=palette[1])
plt.xlabel('Actual')
plt.ylabel('Residuals frac')
plt.ylim(-0.5, 0.5)
plt.grid(False)
axis_rstyle(
    x_ticks=[0, 800000, 100000],
    y_ticks=[-0.5, 0.5, 0.25])
plt.show()

savefig('actual_residual_frac', dir_save)

In [None]:
# Fractional residuals against 'grlivarea', with a separate LOWESS trend
# for the rows below and above 1900.
fig = plt.figure(figsize=(8, 2.5))

# Columns are resolved by name from `data`; the sorted copy passed before
# was ignored by seaborn because x/y were given as Series.
sns.scatterplot(
    data=data_pred_actual, x='grlivarea', y='residual_frac', zorder=10)

# Row subsets for the two trend lines, computed once each.
# NOTE(review): rows with 'grlivarea' exactly 1900 are in neither subset, and
# the split (1900) differs from the vertical marker below (1875).
below_1900_ = data_pred_actual[data_pred_actual['grlivarea'] < 1900]
above_1900_ = data_pred_actual[data_pred_actual['grlivarea'] > 1900]
# NOTE(review): the labels are set but no legend is drawn in this cell
sns.regplot(
    x=below_1900_['grlivarea'],
    y=below_1900_['residual_frac'],
    lowess=True,
    scatter=False,
    line_kws={'lw': 1.2, 'color': palette[2], 'alpha': 0.5},
    label='grlivarea < 1900'
)
sns.regplot(
    x=above_1900_['grlivarea'],
    y=above_1900_['residual_frac'],
    lowess=True,
    scatter=False,
    line_kws={'lw': 1.2, 'color': palette[2], 'alpha': 0.5},
    label='grlivarea > 1900'
)
# dashed vertical marker at grlivarea = 1875
plt.axvline(1875, 0.1, 0.9, lw=1, ls='--', color=palette[1])
plt.xlabel('Grlivarea')
plt.ylabel('Residuals (frac)')
plt.ylim(-0.5, 0.5)
plt.grid(False)
axis_rstyle(
    x_ticks=[0, 4500, 500],
    y_ticks=[-0.5, 0.5, 0.25]
)
plt.show()

savefig('grlivarea_residuals_frac', dir_save)

### Save Session

In [None]:
# persist the session under `dir_save` via the project helper
# (what exactly is stored is decided by `save_session`, not shown here)
save_session(dir_save)

### Execution time

In [None]:
# stop the stopwatch started in the first cell and report the elapsed time
print(f'Execution time: {stopwatch.stop(start)}')