In [None]:
# imports
import os

import pandas as pd
import numpy as np
from pdpbox import pdp, info_plots
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import category_encoders as ce

In [None]:
# load in the dataset
# The CSV location can be overridden with the IOWA_TRAIN_CSV environment variable so the
# notebook can run on machines other than the one it was written on; the original
# absolute path is kept as the fallback so existing setups behave exactly as before.
DATA_PATH = os.environ.get(
    "IOWA_TRAIN_CSV",
    r"C:\Users\Jonat\dat-07-28\DAT-07-28\ClassMaterial\Unit3\data\iowa_train2.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# fill in missing values: 0 for numeric columns, the string 'None' for text columns
# boolean mask over columns: True where the column has at least one null
missing_cols_query   = df.isnull().sum() > 0
missing_cols_num     = df.loc[:, missing_cols_query].select_dtypes(include=np.number).columns.tolist()
# the dtype name 'object' replaces the np.object alias, which NumPy deprecated in 1.20
# and removed in 1.24 (where it raises AttributeError); the selection is the same
missing_cols_cat     = df.loc[:, missing_cols_query].select_dtypes(include='object').columns.tolist()
df[missing_cols_num] = df[missing_cols_num].fillna(0)
df[missing_cols_cat] = df[missing_cols_cat].fillna('None')

In [None]:
# set up the encoder, the model, and the feature matrix
SEED = 1985  # one seed for both the model and the split so a re-run reproduces the same numbers

ohe  = ce.OneHotEncoder(use_cat_names=True)
# seeding the booster makes repeated fits on the same data identical
gbm  = GradientBoostingRegressor(random_state=SEED)

# NOTE(review): the encoder is fitted on every row before the split below, so rows that end
# up in the test set influence which dummy columns exist -- confirm that is acceptable here
X    = ohe.fit_transform(df.drop('SalePrice', axis=1))
y    = df['SalePrice']

# create training & test sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [None]:
# train the gradient-boosting regressor on the training split only
gbm.fit(X=X_train, y=y_train)

In [None]:
# what was the impact of overall quality on the housing price?
# actual_plot returns the figure, its axes, and a summary table (kept as summary_df)
fig, axes, summary_df = info_plots.actual_plot(
    model=gbm,
    X=X_train,
    feature='OverallQual',
    feature_name='OverallQual',
    predict_kwds={},
)

In [None]:
# summary_df is the table returned by the actual_plot call above;
# the chart is being created from this variable
summary_df

In [None]:
# what was the additive impact of EACH unique value of OverallQual?  What was its marginal impact?
# use exactly the OverallQual values present in the training split as the grid points
overall_qual_levels = X_train['OverallQual'].unique().tolist()
pdp_overall_qual = pdp.pdp_isolate(
    model=gbm,
    dataset=X_train,
    model_features=X.columns.tolist(),
    feature='OverallQual',
    cust_grid_points=overall_qual_levels,
)
fig, axes = pdp.pdp_plot(
    pdp_overall_qual, 'Overall Quality', plot_lines=True, frac_to_plot=100
)

In [None]:
# where do these values come from? -- average values if you changed OverallQual to each particular value
# (the following cells rebuild one of these numbers by hand, for OverallQual = 10)
pdp_overall_qual.pdp

In [None]:
# a small example
# build a copy of the training features in which OverallQual is 10 on EVERY row;
# assign() returns a new frame, so X_train itself is left untouched
X_train_copy = X_train.assign(OverallQual=10)
# and now look at what we have
X_train_copy['OverallQual']

In [None]:
# score the modified copy with the fitted model
preds = gbm.predict(X_train_copy)
# and note the average prediction across all rows
np.mean(preds)

In [None]:
# and what about the lines? (pdp_plot above was called with plot_lines=True)
# NOTE(review): presumably one line per training row and one column per grid point -- verify
pdp_overall_qual.ice_lines

### And What About Neighborhoods?

In [None]:
# our column labels
# read from the encoded frame rather than from the encoder: X is the encoder's own output,
# so its columns are the same names, and this avoids the get_feature_names() accessor,
# which newer category_encoders releases deprecate
X.columns.tolist()

In [None]:
# first, let's grab neighborhood columns
# taken from the encoded frame's columns so this does not depend on the encoder's
# get_feature_names() accessor, which newer category_encoders releases deprecate
neighborhood_cols = [col for col in X.columns if 'Neighborhood' in col]
neighborhood_cols

In [None]:
pdp_neighborhood.display_columns

In [None]:
# and what about categorical variables?
# what was the impact of neighborhood on the housing price? 
pdp_neighborhood = pdp.pdp_isolate(
    model=gbm, dataset=X_train, model_features=X.columns.tolist(), 
    feature=neighborhood_cols
)
fig, axes = pdp.pdp_plot(pdp_neighborhood, 'Neighborhood', plot_lines=True, frac_to_plot=100)
# this code is just for formatting -- getting the labels to format correctly
xtick_labels = [label.split('_')[-1] for label in neighborhood_cols]
axes['pdp_ax'].set_xticklabels(xtick_labels, rotation='vertical');

In [None]:
# and what about interactions?
# OverallQual crossed with the group of one-hot neighborhood columns
fig, axes, summary_df = info_plots.actual_plot_interact(
    model=gbm,
    X=X_train,
    features=['OverallQual', neighborhood_cols],
    feature_names=['Housing Quality', 'Neighborhood'],
)

# reuse the short neighborhood names built for the PDP chart as the y tick labels
axes['value_ax'].set_yticklabels(xtick_labels);

In [None]:
# and a slightly different look at the same problem:
# the interaction between OverallQual and GrLivArea, drawn as a grid
interaction_features = ['OverallQual', 'GrLivArea']
gbm_inter = pdp.pdp_interact(
    model=gbm,
    dataset=X_train,
    model_features=X_train.columns,
    features=interaction_features,
)

fig, axes = pdp.pdp_interact_plot(
    gbm_inter,
    ['Overall Quality', 'LivingArea'],
    x_quantile=True,
    plot_type='grid',
    plot_pdp=True,
)