In [None]:
# run stopwatch
from tools import Stopwatch
stopwatch = Stopwatch()
start = stopwatch.start()

### Load libraries, functions, palette, theme

In [None]:
%run _libraries.ipynb

In [None]:
%run _functions.ipynb

In [None]:
dir_save = 'Section3-Feature-selection-and-Preprocessing'

# Section III. Feature selection and Preprocessing

## Load Saved Section if exists

## Load Data

In [None]:
# Restore the dictionaries saved by earlier sections.
# Note the source directories differ: features_dict and transform_dict are
# loaded from Section 1, all the other dictionaries from Section 2.
datasets_dict = loadit(
    'datasets_dict', dir='Section2-Explore-and-Clean')
features_dict = loadit(
    'features_dict', dir='Section1-Overview-and-Base-model')
transform_dict = loadit(
    'transform_dict', dir='Section1-Overview-and-Base-model')
groups_dict = loadit(
    'groups_dict', dir='Section2-Explore-and-Clean')
simulation_datasets_dict = loadit(
    'simulation_datasets_dict', dir='Section2-Explore-and-Clean')
estimators_dict = loadit(
    'estimators_dict', dir='Section2-Explore-and-Clean')
evaluation_dict = loadit(
    'evaluation_dict', dir='Section2-Explore-and-Clean')
# datasets
# `train` is modified step by step in this notebook; `train_raw` is a second,
# independent copy of the same frame that is later passed through the
# transformation pipeline to check it reproduces the manual steps.
train = datasets_dict['train'].copy()
train_raw = datasets_dict['train'].copy()
# feature-name collections and the target name, unpacked from features_dict
features = features_dict['features']
numeric = features_dict['numeric']
ordinal = features_dict['ordinal']
categorical = features_dict['categorical']
categorical_transform = features_dict['categorical_transform']
target = features_dict['target']

## Feature Selection

### Feature Engineering

Explore the feature groups and look for ways to create new features.

#### Square Feet Features

**New Features**:
1. Floors square mean: weighted mean of 'first_flrsf' and 'second_flrsf' (second-floor weight: 0.7).  
   *Hypothesis*: buyers care more about first-floor area than about second-floor area, so the second floor is down-weighted (coefficient: 0.7). A weighted mean of the floor areas may therefore work better than the total living area.

In [None]:
features_square = groups_dict['features_square']

In [None]:
train_square = create_df_group(features_square)

In [None]:
train_square.head()

In [None]:
train_square.shape

In [None]:
# weighted mean of the floor areas: first floor has weight 1, second floor 0.7
train_square['flrsfmean'] = (
    train_square['first_flrsf'] + 0.7*train_square['second_flrsf']
) / 1.7

In [None]:
correlation_w_target(train_square, target)

In [None]:
# Two side-by-side regression plots against price: total living area (left)
# and the engineered weighted floor-area mean (right).
fig, ax = plt.subplots(1, 2, figsize=(11,3))
fig.suptitle(
    'Live area and Floors mean square vs Price', x=0.2535, fontsize=10
)

# (column to plot, x-axis label, y-axis label); only the left panel is
# given a y-axis label
panels = [
    ('grlivarea', 'Grlivarea', 'Price'),
    ('flrsfmean', 'Flrsfmean', None),
]
for axis, (column, x_label, y_label) in zip(ax, panels):
    sns.regplot(
        x=train_square[column], y=train_square['price'],
        scatter_kws={'s': 3}, ax=axis
    )
    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)
    # same tick specification on both panels
    axis_rstyle(
        ax=axis,
        x_ticks=[0, 5000, 1000],
        y_ticks=[10.5, 14.0, 0.5]
    )

plt.show()

savefig('flrsfmean', dir_save)

In [None]:
# One scatter panel per neighborhood: engineered floor-area feature vs target.
# The engineered column lives in train_square, so it is joined column-wise
# to train for plotting.
g_data = pd.concat([train, train_square['flrsfmean']], axis=1)
# NOTE: despite its name, `fig` is a seaborn FacetGrid here; the underlying
# matplotlib figure is reached through `fig.fig`.
fig = sns.FacetGrid(g_data, col='neighborhood')
fig.fig.set_tight_layout(True)
# `scatter` is a dict of keyword arguments defined outside this section
fig.map(sns.scatterplot, 'flrsfmean', target, **scatter);
plt.show()

savefig('neighborhood', dir_save)

#### Rooms Features

**New features**:
1. Bedroom relative size: bedrooms number / living area
2. Kitchen relative size: kitchens number / living area
3. Bedrooms as fraction of rooms number: bedrooms number / rooms number
4. Kitchen as fraction of rooms number: kitchen number / rooms number
5. Bathrooms as fraction of bedrooms number: full bathrooms number / bedrooms number

In [None]:
features_rooms = groups_dict['features_rooms']

In [None]:
train_rooms = create_df_group(features_rooms)

In [None]:
train_rooms.head()

In [None]:
train_rooms['bedroomsize'] = \
    train_rooms['bedroomabvgr'] / train['grlivarea']

In [None]:
train_rooms['kitchensize'] = \
    train_rooms['kitchenabvgr'] / train['grlivarea']

In [None]:
train_rooms['bedroomfracrms'] = \
    train_rooms['bedroomabvgr'] / train_rooms['totrmsabvgrd']

In [None]:
train_rooms['kitchenfracrms'] = \
    train_rooms['kitchenabvgr'] / train_rooms['totrmsabvgrd']

In [None]:
train_rooms['bathsfracbedr'] = \
    train_rooms['fullbath'] / train_rooms['bedroomabvgr']

In [None]:
# keep four decimals for the three fraction features
fraction_columns = ['bedroomfracrms', 'kitchenfracrms', 'bathsfracbedr']
train_rooms[fraction_columns] = train_rooms[fraction_columns].round(4)

In [None]:
correlation_w_target(train_rooms, target)

In [None]:
# Two side-by-side scatter plots of the relative-size features against the
# target.
fig = plt.figure(figsize=(11,3))
fig.suptitle('Bedroomsize and Kitchensize vs Price', x=0.241, fontsize=10)

for position, column in enumerate(['bedroomsize', 'kitchensize'], start=1):
    plt.subplot(1, 2, position)
    sns.scatterplot(x=train_rooms[column], y=train[target], **scatter)
    plt.xlabel(column.capitalize())
    # only the left panel is given a y-axis label
    plt.ylabel('Price' if position == 1 else None)
    axis_rstyle(
        x_ticks=[0, 0.005, 0.0005],
        y_ticks=[10.5, 14.0, 0.5]
    )

plt.show()

savefig('bedroom_kitchen', dir_save)

In [None]:
sns.scatterplot(x=train_rooms['bedroomfracrms'], y=train[target]);

In [None]:
sns.pointplot(
    x=train_rooms['bedroomfracrms'], y=train[target], **point);

In [None]:
sns.scatterplot(x=train_rooms['kitchenfracrms'], y=train[target]);

In [None]:
sns.pointplot(
    x=train_rooms['kitchenfracrms'], y=train[target], **point);

In [None]:
sns.pointplot(
    x=train_rooms['bathsfracbedr'], y=train[target], **point);

#### Year Features

**New features**:
1. House age at the moment of sale: year sold - year built
2. Modernization age: year sold - modernization year

In [None]:
features_year = groups_dict['features_year']
train_year = create_df_group(features_year)

In [None]:
train_year.head()

In [None]:
year_sold = train_year['yrsold']
year_built = train_year['yearbuilt']
year_remodeled = train_year['yearremodadd']

# age of the house at the moment of sale
train_year['houseage'] = year_sold - year_built
# 1 if the recorded remodel year differs from the build year, else 0
train_year['modadd'] = year_remodeled.ne(year_built).astype(int)
# age of the last modernization at the moment of sale:
#   remodeled     -> year sold - remodel year
#   not remodeled -> year sold - build year
train_year['modage'] = np.where(
    train_year['modadd'].eq(1),
    year_sold - year_remodeled,
    year_sold - year_built,
)

In [None]:
correlation_w_target(train_year, target)

#### Create New Features in Train Dataset

In [None]:
# weighted mean of the floor areas: first floor has weight 1, second floor 0.7
train['flrsfmean'] = (
    train['first_flrsf'] + 0.7*train['second_flrsf']
) / 1.7

In [None]:
train['bedroomsize'] = train['bedroomabvgr'] / train['grlivarea']

In [None]:
train['kitchensize'] = train['kitchenabvgr'] / train['grlivarea']

In [None]:
# bedrooms as a fraction of all above-ground rooms
bedroom_share = train['bedroomabvgr'] / train['totrmsabvgrd']

# a zero denominator yields inf: replace those rows with the largest
# finite value of the ratio
is_infinite = np.isinf(bedroom_share)
finite_max = bedroom_share[~is_infinite].max()
train['bedroomfracrms'] = bedroom_share.mask(is_infinite, finite_max)

In [None]:
# kitchens as a fraction of all above-ground rooms
kitchen_share = train['kitchenabvgr'] / train['totrmsabvgrd']

# a zero denominator yields inf: replace those rows with the largest
# finite value of the ratio; undefined ratios (NaN) become 0
is_infinite = np.isinf(kitchen_share)
finite_max = kitchen_share[~is_infinite].max()
train['kitchenfracrms'] = (
    kitchen_share.mask(is_infinite, finite_max).fillna(0)
)

In [None]:
# full bathrooms per bedroom
baths_per_bedroom = train['fullbath'] / train['bedroomabvgr']

# a zero denominator yields inf: replace those rows with the largest
# finite value of the ratio; undefined ratios (NaN) become 0
is_infinite = np.isinf(baths_per_bedroom)
finite_max = baths_per_bedroom[~is_infinite].max()
train['bathsfracbedr'] = (
    baths_per_bedroom.mask(is_infinite, finite_max).fillna(0)
)

In [None]:
# keep four decimals for the three fraction features
fraction_columns = ['bedroomfracrms', 'kitchenfracrms', 'bathsfracbedr']
train[fraction_columns] = train[fraction_columns].round(4)

In [None]:
# if 'yearremodadd' != 'yearbuilt' -> 1, else -> 0
train['yearremodadd_exst'] = \
    (train['yearremodadd']!=train['yearbuilt']).astype(int)

In [None]:
train['yearremodadd_exst'].unique()

In [None]:
# columns that get a binary "value is non-zero" companion flag
features_to_factor = [
    'masvnrarea', 'bsmtfinsf_first', 'bsmtfinsf_second',
    'totalbsmtsf', 'bsmtunfsf', 'lowqualfinsf', 'second_flrsf', 'garagearea',
    'wooddecksf', 'openporchsf', 'enclosedporch', 'three_ssnporch',
    'screenporch', 'poolarea', 'miscval'
]

# the list of flag columns starts with the remodel flag; one '<name>_exst'
# flag per column above is created in train and appended here
features_as_factor = ['yearremodadd_exst']
for source_column in features_to_factor:
    flag_column = f'{source_column}_exst'
    train[flag_column] = train[source_column].ne(0).astype(int)
    features_as_factor.append(flag_column)

In [None]:
print(features_as_factor)

In [None]:
# age of the last modernization at the moment of sale:
#   remodel flag set     -> year sold - remodel year
#   remodel flag not set -> year sold - build year
was_remodeled = train['yearremodadd_exst'] == 1
train['modage'] = np.where(
    was_remodeled,
    train['yrsold'] - train['yearremodadd'],
    train['yrsold'] - train['yearbuilt'],
)

In [None]:
train['houseage'] = train['yrsold'] - train['yearbuilt']
train['garageage'] = train['yrsold'] - train['garageyrblt']

In [None]:
# make target column the last one
col_to_move = train.pop(target)
train.insert(len(train.columns), target, col_to_move)

In [None]:
# check for NaNs
# `not` instead of `~`: bitwise inversion of a plain Python bool gives -1 or
# -2 (both truthy), so `assert ~flag` only works while the flag happens to be
# a NumPy boolean. `not` is correct for either type.
assert not train.isna().values.any(), 'train contains NaN values'

In [None]:
# check for INF
# Scan every numeric-dtype column instead of `train[numeric]`: at this point
# `numeric` has not yet been extended with the engineered features, so it
# does not cover the ratio columns created above - the ones where a division
# by zero could leave +/-inf behind.
assert not np.isinf(train.select_dtypes(include='number')).values.any(), \
    'train contains infinite values'

#### Add New Features to Features Lists

In [None]:
# create lists with 'raw' features
features_raw = features.copy()
numeric_raw = numeric.copy()
categorical_raw = categorical.copy()

features_linear = features.copy()
numeric_linear = numeric.copy()
categorical_linear = categorical.copy()

In [None]:
len(features_raw)

In [None]:
assert len(features_raw) == (len(numeric_raw)
                             + len(ordinal)
                             + len(categorical_raw))

In [None]:
new_features = [
    'flrsfmean', 'bedroomsize', 'kitchensize', 'bedroomfracrms',
    'kitchenfracrms', 'bathsfracbedr', 'houseage', 'modage', 'garageage'
]

In [None]:
# add the new features to the features lists
features = features + new_features + features_as_factor
# the engineered ratio/age features count as numeric; the '_exst' flags are
# tracked separately in `factor`
numeric = numeric + new_features
# `factor` refers to the same list object as features_as_factor (no copy)
factor =  features_as_factor

In [None]:
len(features)

In [None]:
assert len(features) == (len(numeric)
                         + len(ordinal)
                         + len(categorical)
                         + len(factor))

### Logarithm Features 

The natural logarithm is applied as $\ln(x + C)$ with the constant $C=1$, so that zero values remain defined.

In [None]:
train.head()

In [None]:
# numeric features that are NOT log-transformed
features_log_drop = [
    'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd',
    'fireplaces', 'garagecars', 'miscval', 'mosold', 'yrsold',
    'bedroomsize', 'kitchensize', 'bedroomfracrms', 'kitchenfracrms',
    'bathsfracbedr', 'houseage', 'modage', 'garageage', 'yearbuilt',
]
# features to log-transform: every numeric feature that is neither in the
# drop list nor ordinal (order of `numeric` is preserved)
excluded_from_log = set(features_log_drop) | set(ordinal)
features_log = [name for name in numeric if name not in excluded_from_log]

In [None]:
log_processor = LogCpTransformer(
    variables=features_log,
    base='e',
    C=1
)

In [None]:
log_processor.fit(train[features_log])

In [None]:
train['garagearea'].hist(bins=50);

In [None]:
# replace the selected columns by their log-transformed values
train[features_log] = log_processor.transform(train[features_log])

# Build the old-name -> 'lg_'-name mapping once and apply it in a single
# pass. The previous per-feature loop copied the whole frame and rebuilt all
# six lists on every iteration.
# Note: features_log itself keeps the original (un-prefixed) names.
log_names = {feature: 'lg_'+feature for feature in features_log}
train = train.rename(columns=log_names)

# rename elements in the feature lists: add 'lg_'
features = [log_names.get(n, n) for n in features]
features_linear = [log_names.get(n, n) for n in features_linear]
features_raw = [log_names.get(n, n) for n in features_raw]
numeric = [log_names.get(n, n) for n in numeric]
numeric_linear = [log_names.get(n, n) for n in numeric_linear]
numeric_raw = [log_names.get(n, n) for n in numeric_raw]

In [None]:
train['lg_garagearea'].hist(bins=50);

### Transformation Pipeline Check

In [None]:
features_creator = FeaturesCreator(factors=True)

In [None]:
features_logger = FeaturesLogger(features_log=features_log)

In [None]:
pipeline = Pipeline(steps=[
    ('features_create', features_creator),
    ('features_log', features_logger)
])

In [None]:
train_transformed = pipeline.transform(train_raw)

In [None]:
# move 'price' to the last position: pop() removes the column and the
# assignment appends it again at the end
train_transformed['price'] = train_transformed.pop('price')

In [None]:
assert train_transformed.equals(train)

In [None]:
# display rows with differences
train[~train.apply(tuple,1).isin(train_transformed.apply(tuple,1))]

### Correlation Matrix with New Features

In [None]:
corr_df = train[numeric + ordinal + [target]].corr()

In [None]:
fig = plot_corr_matrix(
    data=corr_df, target=target, num_features=10,
    width=0.75, height=0.25, annot=6.5, labelsize=6.5,
    linecolor=theme, full=True, abs_results=True,
    df=False, df_limit=None
)
savefig('corr_matrix_new', dir_save, dpi=125)

## Preprocessing

In [None]:
train.head()

In [None]:
train_raw = train.copy()

### Create Train dataset for further GridSearch
Transformations: only log, no encoding and scaling.

In [None]:
train_cv = train.copy()

In [None]:
train_cv.head()

### Encoding

In [None]:
encoder = Encoder(features_transform=categorical_transform)

In [None]:
encoder.fit(train, train[target])

In [None]:
train = encoder.transform(train)

In [None]:
train.head()

### Scaling

In [None]:
scaler = Scaler(features_transform=features)

In [None]:
scaler.fit(train[features])

In [None]:
train[features] = scaler.transform(train[features])

### Transformation Pipeline Check

In [None]:
pipeline = Pipeline(steps=[
    ('encoder', encoder),
    ('scaler', scaler)
])

In [None]:
train_transformed = pipeline.transform(train_raw)

In [None]:
assert train_transformed.equals(train)

In [None]:
# display rows with differences
train[~train.apply(tuple,1).isin(train_transformed.apply(tuple,1))]

In [None]:
train.head()

## Intermediate Model #2

In [None]:
train_im = train_cv.copy()

In [None]:
train_im.head()

In [None]:
train_im.shape

### Pipeline For GridSearch

In [None]:
encoder_im = OrdinalEncoder(
    encoding_method='ordered',
    variables=categorical_transform,
    missing_values='ignore',
    unseen='encode'
)

In [None]:
lgb1 = LGBMRegressor()

In [None]:
lgb_pipeline1 = Pipeline(steps=[
    ('encoder', encoder_im),
    ('estimator', lgb1)
])

### GridSearch Iteration 1

In [None]:
params1 = {
    'estimator__n_estimators': [25, 50, 75],
    'estimator__max_depth': [3, 6, 9],
    'estimator__num_leaves': [5, 38, 300],
    'estimator__learning_rate': [0.1, 0.3],
    'estimator__random_state': [seed],
    'estimator__verbose': [-1]
}

In [None]:
n_folds1 = 20

In [None]:
cv1 = GridSearchCV(
    estimator=lgb_pipeline1, 
    param_grid=params1,
    scoring='neg_root_mean_squared_error',
    cv=n_folds1
)

In [None]:
st = stopwatch.start()
cv1.fit(train_im[features], train_im[target])
print(f'Execution time: {stopwatch.stop(st)}')

In [None]:
# one label per grid candidate: its parameter values joined by ' / '
models1 = [
    ' / '.join(str(value) for value in candidate.values())
    for candidate in cv1.cv_results_['params']
]

In [None]:
# cv_results_ keys holding the per-fold test scores
keys1 = [f'split{fold}_test_score' for fold in range(n_folds1)]

In [None]:
results1 = {key: cv1.cv_results_[key] for key in keys1}

In [None]:
results_df1 = pd.DataFrame(results1)

In [None]:
results_df1 = results_df1.T

In [None]:
# Index of the best candidate (highest negated RMSE). nanargmax ignores
# candidates whose fits failed and were scored NaN; np.argmax would return
# the first NaN entry as the "maximum".
idx_opt1 = np.nanargmax(cv1.cv_results_['mean_test_score'])

In [None]:
results_df1 = pd.melt(results_df1)

In [None]:
results_df1['opt'] = results_df1['variable']==idx_opt1
results_df1['opt'] = results_df1['opt'].astype(int)

In [None]:
# Per-candidate distribution of fold scores; the best candidate is
# highlighted through the 'opt' hue.
plt.figure(figsize=(20,4))

ax = sns.pointplot(
    results_df1,
    x='variable',
    y='value',
    hue='opt',
    markersize=3,
    linestyle='none',
    capsize=0.2,
    err_kws={'lw': 1},
    palette=[palette[0], palette[1]]
)

# Candidate k is drawn at x position k, so the ticks must be 0..n-1 in
# ascending order to line up with the labels in models1. Building them from
# a set gave no ordering guarantee.
ax.set_xticks(ticks=list(range(len(models1))), labels=models1)
plt.xticks(rotation=90)
plt.xlabel(None)
plt.ylabel(None)
ax.get_legend().remove()

plt.show()

In [None]:
round(cv1.cv_results_['mean_test_score'][idx_opt1], 7)

In [None]:
best_params1 = cv1.cv_results_['params'][idx_opt1]

In [None]:
best_params1

### GridSearch Iteration 2

In [None]:
lgb2 = LGBMRegressor()

In [None]:
lgb_pipeline2 = Pipeline(steps=[
    ('encoder', encoder_im),
    ('estimator', lgb2)
])

In [None]:
# Second grid: a narrow neighbourhood around the best values of iteration 1.
# Every candidate list is restricted to values LightGBM accepts, so that a
# best value at the lower edge of the first grid cannot produce failing fits.
learning_rate = [best_params1['estimator__learning_rate']]

max_depth_val = best_params1['estimator__max_depth']
# LightGBM treats max_depth <= 0 as "no limit": keep real depth limits only
max_depth = [
    depth for depth in range(max_depth_val-2, max_depth_val+3) if depth >= 1
]

n_estimators_val = best_params1['estimator__n_estimators']
n_estimators = [
    n for n in (n_estimators_val-5, n_estimators_val, n_estimators_val+5)
    if n >= 1
]

num_leaves_val = best_params1['estimator__num_leaves']
# num_leaves must be greater than 1; with the first grid's smallest value (5)
# the lower neighbour would be -10, so it is clamped to 2 (duplicates removed)
num_leaves = sorted(
    {max(2, num_leaves_val + step) for step in (-15, 0, 15)}
)

In [None]:
params2 = {
    'estimator__learning_rate': learning_rate,
    'estimator__max_depth': max_depth,
    'estimator__n_estimators': n_estimators,
    'estimator__num_leaves': num_leaves,
    'estimator__random_state': [seed],
    'estimator__verbose': [-1]
}

In [None]:
params2

In [None]:
n_folds2 = 20

In [None]:
cv2 = GridSearchCV(
    estimator=lgb_pipeline2, 
    param_grid=params2,
    scoring='neg_root_mean_squared_error',
    cv=n_folds2
)

In [None]:
st = stopwatch.start()
cv2.fit(train_im[features], train_im[target])
print(f'Execution time: {stopwatch.stop(st)}')

In [None]:
# one label per grid candidate: its parameter values joined by ' / '
models2 = [
    ' / '.join(str(value) for value in candidate.values())
    for candidate in cv2.cv_results_['params']
]

In [None]:
# cv_results_ keys holding the per-fold test scores
keys2 = [f'split{fold}_test_score' for fold in range(n_folds2)]

In [None]:
results2 = {key: cv2.cv_results_[key] for key in keys2}

In [None]:
results_df2 = pd.DataFrame(results2)

In [None]:
results_df2 = results_df2.T

In [None]:
# Index of the best candidate (highest negated RMSE). nanargmax ignores
# candidates whose fits failed and were scored NaN; np.argmax would return
# the first NaN entry as the "maximum".
idx_opt2 = np.nanargmax(cv2.cv_results_['mean_test_score'])

In [None]:
results_df2 = pd.melt(results_df2)

In [None]:
results_df2['opt'] = results_df2['variable']==idx_opt2
results_df2['opt'] = results_df2['opt'].astype(int)

In [None]:
# Per-candidate distribution of fold scores; the best candidate is
# highlighted through the 'opt' hue.
plt.figure(figsize=(20,4))

ax = sns.pointplot(
    results_df2,
    x='variable',
    y='value',
    hue='opt',
    markersize=3,
    linestyle='none',
    capsize=0.2,
    err_kws={'lw': 1},
    palette=[palette[0], palette[1]]
)

# Candidate k is drawn at x position k, so the ticks must be 0..n-1 in
# ascending order to line up with the labels in models2. Building them from
# a set gave no ordering guarantee.
ax.set_xticks(ticks=list(range(len(models2))), labels=models2)
plt.xticks(rotation=90)
plt.xlabel(None)
plt.ylabel(None)
ax.get_legend().remove()

plt.show()

In [None]:
rmse_train_opt = cv2.cv_results_['mean_test_score'][idx_opt2]

In [None]:
rmse_train_opt

In [None]:
rmse_train_opt.round(4)

In [None]:
params_opt = cv2.cv_results_['params'][idx_opt2]

In [None]:
params_opt

In [None]:
# Strip the pipeline step prefix so the keys can be passed straight to the
# estimator constructor ('estimator__max_depth' -> 'max_depth').
# The prefix is removed as a whole string: str.lstrip('estimator') strips any
# leading run of the characters e/s/t/i/m/a/o/r, which only gave the right
# result because '__' happened to stop it.
step_prefix = 'estimator__'
params = {
    (key[len(step_prefix):] if key.startswith(step_prefix) else key): value
    for key, value in params_opt.items()
}

In [None]:
params

In [None]:
lgb_opt = LGBMRegressor(**params)

In [None]:
train_im[features] = encoder_im.fit_transform(
    train_im[features], train_im[target])

In [None]:
train_im.head()

In [None]:
lgb_opt.fit(train_im[features], train_im[target])

In [None]:
# importance of every feature seen by the fitted model, most important first
feature_importance = (
    pd.DataFrame({
        'Feature': lgb_opt.feature_name_,
        'Importance': lgb_opt.feature_importances_
    })
    .sort_values('Importance', ascending=False)
)

In [None]:
data_plot = feature_importance.loc[feature_importance['Importance'] > 0, :]
data_plot = data_plot.sort_values('Importance', ascending=False)

In [None]:
data_plot.head()

In [None]:
values = data_plot['Importance'][:10]
labels = data_plot['Feature'][:10]

In [None]:
fig = plot_bar_horizontal(
    values=values,
    labels=labels,
    labelsize=9,
    labelcolor='0.3',
    labelweight='medium',
    figsize=(3, 2.5),
    kind='lol',
    width=0.5,
    s=6,
    x_lim_right=100
)
savefig('feature_importance', dir_save)

## Save Data

In [None]:
datasets_dict['train'] = train
datasets_dict['train_cv'] = train_cv

In [None]:
features_dict['features'] = features
features_dict['features_linear'] = features_linear
features_dict['features_raw'] = features_raw

features_dict['numeric'] = numeric
features_dict['numeric_linear'] = numeric_linear
features_dict['numeric_raw'] = numeric_raw

features_dict['categorical'] = categorical
features_dict['categorical_linear'] = categorical_linear
features_dict['categorical_raw'] = categorical_raw

features_dict['factor'] = factor

In [None]:
transform_dict['encoder'] = encoder
transform_dict['scaler'] = scaler

transform_dict['features_log'] = features_log
transform_dict['transformer_features_creator'] = features_creator
transform_dict['transformer_features_logger'] = features_logger

In [None]:
simulation_datasets_dict['train_iter2'] = train_im
simulation_datasets_dict['features_iter2'] = features

In [None]:
estimators_dict['lgb_iter2'] = lgb_opt

In [None]:
evaluation_dict['cv_iter2'] = cv2

In [None]:
saveit(datasets_dict, 'datasets_dict', dir_save)

In [None]:
saveit(features_dict, 'features_dict', dir_save)

In [None]:
saveit(transform_dict, 'transform_dict', dir_save)

In [None]:
saveit(simulation_datasets_dict, 'simulation_datasets_dict', dir_save)

In [None]:
saveit(estimators_dict, 'estimators_dict', dir_save)

In [None]:
saveit(evaluation_dict, 'evaluation_dict', dir_save)

In [None]:
saveit(params, 'params', dir_save)

In [None]:
saveit(feature_importance, 'feature_importance', dir_save)

### Save Session

In [None]:
save_session(dir_save)

### Execution time

In [None]:
print(f'Execution time: {stopwatch.stop(start)}')