In [None]:
# --- imports: standard library first, then third-party -----------------------
import pickle
import re
import warnings

# `display` is imported from its public location; the `IPython.core.display`
# path is deprecated for this name.
from IPython.display import display
import numpy as np
import pandas as pd
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# apply seaborn's default theme to every matplotlib figure
sns.set()

# terminal colors (ANSI escape sequences used when printing metrics)
WHITE = '\033[39m'
CYAN = '\033[36m'
GREEN = '\033[32m'
RED = '\033[31m'

# color palette
colors = {
    'cyan': '#1696d2',
    'gray': '#5c5859',
    'black': '#000000',
    'yellow': '#fdbf11',
    'orange': '#ca5800',
    'magenta': '#af1f6b',
    'green': '#408941',
    'red': '#a4201d'
}

# disable warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation notices from the plotting and modelling libraries.
warnings.filterwarnings('ignore')

# pandas config: show floats with two decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# libraries version
print(f'Numpy: {np.__version__}')
print(f'Pandas: {pd.__version__}')
print(f'Sklearn: {sklearn.__version__}')
print(f'Matplotlib: {matplotlib.__version__}')
print(f'Seaborn: {sns.__version__}')

# Table of Contents  
- [Exploratory Data Analysis](#eda)  
    - [Overview](#overview)  
    - [Extracting Address Information](#address)  
    - [Target Distribution](#target)  
- [Modeling](#modeling)  
    - [Baseline](#cycle1)  
    - [ElasticNet](#cycle2)  
    - [XGBoost Regressor](#cycle3)  
- [Deploy](#deploy)  

## <span id='eda' style='color:Gold'>Exploratory Data Analysis</span>

### <span id='overview' style='color:#1696d2'>Overview</span>

The file **usa_housing.csv** is a dataset with information about house prices in certain regions of the United States. Its columns are described below: 
  
- Avg. Area Income: Average income of residents where the house is located.  
- Avg. Area House Age: Average age of the houses in the same city.  
- Avg. Area Number of Rooms: Average number of rooms for houses in the same city.  
- Avg. Area Number of Bedrooms: Average number of bedrooms for houses in the same city.   
- Area Population: Population of the city where the house is located. 
- Price: House selling price.  
- Address: Address of the house.  

In [None]:
# Read the housing dataset from its remote CSV and preview the first rows.
data_url = 'https://s3-sa-east-1.amazonaws.com/lcpi/7cf57d48-ac3d-4748-9d81-5b4d6677fcff.csv'
df = pd.read_csv(data_url)
df.head()

In [None]:
# Print a structural summary of the loaded frame.
df.info()

### <span id='address' style='color:#1696d2'>Extracting data from Address</span>

In [None]:
# Allow up to 65 characters per cell when pandas renders a column.
pd.set_option('display.max_colwidth', 65)

# Keep one address rendered as plain text; the next cell prototypes the parsing on it.
address_column = df['Address']
string_sample = address_column.sample(random_state=42).to_string(index=False)

# Show five raw addresses.
address_column.sample(5, random_state=1)

In [None]:
# Prototype the City/State extraction on the single rendered sample address.
print(string_sample)

# The rendered sample may carry the line break as the literal two characters
# backslash + n (which is what the original city split looked for) rather than
# a real newline. Normalise first so one split serves both city and state.
last_line = string_sample.replace('\\n', '\n').split('\n')[-1]

# Keep alphabetic tokens only: the last token is the state code, the rest is the city.
tokens = re.sub('[^a-zA-Z]+', ' ', last_line).split()
city_sample = ' '.join(tokens[:-1])   # join with a space so multi-word cities are not fused
state_sample = tokens[-1]
city_sample, state_sample

In [None]:
# Tokenise the last line of every address once: strip everything that is not a
# letter, then split on whitespace. The final token is the state code and the
# preceding tokens form the city name.
address_tokens = df['Address'].apply(
    lambda address: re.sub('[^a-zA-Z]+', ' ', address.split('\n')[-1]).split()
)

# Join city tokens with a space so multi-word names are not fused into one word.
df['City'] = address_tokens.apply(lambda tokens: ' '.join(tokens[:-1]))
df['State'] = address_tokens.apply(lambda tokens: tokens[-1])
df.head()

### <span id='target' style='color:#1696d2'>Target Variable - Price</span>

In [None]:
# Target variable side by side: box plot on the left, histogram on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
box_ax, hist_ax = ax
sns.boxplot(y=df['Price'], ax=box_ax)
sns.histplot(df['Price'], color=colors['gray'], ax=hist_ax)
fig.suptitle('Price Distribuition', fontsize=18);

In [None]:
# Correlate numeric columns only: `Address`, `City` and `State` hold text, and a
# plain `df.corr()` raises on them in recent pandas versions.
corr = df.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle (diagonal included). Building it from
# ones rather than from the correlation values also hides cells whose
# correlation happens to be exactly 0.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, cmap='coolwarm', annot=True, mask=upper_triangle)
plt.title('Data correlation', size=18, pad=20, loc='left');

In [None]:
# Pairwise relationships between the columns of the frame.
sns.pairplot(df)

## <span id='modeling' style='color:Gold'>Modeling</span>

#### Setup data

In [None]:
from sklearn.model_selection import train_test_split

# The target is the sale price. The text columns are left out of the feature
# matrix for the first modelling cycle.
excluded_columns = ['Price', 'Address', 'City', 'State']
X = df.drop(columns=excluded_columns)
y = df['Price']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### <span id='cycle1' style='color:#1696d2'>First Cycle - Baseline model</span>

#### Statsmodels OLS

In [None]:
import statsmodels.api as sm

# Add an explicit constant column for the intercept and tell OLS it is present.
X_train_const = sm.add_constant(X_train)
ols_spec = sm.OLS(endog=y_train, exog=X_train_const, hasconst=True)

# Fit on the training split and show the regression report.
sm_model = ols_spec.fit()
sm_model.summary()

#### Sklearn Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Baseline: ordinary least squares on the unscaled features.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Report R² on both splits.
for split_name, features, target in (('Train', X_train, y_train), ('Test', X_test, y_test)):
    score = metrics.r2_score(target, reg.predict(features))
    print(f'{split_name} r2_score: {score:.3f}')

### <span id='cycle2' style='color:#1696d2'>Second Cycle - ElasticNet</span>

In [None]:
# sklearn libraries import
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet

# pre-processing and estimator: standardise the features, then fit ElasticNet
pipeline = Pipeline([
    ('std', StandardScaler()),
    ('regr', ElasticNet(random_state=42))
])

# gridsearch parameters (keys are prefixed with the pipeline step name)
param_grid = {
    'regr__alpha' : np.linspace(0.5,10,20),
    'regr__l1_ratio' : np.linspace(0,1,11),
    'regr__max_iter' : [1000, 2000, 3000],
    'regr__fit_intercept' : [True, False]
}

# metric used to rank the candidates. It is stored in `scoring` rather than
# `metrics` so the `sklearn.metrics` module imported by the baseline cell is
# not overwritten by a string (which breaks that cell when it is re-run).
scoring = 'neg_mean_absolute_error'

# cross validation method
splitter = KFold(n_splits=5, shuffle=True, random_state=42)

# create grid; verbose=1 prints only a summary instead of one line per fit
regr_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    cv=splitter,
    verbose=1
)

# fit models
regr_grid.fit(X_train, y_train)

In [None]:
# Tabulate the cross-validation results and show the five best-ranked candidates.
pd.DataFrame(regr_grid.cv_results_).sort_values('rank_test_score').head()

In [None]:
# Pipeline configured with the best parameter combination found by the search.
regr_grid.best_estimator_

In [None]:
# train model with the best parameters
# The grid search was created with the default `refit=True`, so it already
# holds the winning pipeline refitted on the whole training split. Reusing it
# keeps this cell in sync with the search instead of hand-copied values.
regr_model = regr_grid.best_estimator_

# make predictions
y_train_pred = regr_model.predict(X_train)
y_test_pred = regr_model.predict(X_test)

#### Metrics

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


def display_metrics(metric, y, y_pred):
    """Print the metric function's name (in cyan) and its value with 3 decimals."""
    print(f'{CYAN}{metric.__name__}{WHITE}: {metric(y, y_pred):.3f}')


# Same three metrics on both splits, train first.
for header, y_true, y_hat in (('Train metrics:', y_train, y_train_pred),
                              ('Test metrics:', y_test, y_test_pred)):
    print(header)
    for metric in [r2_score, mean_absolute_error, mean_squared_error]:
        display_metrics(metric, y_true, y_hat)

In [None]:
# Express the test-set MAE as a percentage of the mean price over the whole dataset.
mae_share = mean_absolute_error(y_test, y_test_pred) / y.mean() * 100
print(f'Mean Absolute Error is {mae_share:.2f}% of price mean.')

In [None]:
# Predicted vs. true test prices with a fitted regression line.
plt.figure(figsize=(12, 6))
pred_ax = sns.regplot(x=y_test, y=y_test_pred, marker='.', line_kws={'color': 'black'})
pred_ax.set_xlabel('true price')
pred_ax.set_ylabel('predict price')
pred_ax.set_title('Relational plot between predict and true values', size=18, pad=20, loc='left');

In [None]:
# Residuals on the test split, using the (prediction - truth) convention.
resids = y_test_pred - y_test
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: residuals against predictions.
sns.scatterplot(x=y_test_pred, y=resids, s=20, ax=ax[0])
ax[0].ticklabel_format(axis='y', style='sci', scilimits=(0, 0))
ax[0].set_xlabel('Predict target')
ax[0].set_ylabel('Resids')

# Right panel: residual distribution. `histplot` with a KDE overlay on a density
# scale replaces the deprecated `distplot` and matches the histogram call used
# for the price distribution earlier in the notebook.
sns.histplot(x=resids, kde=True, stat='density', ax=ax[1])
ax[1].ticklabel_format(axis='x', style='sci', scilimits=(0, 0))
ax[1].set_xlabel('Resids')  # otherwise the axis inherits the series name of y_test

plt.suptitle('Residual Error Distribuition', size=18);

#### Interpreting Results

In [None]:
np.set_printoptions(suppress=True)

scaler = regr_model.named_steps['std']      # fitted StandardScaler
estimator = regr_model.named_steps['regr']  # fitted linear model

# Map the coefficients learned on standardised features back to the original
# feature units: b_i = b~_i / sigma_i and b_0 = b~_0 - sum(b~_i * mu_i / sigma_i).
original_coefs = estimator.coef_ / scaler.scale_
original_intercept = estimator.intercept_ - np.sum(original_coefs * scaler.mean_)

# Both columns stay numeric (no '?' placeholder), so the table can be sorted,
# formatted and plotted like any other frame.
coefs_df = pd.DataFrame(
    data={
        'Original': np.round(np.append(original_intercept, original_coefs), 2),
        'Scaled': np.append(np.round(estimator.intercept_, 2), estimator.coef_),
    },
    index=['Intercept'] + X.columns.tolist()
)
coefs_df

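Because the scaler replaces each feature $X_i$ with $(X_i - \mu_i)/\sigma_i$, the scaled model $\tilde{b}_0 + \sum{\tilde{b}_i \frac{X_i - \mu_i}{\sigma_i}}$ can be rewritten in terms of the original features:  
<span style='font-size:1.5em'>$b_0 + \sum{b_i X_i} = \tilde{b}_0 - \sum{ \frac{\tilde{b}_i\mu_i}{\sigma_i}} + \sum{\frac{\tilde{b}_i}{\sigma_i} X_i}$</span>  
<br>
<span style='color:orange'>coefficients:</span> <span style='font-size:1em'> $b_i = \frac{\tilde{b}_i}{\sigma_i}$</span>
<br>
<span style='color:orange'>intercept:</span> <span style='font-size:1em'> $b_0 = \tilde{b}_0 - \sum{ \frac{\tilde{b}_i\mu_i}{\sigma_i}}$</span>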

In [None]:
# Pull the fitted pieces out of the pipeline.
scaler_step = regr_model.named_steps['std']
regr_step = regr_model.named_steps['regr']

b0_til = regr_step.intercept_   # intercept of the model fitted on scaled features
bi_til = regr_step.coef_        # coefficients of the model fitted on scaled features
mu_i = X_train.mean()           # per-feature mean of the training split
sigma_i = scaler_step.scale_    # per-feature scale stored by the StandardScaler

# Intercept expressed in the original (unscaled) feature units.
b0 = b0_til - sum( (bi_til * mu_i) / sigma_i )

# Compare against the statsmodels fit; R² is taken on the training split so
# both numbers refer to the same data.
sm_intercept = sm_model.params["const"]
train_r2 = r2_score(y_train, y_train_pred)

print(f'{CYAN}Statsmodels Intercept:{WHITE} {sm_intercept:.3f} | {CYAN}r2_score:{WHITE} {sm_model.rsquared}')
print(f'{CYAN}Sklearn Intercept:{WHITE} {b0:.3f} | {CYAN}r2_score:{WHITE} {train_r2}')
print(f'{CYAN}Diff{WHITE} = {sm_intercept - b0}')

In [None]:
# One horizontal bar per row of the coefficient table, sized by the 'Scaled' value.
scaled_coefs = coefs_df['Scaled']
sns.barplot(x=scaled_coefs, y=scaled_coefs.index)
plt.tight_layout()

- **Intercept**: With every parameter equal to 0, the house price is $ 1,229,576.99 (or 2,635,057.82?)
- **Avg. Area Income**:
- **Avg. Area House Age**:
- **Avg. Area Number of Rooms**:
- **Avg. Area Number of Bedrooms**:
- **Area Population**:

#### Saving Model

In [None]:
import os

# save model in pickle format
# Create the target folder first: `open(..., 'wb')` does not create missing
# directories, so the cell would fail on a fresh checkout.
os.makedirs('pickle', exist_ok=True)
with open('pickle/house_pricing_regr_model', 'wb') as file:
    pickle.dump(regr_model, file)

### <span id='cycle3' style='color:#1696d2'>Third Cycle - XGBoost</span>

In [None]:
from xgboost import XGBRegressor

# Same pipeline layout as the previous cycle, with a gradient-boosted regressor.
xgb_pipeline = Pipeline([
    ('std', StandardScaler()),
    ('xgb', XGBRegressor(random_state=42))
])

# gridsearch parameters (keys are prefixed with the pipeline step name)
param_grid = {
    'xgb__n_estimators': [2,10,30,50,75,100,200],
    'xgb__max_depth': range(1,6),
    'xgb__reg_alpha' : np.linspace(0.5,10,10),
    'xgb__reg_lambda' : np.linspace(0.5,10,10)
}

# metric used to rank the candidates. It is stored in `scoring` rather than
# `metrics` so the `sklearn.metrics` module imported by the baseline cell is
# not overwritten by a string.
scoring = 'neg_mean_absolute_error'

# cross validation method
splitter = KFold(n_splits=5, shuffle=True, random_state=42)

# create grid; verbose=1 prints only a summary instead of one line per fit
xgb_grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring=scoring,
    cv=splitter,
    verbose=1
)

xgb_grid.fit(X_train, y_train)

In [None]:
# Parameter combination that ranked best in the XGBoost grid search.
xgb_grid.best_params_

In [None]:
# Predict both splits with the fitted search object, train first.
y_train_pred_xgb, y_test_pred_xgb = (
    xgb_grid.predict(features) for features in (X_train, X_test)
)

In [None]:
# Same report as for the ElasticNet model, now on the XGBoost predictions.
for header, y_true, y_hat in (('Train metrics:', y_train, y_train_pred_xgb),
                              ('Test metrics:', y_test, y_test_pred_xgb)):
    print(header)
    for metric in [r2_score, mean_absolute_error, mean_squared_error]:
        display_metrics(metric, y_true, y_hat)

In [None]:
import os

# save model in pickle format
# Create the target folder first: `open(..., 'wb')` does not create missing
# directories, so the cell would fail on a fresh checkout.
os.makedirs('pickle', exist_ok=True)
with open('pickle/house_pricing_xgb_model', 'wb') as file:
    pickle.dump(xgb_grid, file)

**Conclusion**: despite using a more complex regression model, the score on the *Test* dataset was slightly lower than in the previous cycle.

## <span id='deploy' style='color:Gold'>Deploy</span>

In [None]:
# load model data in pickle format
# NOTE: unpickling can execute code embedded in the file, so only load files
# from a trusted source. This path is the one written by the "Saving Model"
# cell of this notebook.
with open('pickle/house_pricing_regr_model', 'rb') as file:
    model = pickle.load(file)

In [None]:
# Example feature values for a single prediction.
income = 68200
house_age = 6
n_rooms = 7
n_bedrooms = 4
population = 30000

# One row holding the five values above. It is named `sample` rather than
# `input` so the built-in `input()` is not shadowed for the rest of the session.
sample = [[income, house_age, n_rooms, n_bedrooms, population]]

print(f'$ {model.predict(sample)[0]:.2f}')

In [None]:
# Import the widget library
from ipywidgets import widgets, HBox, VBox

# Create the form controls
income = widgets.Text(description='Income')
house_age = widgets.Text(description='House Age')
n_rooms = widgets.Text(description='Number of Rooms')
n_bedrooms = widgets.Text(description='Number of Bedrooms')
population = widgets.Text(description='Population?')

button = widgets.Button(description='Simulate')

# Widgets in the same order as the manual example of the previous cell.
# The callback reads this tuple instead of the global names, because the
# previous cell rebinds `income`, `house_age`, ... to plain numbers when it is
# re-run, which would otherwise break the button.
form_fields = (income, house_age, n_rooms, n_bedrooms, population)

# Lay out the controls
left = VBox([income, house_age, n_rooms])
right = VBox([n_bedrooms, population])
inputs = HBox([left, right])


# Simulation function
def simulator(sender):
    """Predict a price from the current form values and print it.

    Empty fields count as 0. If any field holds text that cannot be parsed as
    a number, a message is printed and no prediction is made.

    Parameters
    ----------
    sender : the object passed by the button's click event (unused).
    """
    try:
        features = [float(field.value) if field.value else 0.0 for field in form_fields]
    except ValueError:
        print('Invalid input: every field must be empty or a number.')
        return
    print(f'$ {model.predict([features])[0]:.2f}')


# Attach the `simulator` function to the button's click event
button.on_click(simulator)