## 3. Machine Learning

### 3.1 Importar Dados, fazer preprocessamento e dividir entre treino e teste

In [1]:
import pandas as pd
dados = pd.read_csv('https://raw.githubusercontent.com/cassiasamp/calculadora-de-imoveis-out-20/main/coleta-de-dados/dados_tratados_mercado_livre.csv')

In [2]:
dados.head()

Unnamed: 0,zonas,enderecos,precos,areas,quartos
0,sul,"Rua Doutor Ferreira Lopes, Vila Sofia, São Pau...",1100.0,90.0,3.0
1,sul,"Avenida Dos Ourives, Jardim São Savério, São P...",1000.0,63.0,3.0
2,sul,"Rua Doutor Nicolau Alberto Defina, Jardim Da S...",1224.0,68.0,2.0
3,sul,"Avenida Carlos Liviero, Vila Liviero, São Paul...",1300.0,48.0,2.0
4,sul,"Rua Armando Ramos Filho, Moinho Velho, São Pau...",830.0,40.0,1.0


In [3]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   zonas      960 non-null    object 
 1   enderecos  960 non-null    object 
 2   precos     960 non-null    float64
 3   areas      960 non-null    float64
 4   quartos    958 non-null    float64
dtypes: float64(3), object(2)
memory usage: 37.6+ KB


In [4]:
X = dados.drop(columns=['precos', 'enderecos'])

In [5]:
X.isna().sum()

zonas      0
areas      0
quartos    2
dtype: int64

In [6]:
X.fillna(value=dados['quartos'].median(), inplace=True)

In [7]:
X = pd.get_dummies(X)

In [8]:
X.head()

Unnamed: 0,areas,quartos,zonas_leste,zonas_norte,zonas_oeste,zonas_sul
0,90.0,3.0,0,0,0,1
1,63.0,3.0,0,0,0,1
2,68.0,2.0,0,0,0,1
3,48.0,2.0,0,0,0,1
4,40.0,1.0,0,0,0,1


In [9]:
y = dados['precos']

In [10]:
y.isna().sum()

0

In [11]:
from sklearn.model_selection import train_test_split

SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state = SEED,
                                                    test_size = 0.25)

### 3.2 Modelo "Baseline" (Ponto de Partida)

In [12]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.6329788306128329

In [13]:
from sklearn.dummy import DummyRegressor
reg = DummyRegressor(strategy='mean')
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

-0.0021105807908479157

### 3.3 Fazer triagem entre diferentes estimadores

In [14]:
from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LassoLars, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [15]:
reg_list = [RidgeCV(),
            LGBMRegressor(), 
            XGBRegressor(objective='reg:squarederror'),
            SVR(),
            KNeighborsRegressor(),
            RandomForestRegressor(),
            AdaBoostRegressor(),
            GradientBoostingRegressor(),
            MLPRegressor()
            ]

In [16]:
from sklearn.model_selection import cross_val_score
import numpy as np

for reg in reg_list:
    print(f'Treinando Modelo {reg.__class__.__name__}')
    reg.fit(X_train, y_train)
    
    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)
    
    print(f"R2 Score Train: {train_score}")
    print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R2 Score Test: {test_score}")
    print('='*80)

Treinando Modelo RidgeCV
R2 Score Train: 0.49188413265417735
R2 Score Valid: 0.46 +- 0.09
R2 Score Test: 0.6317473502556512
Treinando Modelo LGBMRegressor
R2 Score Train: 0.6470079027503934
R2 Score Valid: 0.42 +- 0.07
R2 Score Test: 0.5528611579994934
Treinando Modelo XGBRegressor
R2 Score Train: 0.7134827955049117
R2 Score Valid: 0.41 +- 0.10
R2 Score Test: 0.5451184805634111
Treinando Modelo SVR
R2 Score Train: -0.05041318871480094
R2 Score Valid: -0.06 +- 0.02
R2 Score Test: -0.07704036741649878
Treinando Modelo KNeighborsRegressor
R2 Score Train: 0.6255605862521028
R2 Score Valid: 0.38 +- 0.08
R2 Score Test: 0.4229946626464409
Treinando Modelo RandomForestRegressor
R2 Score Train: 0.8492137573525922
R2 Score Valid: 0.35 +- 0.13
R2 Score Test: 0.41562347112671594
Treinando Modelo AdaBoostRegressor
R2 Score Train: 0.531461325012763
R2 Score Valid: 0.34 +- 0.20
R2 Score Test: 0.43205411486227485
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.7463084820407115
R2 Score Va



R2 Score Train: 0.2972573492038131
R2 Score Valid: 0.21 +- 0.18
R2 Score Test: 0.4950952957532034




In [17]:
# Bonus: Testando com todos os regressores do sklearn
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='regressor')

relatorio = {'nome':[],
             'train_score':[],
             'cv_scores_mean':[],
             'test_score':[],
             'estimador':[]
             }

ignore_list = ['IsotonicRegression',
 'MultiOutputRegressor',
 'ElasticNet',
 'MultiTaskElasticNet',
 'MultiTaskElasticNetCV',
 'MultiTaskLasso',
 'MultiTaskLassoCV',
 'RadiusNeighborsRegressor',
 'RegressorChain',
 'StackingRegressor',
 'VotingRegressor']


In [18]:
estimators.extend(
    [('LGBMRegressor', LGBMRegressor),
     ('XGBRegressor', XGBRegressor)]
)

In [19]:
for name, RegressorClass in estimators:
  if name not in ignore_list:
    print(f'Treinando Modelo {name}')
    reg = RegressorClass()
    reg.fit(X_train, y_train)

    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)

    print(f"R2 Score Train: {train_score}")
    print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R2 Score Test: {test_score}")
    print('='*80)

    relatorio['nome'].append(name)
    relatorio['train_score'].append(train_score)
    relatorio['cv_scores_mean'].append(np.mean(cv_scores))
    relatorio['test_score'].append(test_score)
    relatorio['estimador'].append(reg)

Treinando Modelo ARDRegression
R2 Score Train: 0.49137185924316024
R2 Score Valid: 0.46 +- 0.09
R2 Score Test: 0.6292897193646995
Treinando Modelo AdaBoostRegressor
R2 Score Train: 0.547276126856356
R2 Score Valid: 0.28 +- 0.19
R2 Score Test: 0.45928969453158885
Treinando Modelo BaggingRegressor
R2 Score Train: 0.827456235565963
R2 Score Valid: 0.28 +- 0.16
R2 Score Test: 0.3628702414805011
Treinando Modelo BayesianRidge
R2 Score Train: 0.49184425500567636
R2 Score Valid: 0.46 +- 0.09
R2 Score Test: 0.631590387202928
Treinando Modelo CCA




R2 Score Train: 0.3549579020964707
R2 Score Valid: 0.30 +- 0.17
R2 Score Test: 0.601791481490159
Treinando Modelo DecisionTreeRegressor
R2 Score Train: 0.9081632312421389
R2 Score Valid: 0.04 +- 0.28
R2 Score Test: 0.2034805012068954
Treinando Modelo DummyRegressor
R2 Score Train: 0.0
R2 Score Valid: -0.01 +- 0.00
R2 Score Test: -0.0021105807908479157
Treinando Modelo ElasticNetCV
R2 Score Train: 0.42904684510175806
R2 Score Valid: 0.39 +- 0.10
R2 Score Test: 0.5494983495130374
Treinando Modelo ExtraTreeRegressor
R2 Score Train: 0.9081632312421389
R2 Score Valid: -0.03 +- 0.27
R2 Score Test: 0.27694988520035024
Treinando Modelo ExtraTreesRegressor
R2 Score Train: 0.9081632312421389
R2 Score Valid: 0.24 +- 0.15
R2 Score Test: 0.3852514600560468
Treinando Modelo GaussianProcessRegressor
R2 Score Train: 0.9081632312400245
R2 Score Valid: -0.46 +- 0.09
R2 Score Test: -0.38443739344500555
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.7463084820407115
R2 Score Valid: 0.43 +- 0



R2 Score Train: 0.300735389345944
R2 Score Valid: 0.19 +- 0.21
R2 Score Test: 0.5006809886201347
Treinando Modelo NuSVR
R2 Score Train: -0.003654518977640686
R2 Score Valid: -0.01 +- 0.02
R2 Score Test: -0.023984441026680736
Treinando Modelo OrthogonalMatchingPursuit
R2 Score Train: 0.42907239963371213
R2 Score Valid: 0.39 +- 0.10
R2 Score Test: 0.5529606510303273
Treinando Modelo OrthogonalMatchingPursuitCV
R2 Score Train: 0.4920534981757311
R2 Score Valid: 0.46 +- 0.09
R2 Score Test: 0.6329788306128309
Treinando Modelo PLSCanonical
R2 Score Train: 0.101209198302176
R2 Score Valid: 0.01 +- 0.29
R2 Score Test: 0.3898849554005721
Treinando Modelo PLSRegression
R2 Score Train: 0.4852585742768618
R2 Score Valid: 0.45 +- 0.09
R2 Score Test: 0.6280377232721785
Treinando Modelo PassiveAggressiveRegressor
R2 Score Train: 0.4222860925648858
R2 Score Valid: -1.97 +- 2.46
R2 Score Test: 0.6054602147500643
Treinando Modelo RANSACRegressor




R2 Score Train: 0.3597143335070392
R2 Score Valid: 0.29 +- 0.09
R2 Score Test: 0.4624377577712254
Treinando Modelo RandomForestRegressor
R2 Score Train: 0.8496818993943368
R2 Score Valid: 0.35 +- 0.13
R2 Score Test: 0.4386031837637796
Treinando Modelo Ridge
R2 Score Train: 0.4920516322835352
R2 Score Valid: 0.46 +- 0.09
R2 Score Test: 0.6328644605450089
Treinando Modelo RidgeCV
R2 Score Train: 0.49188413265417735
R2 Score Valid: 0.46 +- 0.09
R2 Score Test: 0.6317473502556512
Treinando Modelo SGDRegressor
R2 Score Train: -2.1706503854751236e+19
R2 Score Valid: -3669985190844706304.00 +- 5265212059882023936.00
R2 Score Test: -1.8744679420472234e+19
Treinando Modelo SVR
R2 Score Train: -0.05041318871480094
R2 Score Valid: -0.06 +- 0.02
R2 Score Test: -0.07704036741649878
Treinando Modelo TheilSenRegressor
R2 Score Train: 0.4709178371777769
R2 Score Valid: 0.44 +- 0.10
R2 Score Test: 0.6218263039435401
Treinando Modelo TransformedTargetRegressor
R2 Score Train: 0.4920534981757311
R2 Score 

In [20]:
relatorio = pd.DataFrame(relatorio).sort_values(by='cv_scores_mean', ascending=False)
relatorio.head(10)

Unnamed: 0,nome,train_score,cv_scores_mean,test_score,estimador
35,RidgeCV,0.491884,0.457308,0.631747,"RidgeCV(alphas=array([ 0.1, 1. , 10. ]), cv=N..."
34,Ridge,0.492052,0.457278,0.632864,"Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr..."
15,KernelRidge,0.491998,0.457272,0.634057,"KernelRidge(alpha=1, coef0=1, degree=3, gamma=..."
21,LassoLarsCV,0.492053,0.457242,0.632979,"LassoLarsCV(copy_X=True, cv=None, eps=2.220446..."
22,LassoLarsIC,0.492053,0.457242,0.632979,"LassoLarsIC(copy_X=True, criterion='aic', eps=..."
17,LarsCV,0.492053,0.457242,0.632979,"LarsCV(copy_X=True, cv=None, eps=2.22044604925..."
16,Lars,0.492053,0.457242,0.632979,"Lars(copy_X=True, eps=2.220446049250313e-16, f..."
39,TransformedTargetRegressor,0.492053,0.457242,0.632979,"TransformedTargetRegressor(check_inverse=True,..."
23,LinearRegression,0.492053,0.457242,0.632979,"LinearRegression(copy_X=True, fit_intercept=Tr..."
18,Lasso,0.492048,0.457221,0.63271,"Lasso(alpha=1.0, copy_X=True, fit_intercept=Tr..."


### 3.4 TODO: Calibrar melhores estimadores usando GridSearchCV

In [21]:
## Usar GridSearchCV para calibrar os melhores estimadores

### 3.5 (Opcional) Combinar os melhores estimadores usando Stacking

In [22]:
from sklearn.ensemble import StackingRegressor

top_3_regs = relatorio[['nome', 'estimador']].values[:3]

reg = StackingRegressor(
    estimators=top_3_regs
)

reg.fit(X_train, y_train)

train_score = reg.score(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train)
test_score = reg.score(X_test, y_test)

print(f"R2 Score Train: {train_score}")
print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
print(f"R2 Score Test: {test_score}")
print('='*80)

R2 Score Train: 0.4919282169796338
R2 Score Valid: 0.46 +- 0.09
R2 Score Test: 0.6324865723682604


In [23]:
reg

StackingRegressor(cv=None,
                  estimators=array([['RidgeCV',
        RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)],
       ['Ridge',
        Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)],
       ['KernelRidge',
        KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None)]], dtype=object),
                  final_estimator=None, n_jobs=None, passthrough=False,
                  verbose=0)

## 4. Exportar Modelo

In [31]:
import pickle
pickle.dump(reg, open('regressor.pkl', 'wb'), protocol=4)

In [25]:
ls

regressor.pkl  [0m[01;34msample_data[0m/


In [26]:
%reset -f

In [32]:
import pickle
reg = pickle.load(open('regressor.pkl', 'rb'))

In [33]:
reg

StackingRegressor(cv=None,
                  estimators=array([['RidgeCV',
        RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)],
       ['Ridge',
        Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)],
       ['KernelRidge',
        KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None)]], dtype=object),
                  final_estimator=None, n_jobs=None, passthrough=False,
                  verbose=0)

In [34]:
import numpy as np
np.expm1(reg.predict([[0, 0, 0, 1, np.log1p(2), np.log1p(120)]]))

  


array([inf])