## 3. Machine Learning

### 3.1 Importar Dados, fazer preprocessamento e dividir entre treino e teste

In [1]:
import pandas as pd
dados = pd.read_csv('https://raw.githubusercontent.com/cassiasamp/calculadora-de-imoveis-mar-21/main/manipulacao-de-dados/dados_tratados_ml.csv')

In [2]:
dados.head()

Unnamed: 0,aluguel,zona,area,quarto
0,1000.0,sul,48.0,2.0
1,1200.0,sul,67.0,3.0
2,1300.0,sul,65.0,2.0
3,1280.0,sul,65.0,2.0
4,1100.0,sul,50.0,2.0


In [3]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1152 entries, 0 to 1151
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   aluguel  1152 non-null   float64
 1   zona     1152 non-null   object 
 2   area     1152 non-null   float64
 3   quarto   1152 non-null   float64
dtypes: float64(3), object(1)
memory usage: 36.1+ KB


In [4]:
X = dados.drop(columns=['aluguel'])

In [5]:
X.isna().sum()

zona      0
area      0
quarto    0
dtype: int64

In [6]:
X = pd.get_dummies(X)

In [7]:
X.head()

Unnamed: 0,area,quarto,zona_leste,zona_norte,zona_oeste,zona_sul
0,48.0,2.0,0,0,0,1
1,67.0,3.0,0,0,0,1
2,65.0,2.0,0,0,0,1
3,65.0,2.0,0,0,0,1
4,50.0,2.0,0,0,0,1


In [8]:
y = dados['aluguel']

In [9]:
y.isna().sum()

0

In [10]:
from sklearn.model_selection import train_test_split

SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state = SEED,
                                                    test_size = 0.25)

### 3.2 Modelo "Baseline" (Ponto de Partida)

In [11]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.267445717553804

In [12]:
from sklearn.dummy import DummyRegressor
reg = DummyRegressor(strategy='mean')
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

-0.012722806117509977

### 3.3 Fazer triagem entre diferentes estimadores

In [17]:
from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LassoLars, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [18]:
reg_list = [RidgeCV(),
            LGBMRegressor(), 
            XGBRegressor(objective='reg:squarederror'),
            SVR(),
            KNeighborsRegressor(),
            RandomForestRegressor(),
            AdaBoostRegressor(),
            GradientBoostingRegressor(),
            MLPRegressor()
            ]

In [19]:
from sklearn.model_selection import cross_val_score
import numpy as np

for reg in reg_list:
    print(f'Treinando Modelo {reg.__class__.__name__}')
    reg.fit(X_train, y_train)
    
    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)
    
    print(f"R2 Score Train: {train_score}")
    print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R2 Score Test: {test_score}")
    print('='*80)

Treinando Modelo RidgeCV
R2 Score Train: 0.4581210448683818
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.2695055663797735
Treinando Modelo LGBMRegressor
R2 Score Train: 0.5934991352177392
R2 Score Valid: 0.38 +- 0.07
R2 Score Test: 0.4024894770956645
Treinando Modelo XGBRegressor
R2 Score Train: 0.8285811106579803
R2 Score Valid: 0.12 +- 0.19
R2 Score Test: 0.33734910110716687
Treinando Modelo SVR
R2 Score Train: -0.05290574612528798
R2 Score Valid: -0.06 +- 0.05
R2 Score Test: -0.01807220076769367
Treinando Modelo KNeighborsRegressor
R2 Score Train: 0.523835009863003
R2 Score Valid: 0.21 +- 0.15
R2 Score Test: 0.3747385483134712
Treinando Modelo RandomForestRegressor
R2 Score Train: 0.7786625360132844
R2 Score Valid: 0.26 +- 0.12
R2 Score Test: 0.34970813836842096
Treinando Modelo AdaBoostRegressor
R2 Score Train: 0.35220915451113166
R2 Score Valid: 0.19 +- 0.19
R2 Score Test: 0.011079922024993527
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.6424235817962513
R2 Score 



R2 Score Train: 0.264366647265332
R2 Score Valid: 0.20 +- 0.18
R2 Score Test: -0.23785975583153052




In [20]:
# Bonus: Testando com todos os regressores do sklearn
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='regressor')

relatorio = {'nome':[],
             'train_score':[],
             'cv_scores_mean':[],
             'test_score':[],
             'estimador':[]
             }

ignore_list = ['IsotonicRegression',
 'MultiOutputRegressor',
 'ElasticNet',
 'MultiTaskElasticNet',
 'MultiTaskElasticNetCV',
 'MultiTaskLasso',
 'MultiTaskLassoCV',
 'RadiusNeighborsRegressor',
 'RegressorChain',
 'StackingRegressor',
 'VotingRegressor']


In [21]:
estimators.extend(
    [('LGBMRegressor', LGBMRegressor),
     ('XGBRegressor', XGBRegressor)]
)

In [22]:
for name, RegressorClass in estimators:
  if name not in ignore_list:
    print(f'Treinando Modelo {name}')
    reg = RegressorClass()
    reg.fit(X_train, y_train)

    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)

    print(f"R2 Score Train: {train_score}")
    print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
    print(f"R2 Score Test: {test_score}")
    print('='*80)

    relatorio['nome'].append(name)
    relatorio['train_score'].append(train_score)
    relatorio['cv_scores_mean'].append(np.mean(cv_scores))
    relatorio['test_score'].append(test_score)
    relatorio['estimador'].append(reg)

Treinando Modelo ARDRegression
R2 Score Train: 0.4582655590036737
R2 Score Valid: 0.42 +- 0.07
R2 Score Test: 0.26675860576821575
Treinando Modelo AdaBoostRegressor
R2 Score Train: 0.33479054619233783
R2 Score Valid: 0.18 +- 0.18
R2 Score Test: 0.017317613030278567
Treinando Modelo BaggingRegressor
R2 Score Train: 0.753025563851684
R2 Score Valid: 0.27 +- 0.11
R2 Score Test: 0.2587121042109928
Treinando Modelo BayesianRidge
R2 Score Train: 0.45820267291635575
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.2690466546246023
Treinando Modelo CCA
R2 Score Train: 0.3031613462213182
R2 Score Valid: 0.22 +- 0.21
R2 Score Test: -0.4838806801486233
Treinando Modelo DecisionTreeRegressor




R2 Score Train: 0.8381916859631515
R2 Score Valid: -0.06 +- 0.28
R2 Score Test: -0.04218809463764961
Treinando Modelo DummyRegressor
R2 Score Train: 0.0
R2 Score Valid: -0.02 +- 0.02
R2 Score Test: -0.012722806117509977
Treinando Modelo ElasticNetCV
R2 Score Train: 0.3637139154260802
R2 Score Valid: 0.31 +- 0.11
R2 Score Test: 0.2003675862457801
Treinando Modelo ExtraTreeRegressor
R2 Score Train: 0.8381916859631515
R2 Score Valid: 0.07 +- 0.23
R2 Score Test: 0.07692493009157486
Treinando Modelo ExtraTreesRegressor
R2 Score Train: 0.8381915903506333
R2 Score Valid: 0.20 +- 0.16
R2 Score Test: 0.2555816123256752
Treinando Modelo GammaRegressor
R2 Score Train: 0.37827582760557465
R2 Score Valid: 0.35 +- 0.05
R2 Score Test: 0.323905342341279
Treinando Modelo GaussianProcessRegressor


  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)


R2 Score Train: 0.8381916859559063
R2 Score Valid: -0.29 +- 0.12
R2 Score Test: -0.09790560042544438
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.6424235817962513
R2 Score Valid: 0.32 +- 0.08
R2 Score Test: 0.2608135121523971
Treinando Modelo HistGradientBoostingRegressor
R2 Score Train: 0.5981073857486785
R2 Score Valid: 0.37 +- 0.09
R2 Score Test: 0.4057661423383503
Treinando Modelo HuberRegressor
R2 Score Train: 0.42133876897818645
R2 Score Valid: 0.39 +- 0.04
R2 Score Test: 0.3739162381181408
Treinando Modelo KNeighborsRegressor
R2 Score Train: 0.523835009863003
R2 Score Valid: 0.21 +- 0.15
R2 Score Test: 0.3747385483134712
Treinando Modelo KernelRidge
R2 Score Train: 0.45825255367964257
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.2685366489969794
Treinando Modelo Lars
R2 Score Train: 0.4583095625580632
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.2674457175537984
Treinando Modelo LarsCV




R2 Score Train: 0.45788670140266097
R2 Score Valid: 0.42 +- 0.07
R2 Score Test: 0.27941602502309026
Treinando Modelo Lasso
R2 Score Train: 0.45830370145689325
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.26715562647043023
Treinando Modelo LassoCV
R2 Score Train: 0.4389978140428077
R2 Score Valid: 0.40 +- 0.09
R2 Score Test: 0.2646415348912624
Treinando Modelo LassoLars
R2 Score Train: 0.4573382630073526
R2 Score Valid: 0.42 +- 0.07
R2 Score Test: 0.2863730194466877
Treinando Modelo LassoLarsCV
R2 Score Train: 0.45788670140266097
R2 Score Valid: 0.42 +- 0.07
R2 Score Test: 0.27941602502309026
Treinando Modelo LassoLarsIC
R2 Score Train: 0.45817083850230733
R2 Score Valid: 0.42 +- 0.07
R2 Score Test: 0.2732691594513621
Treinando Modelo LinearRegression
R2 Score Train: 0.4583095625580632
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.267445717553804
Treinando Modelo LinearSVR
R2 Score Train: 0.3118091491805205
R2 Score Valid: 0.25 +- 0.14
R2 Score Test: 0.027769596317125167
Treinando Mode



R2 Score Train: 0.26317547679045805
R2 Score Valid: 0.20 +- 0.18
R2 Score Test: -0.22877535940778215
Treinando Modelo NuSVR
R2 Score Train: -0.01094640771192723
R2 Score Valid: -0.02 +- 0.05
R2 Score Test: 0.01742868655640395
Treinando Modelo OrthogonalMatchingPursuit
R2 Score Train: 0.3634412854406043
R2 Score Valid: 0.31 +- 0.11
R2 Score Test: 0.18721176845465193
Treinando Modelo OrthogonalMatchingPursuitCV
R2 Score Train: 0.4582830863931011
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.2647878686864421
Treinando Modelo PLSCanonical
R2 Score Train: 0.04062490041919198
R2 Score Valid: -0.10 +- 0.36
R2 Score Test: -0.7367659008753038
Treinando Modelo PLSRegression
R2 Score Train: 0.44990103845492824
R2 Score Valid: 0.41 +- 0.07
R2 Score Test: 0.31807589776261114
Treinando Modelo PassiveAggressiveRegressor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


R2 Score Train: -1.4416716802548835
R2 Score Valid: -1.67 +- 3.58
R2 Score Test: -4.659978770926227
Treinando Modelo PoissonRegressor
R2 Score Train: 0.4738743254577996
R2 Score Valid: 0.44 +- 0.05
R2 Score Test: 0.21906045847103328
Treinando Modelo RANSACRegressor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


R2 Score Train: 0.32050157689986647
R2 Score Valid: 0.31 +- 0.10
R2 Score Test: 0.3453068875455214
Treinando Modelo RandomForestRegressor
R2 Score Train: 0.7766924401448663
R2 Score Valid: 0.25 +- 0.16
R2 Score Test: 0.34099174105598073
Treinando Modelo Ridge
R2 Score Train: 0.45830751610203424
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.2676853767521
Treinando Modelo RidgeCV
R2 Score Train: 0.4581210448683818
R2 Score Valid: 0.42 +- 0.08
R2 Score Test: 0.2695055663797735
Treinando Modelo SGDRegressor
R2 Score Train: -1.023223203640987e+17
R2 Score Valid: -3788721462898056192.00 +- 6772915353790668800.00
R2 Score Test: -1.7967794938445386e+17
Treinando Modelo SVR
R2 Score Train: -0.05290574612528798
R2 Score Valid: -0.06 +- 0.05
R2 Score Test: -0.01807220076769367
Treinando Modelo TheilSenRegressor
R2 Score Train: 0.43211426622184257
R2 Score Valid: 0.40 +- 0.06
R2 Score Test: 0.29954339852379686
Treinando Modelo TransformedTargetRegressor
R2 Score Train: 0.4583095625580632
R2 Score 

In [23]:
relatorio = pd.DataFrame(relatorio).sort_values(by='cv_scores_mean', ascending=False)
relatorio.head(10)

Unnamed: 0,nome,train_score,cv_scores_mean,test_score,estimador
33,PoissonRegressor,0.473874,0.442586,0.21906,PoissonRegressor()
29,OrthogonalMatchingPursuitCV,0.458283,0.419052,0.264788,OrthogonalMatchingPursuitCV()
21,LassoLars,0.457338,0.41796,0.286373,LassoLars()
0,ARDRegression,0.458266,0.417843,0.266759,ARDRegression()
23,LassoLarsIC,0.458171,0.416688,0.273269,LassoLarsIC()
22,LassoLarsCV,0.457887,0.41666,0.279416,LassoLarsCV()
18,LarsCV,0.457887,0.416659,0.279416,LarsCV()
19,Lasso,0.458304,0.416394,0.267156,Lasso()
3,BayesianRidge,0.458203,0.416388,0.269047,BayesianRidge()
36,Ridge,0.458308,0.416327,0.267685,Ridge()


### 3.4 TODO: Calibrar melhores estimadores usando GridSearchCV

In [24]:
## Usar GridSearchCV para calibrar os melhores estimadores
from sklearn.model_selection import GridSearchCV

### 3.5 (Opcional) Combinar os melhores estimadores usando Stacking

In [25]:
from sklearn.ensemble import StackingRegressor

top_3_regs = relatorio[['nome', 'estimador']].values[:3]

reg = StackingRegressor(
    estimators=top_3_regs
)

reg.fit(X_train, y_train)

train_score = reg.score(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train)
test_score = reg.score(X_test, y_test)

print(f"R2 Score Train: {train_score}")
print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
print(f"R2 Score Test: {test_score}")
print('='*80)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATION

R2 Score Train: 0.45948464934473476
R2 Score Valid: 0.42 +- 0.07
R2 Score Test: 0.12177210994289722


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATION

In [26]:
reg

StackingRegressor(estimators=array([['PoissonRegressor', PoissonRegressor()],
       ['OrthogonalMatchingPursuitCV', OrthogonalMatchingPursuitCV()],
       ['LassoLars', LassoLars()]], dtype=object))

## 4. Exportar Modelo

In [27]:
import pickle
pickle.dump(reg, open('regressor.pkl', 'wb'), protocol=4)

In [28]:
ls

[0m[01;34marquivos[0m/                          descricao_vaga_uni.txt
awari-ds-aula7-numpy-pandas.ipynb  [01;34mdriver[0m/
boas_praticas_ml.ipynb             imports_de_dados.ipynb
coletor_vagas.py                   regressor.pkl


In [26]:
%reset -f

In [29]:
import pickle
reg = pickle.load(open('regressor.pkl', 'rb'))

In [30]:
reg

StackingRegressor(estimators=array([['PoissonRegressor', PoissonRegressor()],
       ['OrthogonalMatchingPursuitCV', OrthogonalMatchingPursuitCV()],
       ['LassoLars', LassoLars()]], dtype=object))

In [34]:
import numpy as np
reg.predict([[120, 2, 0, 0, 0, 1]])

array([3795.93508108])