In [6]:
import pickle as pk
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [21]:
# Single random seed passed as `random_state` to the train/test split and to the
# tree-based estimators below, so that re-running the notebook repeats the same
# split and the same fitted models.
SEED = 1234

In [8]:
# Load the dataset from a path relative to the kernel's working directory and
# preview its first rows.
data = pd.read_csv("../data/casas.csv")
data.head()

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


In [9]:
# Count missing values per column before modelling.
data.isna().sum()

tamanho    0
ano        0
garagem    0
preco      0
dtype: int64

In [10]:
# Inspect the dtype of every column (all numeric in the recorded output).
data.dtypes

tamanho    float64
ano          int64
garagem      int64
preco        int64
dtype: object

In [12]:
# Predictors: every column except the target `preco`.
x = data.drop(columns="preco")
# Regression target.
y = data["preco"]

## Divisão em treino e teste

In [15]:
# Hold out 30% of the rows for evaluation; seeding the shuffle keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=SEED,
)

In [16]:
# Show the training features (pandas abbreviates the middle rows of long frames).
x_train

Unnamed: 0,tamanho,ano,garagem
1017,126.0,1984,2
405,151.0,1976,2
6,157.0,2004,2
388,137.0,1999,2
501,164.0,2005,2
...,...,...,...
1228,158.0,2008,3
1077,102.0,1969,1
1318,166.0,2001,3
723,137.0,1954,1


In [17]:
# Show the held-out test features.
x_test

Unnamed: 0,tamanho,ano,garagem
605,185.0,1965,2
642,251.0,1972,2
993,139.0,2005,2
736,97.0,1950,2
1239,138.0,2006,2
...,...,...,...
805,133.0,2008,2
112,250.0,2007,3
348,151.0,2003,2
205,134.0,1990,2


In [18]:
# Show the training target; its index labels match the rows of x_train.
y_train

1017    187500
405     150000
6       307000
388     191000
501     226700
         ...  
1228    367294
1077    138800
1318    275000
723     135000
815     224900
Name: preco, Length: 1022, dtype: int64

In [19]:
# Show the held-out test target; its index labels match the rows of x_test.
y_test

605     205000
642     345000
993     173900
736      93500
1239    265900
         ...  
805     227680
112     383970
348     154000
205     180500
622     135000
Name: preco, Length: 438, dtype: int64

## Instanciando algoritmos de regressão

In [22]:
# One estimator per candidate algorithm. Every estimator that accepts a
# `random_state` receives SEED so its fit is repeatable.
lm = LinearRegression()
dt = DecisionTreeRegressor(
    random_state=SEED,
    max_depth=10,
)
rf = RandomForestRegressor(
    random_state=SEED,
    n_estimators=190,
    max_depth=8,
)
gb = GradientBoostingRegressor(random_state=SEED)

In [23]:
def metrics(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same row order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'rmse'`` (root mean squared error), ``'mape'`` (mean absolute
        percentage error expressed as a fraction, e.g. 0.14 for 14%), ``'r2'``
        (coefficient of determination) and ``'mae'`` (mean absolute error).

    Notes
    -----
    MAPE is undefined for rows where ``y_true`` is 0; such rows produce
    ``inf``/``nan`` in the result rather than raising.
    """
    # Convert to plain arrays so every metric compares values by position.
    # Arithmetic between two pandas objects aligns on index labels instead,
    # which turns the MAPE into NaN when the indexes differ.
    true_values = np.asarray(y_true, dtype=float)
    pred_values = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(true_values, pred_values)
    mse = mean_squared_error(true_values, pred_values)
    rmse = np.sqrt(mse)
    # abs() on the denominator keeps each term non-negative, so negative
    # targets cannot cancel out the errors of positive ones.
    mape = np.mean(np.abs(pred_values - true_values) / np.abs(true_values))
    r2 = r2_score(true_values, pred_values)
    return {'rmse': rmse, 'mape': mape, 'r2': r2, 'mae': mae}

## Treino e teste

In [24]:
## treinamento dos modelos
# Each estimator is fitted on the training split and then scored on the
# held-out features. The models are independent, so they are grouped per model.

# linear regression
lm.fit(x_train, y_train)
y_pred_lm = lm.predict(x_test)

# random forest
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

# gradient boosting
gb.fit(x_train, y_train)
y_pred_gb = gb.predict(x_test)

# decision tree
dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)

## Obtendo métricas para cada modelo

In [25]:
## gradient boosting
# Test-set metrics for the gradient boosting model.
metrics(y_true=y_test, y_pred=y_pred_gb)

{'rmse': 35990.14511783617,
 'mape': 0.14561870275808456,
 'r2': 0.7408378597152022,
 'mae': 24239.664238387923}

In [26]:
## random forest
# Test-set metrics for the random forest model.
metrics(y_true=y_test, y_pred=y_pred_rf)

{'rmse': 34525.39120530156,
 'mape': 0.14199846114191983,
 'r2': 0.761503736820618,
 'mae': 23488.123638116384}

In [27]:
## linear regression
# Test-set metrics for the linear regression model.
metrics(y_true=y_test, y_pred=y_pred_lm)

{'rmse': 39186.53798532828,
 'mape': 0.17328121157519924,
 'r2': 0.6927597177686979,
 'mae': 28858.425280647614}

In [28]:
## decision tree
# Test-set metrics for the decision tree model.
metrics(y_true=y_test, y_pred=y_pred_dt)

{'rmse': 42222.6735526058,
 'mape': 0.16946319623038944,
 'r2': 0.6433059858455655,
 'mae': 28508.438300179096}

## Salvando modelo

In [30]:
# Persist the fitted random forest, which has the lowest errors and the highest
# R² among the four models in the recorded evaluation outputs.
# NOTE: pickle files can execute arbitrary code when loaded, so only load this
# artifact from a trusted location.
model_path = Path('../models/model_rf.pkl')
# open() does not create missing directories; create ../models first so a fresh
# checkout does not fail with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as arquivo_model:
    pk.dump(rf, arquivo_model)
