In [2]:
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as r2
import pandas as pd

In [3]:
# Load the cleaned dataset; the first CSV column is used as the row index.

df=pd.read_csv('df_limpio.csv',index_col=0) 

In [4]:
# Display the loaded frame (pandas abbreviates it to the first and last rows).
df

Unnamed: 0,tipo,barrio,sup,pricem2,habs
0,PH,Mataderos,55.0,1127.27,2.0
1,apartment,Mataderos,55.0,1309.09,2.0
2,apartment,Belgrano,45.0,3066.67,1.0
3,apartment,Belgrano,65.0,3000.00,2.0
4,apartment,Palermo,50.0,2234.00,1.0
...,...,...,...,...,...
26356,apartment,Belgrano,38.0,3368.42,1.0
26357,apartment,Recoleta,44.0,3750.00,2.0
26358,apartment,Belgrano,157.0,2611.46,4.0
26359,apartment,Belgrano,157.0,2611.46,4.0


In [6]:
# Separate the `pricem2` target from the predictor columns.
y = df['pricem2']
X = df.drop(columns='pricem2')

In [7]:
# Transformers: one-hot encode the two categorical columns and pass the
# remaining columns (`sup`, `habs`) through untouched.
# The categorical columns are selected by name rather than by position so the
# encoder keeps targeting `tipo` and `barrio` even if the column order of X
# changes.
Preprocesamiento = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), ['tipo', 'barrio'])],
    remainder='passthrough'
)
X_dummies = Preprocesamiento.fit_transform(X)
# `.shape` is available on both sparse and dense results, so there is no need
# to materialise a dense copy with `.toarray()` just to inspect the dimensions.
X_dummies.shape

(24733, 63)

In [74]:
# For comparison only: pandas' own one-hot encoding of X. The result is
# displayed but not stored, so nothing downstream depends on it.
pd.get_dummies(X)

Unnamed: 0,sup,habs,tipo_PH,tipo_apartment,tipo_house,tipo_store,barrio_Abasto,barrio_Agronomía,barrio_Almagro,barrio_Balvanera,...,barrio_Villa Lugano,barrio_Villa Luro,barrio_Villa Ortuzar,barrio_Villa Pueyrredón,barrio_Villa Real,barrio_Villa Riachuelo,barrio_Villa Santa Rita,barrio_Villa Soldati,barrio_Villa Urquiza,barrio_Villa del Parque
0,55.0,2.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,55.0,2.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,45.0,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,65.0,2.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50.0,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26356,38.0,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26357,44.0,2.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26358,157.0,4.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26359,157.0,4.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Split into train and test sets. The seed is fixed so the split, and every
# score computed from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=42)

# Instantiate and train the model. The forest is seeded for the same reason:
# its bootstrap sampling of the training rows is random.
rf = RF(random_state=42)
rf.fit(X_train, y_train)

In [9]:
# Predict on the held-out test split.
y_pred=rf.predict(X_test)

In [10]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2(y_test,y_pred)

0.5130208258620291

# ¿Cómo podemos procesar nuevas predicciones?

Los nuevos datos tienen que poder ser alimentados al método `.predict()`

Tienen que tener la misma estructura que `X_test`

In [11]:
# Shape of the test matrix: (rows, encoded feature columns). New inputs must
# have the same number of columns to be accepted by `rf.predict`.
X_test.shape

(6184, 63)

La aplicación web solo recibe los siguientes datos:

`tipo`
`barrio`
`superficie`
`habitaciones`

Debemos crear una función que preprocese estos 4 inputs en las 63 columnas que espera el modelo

Partamos del ejemplo de un inmueble con estas características:

PH, Recoleta, 30m2 y 3 habitaciones

```python
tipo='PH'
barrio='Recoleta'
sup=30
habs=3
```

In [34]:
# Example listing used to exercise the prediction flow below.
tipo, barrio, sup, habs = 'PH', 'Recoleta', 30, 3

In [92]:
# Build the single-row input frame in one call. Creating an empty frame and
# filling it through `.loc` leaves every column with object dtype, including
# the numeric `sup` and `habs`; constructing it from a dict lets pandas infer
# a proper dtype per column.
X_pred = pd.DataFrame({
    'tipo': [tipo],
    'barrio': [barrio],
    'sup': [sup],
    'habs': [habs],
})

In [94]:
# Show the single-row input frame before it is encoded.
X_pred

Unnamed: 0,tipo,barrio,sup,habs
0,PH,Recoleta,30,3


In [95]:
# Encode the new row with the already-fitted preprocessor (transform only, no refit).
X_pred_dummies=Preprocesamiento.transform(X_pred)

# Modelo listo para la producción

In [96]:
# Predict for the encoded row; the result is a one-element array.
rf.predict(X_pred_dummies)

array([3173.02012359])

In [97]:
def pipeline(tipo, barrio, sup, habs, preprocesador=None, modelo=None):
    """Predict `pricem2` for a single property from its four raw attributes.

    Parameters
    ----------
    tipo : str
        Property type, spelled as in the training data (e.g. 'PH', 'apartment').
    barrio : str
        Neighbourhood, spelled as in the training data (e.g. 'Recoleta').
    sup : int or float
        Value for the `sup` column.
    habs : int or float
        Value for the `habs` column.
    preprocesador : object with ``transform``, optional
        Fitted preprocessor. Defaults to the notebook-level `Preprocesamiento`.
    modelo : object with ``predict``, optional
        Fitted estimator. Defaults to the notebook-level `rf`.

    Returns
    -------
    The first (and only) element of the estimator's prediction.

    Notes
    -----
    Errors raised by the preprocessor or the estimator propagate unchanged.
    The notebook's encoder is a default `OneHotEncoder()`, which rejects
    categories it did not see while fitting.
    """
    # Fall back to the fitted objects defined earlier in the notebook.
    if preprocesador is None:
        preprocesador = Preprocesamiento
    if modelo is None:
        modelo = rf

    # Build the one-row frame in a single call so `sup` and `habs` keep a
    # numeric dtype instead of the object dtype produced by `.loc` enlargement.
    X_pred = pd.DataFrame({
        'tipo': [tipo],
        'barrio': [barrio],
        'sup': [sup],
        'habs': [habs],
    })
    X_pred_dummies = preprocesador.transform(X_pred)
    return modelo.predict(X_pred_dummies)[0]

In [98]:
# Try the helper end to end with a different listing.
pipeline('apartment','Almagro',40,3)

2143.520446284271

# Persistencia

In [12]:
from joblib import dump
# Persist the trained forest together with the fitted preprocessor in one file:
# both are required to turn raw inputs into a prediction (see `pipeline`).
# compress=('gzip', 9) requests gzip compression at level 9.
dump([rf,Preprocesamiento], 'modelo_zip.joblib',compress=('gzip',9)) 

['modelo_zip.joblib']