In [1]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler  

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

# https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_catboostregressor-docpage/

In [2]:
def obtener_rmse(col_true, col_pred):
    return mean_squared_error(col_true, col_pred)**0.5

In [3]:
propiedades = pd.read_csv('/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/datos para el tp2/set_datos_propiedades.csv')

In [4]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull()),\
                             ['place_name_encoded', 'property_type_encoded','price_aprox_usd','superficie',\
                             'Year','Month','seguridad','aire','gimnasio','cochera','pileta']]

In [6]:
propiedades.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132495 entries, 0 to 1413024
Data columns (total 11 columns):
place_name_encoded       1132495 non-null int64
property_type_encoded    1132495 non-null int64
price_aprox_usd          1132495 non-null float64
superficie               1132495 non-null float64
Year                     1132495 non-null int64
Month                    1132495 non-null int64
seguridad                1132495 non-null bool
aire                     1132495 non-null bool
gimnasio                 1132495 non-null bool
cochera                  1132495 non-null bool
pileta                   1132495 non-null bool
dtypes: bool(5), float64(2), int64(4)
memory usage: 65.9 MB


# Catboost

In [5]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [6]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [12]:
cat = CatBoostRegressor()
cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
print('Error: ', error)

('Error: ', 64837103002.541389)


## Ahora que tenemos una intuicion, probamos cambiando los parametros

In [15]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [16]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

res = []

In [21]:
loss_function = ['RMSE', 'LogLinQuantile', 'MAE', 'MAPE', 'Poisson', 'Quantile', ]
depth = [1, 3, 7]
learning = [0.1, 0.3, 0.5]

for d in depth:
    for l in learning:
        for lf in loss_function:
            try:
                cat = CatBoostRegressor(loss_function = lf, depth = d, learning_rate = l)
                cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
                set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
                error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
                res.append((d, l, lf, error))
                print(d, l, lf)
            except:
                continue

(1, 0.1, 'RMSE')
(1, 0.1, 'LogLinQuantile')
(1, 0.1, 'MAE')
(1, 0.1, 'MAPE')
(1, 0.1, 'Quantile')
(1, 0.3, 'RMSE')
(1, 0.3, 'MAE')
(1, 0.3, 'MAPE')
(1, 0.3, 'Quantile')
(1, 0.5, 'RMSE')
(1, 0.5, 'MAE')
(1, 0.5, 'MAPE')
(1, 0.5, 'Quantile')
(3, 0.1, 'RMSE')
(3, 0.1, 'LogLinQuantile')
(3, 0.1, 'MAE')
(3, 0.1, 'MAPE')
Training has stopped (degenerate solution on iteration 303, probably too small l2-regularization, try to increase it)
(3, 0.1, 'Quantile')
(3, 0.3, 'RMSE')
Training has stopped (degenerate solution on iteration 33, probably too small l2-regularization, try to increase it)
(3, 0.3, 'MAE')
(3, 0.3, 'MAPE')
Training has stopped (degenerate solution on iteration 143, probably too small l2-regularization, try to increase it)
(3, 0.3, 'Quantile')
(3, 0.5, 'RMSE')
(3, 0.5, 'MAE')
(3, 0.5, 'MAPE')
Training has stopped (degenerate solution on iteration 27, probably too small l2-regularization, try to increase it)
(3, 0.5, 'Quantile')
(7, 0.1, 'RMSE')
(7, 0.1, 'LogLinQuantile')
(7, 0.

In [22]:
for r in res:
    print("depth = {}, learning_rate = {}, loss_function = {},  error = {}".format(r[0],r[1],r[2],r[3]))

depth = 1, learning_rate = 0.1, loss_function = RMSE,  error = 80955325787.0
depth = 1, learning_rate = 0.1, loss_function = LogLinQuantile,  error = 2.82096904532e+11
depth = 1, learning_rate = 0.1, loss_function = MAE,  error = 2.08101531638e+11
depth = 1, learning_rate = 0.1, loss_function = MAPE,  error = 2.08114774078e+11
depth = 1, learning_rate = 0.1, loss_function = RMSE,  error = 80859322386.8
depth = 1, learning_rate = 0.1, loss_function = LogLinQuantile,  error = 2.81016221648e+11
depth = 1, learning_rate = 0.1, loss_function = MAE,  error = 2.08101531639e+11
depth = 1, learning_rate = 0.1, loss_function = MAPE,  error = 2.08114774078e+11
depth = 1, learning_rate = 0.1, loss_function = Quantile,  error = 2.0810153164e+11
depth = 1, learning_rate = 0.3, loss_function = RMSE,  error = 79650734679.2
depth = 1, learning_rate = 0.3, loss_function = MAE,  error = 2.08075050195e+11
depth = 1, learning_rate = 0.3, loss_function = MAPE,  error = 2.08114773774e+11
depth = 1, learning_

In [25]:
min_error = float('inf')
tupla_min_error = ()
for r in res:
    if r[3] < min_error:
        min_error = r[3]
        tupla_min_error = r
        
print("depth = {}, learning_rate = {}, loss_function = {},  error = {}".\
    format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3]))

depth = 7, learning_rate = 0.5, loss_function = RMSE,  error = 49613284230.6


#### Se puede observar que los mejores resultados se obtienen con loss_funcion = RMSE. Por lo que continuaremos con esta funcion y variando los otros parametros para intentar alcanzar un mejor resultado

In [28]:
lf = 'RMSE'
leaf_estimation_method = ['Newton', 'Gradient']
depth = [3, 7, 9]
learning = [0.3, 0.5, 0.7]

for d in depth:
    for l in learning:
        for lem in leaf_estimation_method:
            cat = CatBoostRegressor(loss_function = lf, depth = d, learning_rate = l, leaf_estimation_method = lem)
            cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
            set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
            error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
            res.append((d, l, lf, lem, error))
            print(d, l, lem)

(3, 0.3, 'Newton')
(3, 0.3, 'Gradient')
(3, 0.5, 'Newton')
(3, 0.5, 'Gradient')
(3, 0.7, 'Newton')
(3, 0.7, 'Gradient')
(7, 0.3, 'Newton')
(7, 0.3, 'Gradient')
(7, 0.5, 'Newton')
(7, 0.5, 'Gradient')
(7, 0.7, 'Newton')
(7, 0.7, 'Gradient')
(9, 0.3, 'Newton')
(9, 0.3, 'Gradient')
(9, 0.5, 'Newton')
(9, 0.5, 'Gradient')
(9, 0.7, 'Newton')
(9, 0.7, 'Gradient')


In [37]:
for r in res[43:]:
    print("depth = {}, learning_rate = {}, loss_function = {}, leaf_estimation_method {}, error = {}".format(r[0],r[1],r[2],r[3], r[4]))

depth = 3, learning_rate = 0.3, loss_function = RMSE, leaf_estimation_method Newton, error = 61739913464.5
depth = 3, learning_rate = 0.3, loss_function = RMSE, leaf_estimation_method Gradient, error = 63088049605.2
depth = 3, learning_rate = 0.5, loss_function = RMSE, leaf_estimation_method Newton, error = 62436188793.4
depth = 3, learning_rate = 0.5, loss_function = RMSE, leaf_estimation_method Gradient, error = 61846263745.1
depth = 3, learning_rate = 0.7, loss_function = RMSE, leaf_estimation_method Newton, error = 61185064987.1
depth = 3, learning_rate = 0.7, loss_function = RMSE, leaf_estimation_method Gradient, error = 61443317708.3
depth = 7, learning_rate = 0.3, loss_function = RMSE, leaf_estimation_method Newton, error = 50664512437.9
depth = 7, learning_rate = 0.3, loss_function = RMSE, leaf_estimation_method Gradient, error = 48594860802.2
depth = 7, learning_rate = 0.5, loss_function = RMSE, leaf_estimation_method Newton, error = 48346598053.9
depth = 7, learning_rate = 0.

In [38]:
min_error = float('inf')
tupla_min_error = ()
for r in res[43:]:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
        
print("depth = {}, learning_rate = {}, loss_function = {},  leaf_estimation_method {},error = {}".\
    format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

depth = 9, learning_rate = 0.7, loss_function = RMSE,  leaf_estimation_method Newton,error = 43683632431.8


## Elegimos depth = 9, learning_rate = 0.7, loss_function = RMSE,  leaf_estimation_method Newton

In [42]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [43]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [48]:
lf = 'RMSE'
lem = 'Newton'
d = 9
l = 0.7

cat = CatBoostRegressor(loss_function = lf, depth = d, learning_rate = l, leaf_estimation_method = lem)
cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
print(error)

44513170231.5


# calculo con los verdaderos datos a analizar

In [49]:
analizar = pd.read_csv("/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/datos para el tp2/properati_dataset_modificado.csv")

In [50]:
analizar.loc[:,'price_usd'] = cat.predict(Pool(analizar.loc[:,columnas]))

In [51]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.423444e+05
std      2.777354e+05
min     -1.270310e+06
25%      1.113211e+05
50%      1.626517e+05
75%      2.713292e+05
max      8.849407e+06
Name: price_usd, dtype: float64

In [52]:
# Las predicciones con precio negativo las convierto en valores positivos
analizar.loc[:,'price_usd'] = analizar.loc[:,"price_usd"].apply(lambda x: abs(x))

In [53]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.439598e+05
std      2.763174e+05
min      5.011507e+03
25%      1.118089e+05
50%      1.630035e+05
75%      2.722750e+05
max      8.849407e+06
Name: price_usd, dtype: float64

In [54]:
resultado = analizar.loc[:,['id','price_usd']]

In [55]:
resultado.to_csv('resultados/catboost_resultados.csv', index = False)

## Utilizando Standard Scaler

In [6]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [7]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [8]:
scaler = StandardScaler()  
scaler.fit(set_entrenamiento_datos)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [12]:
lf = 'RMSE'
lem = 'Newton'
d = 9
l = 0.7

cat = CatBoostRegressor(loss_function = lf, depth = d, learning_rate = l, leaf_estimation_method = lem)
cat.fit(Pool(scaler.transform(set_entrenamiento_datos),set_entrenamiento_resultado))
set_pruebas.loc[:,'resultado'] = cat.predict(Pool(scaler.transform(set_pruebas.loc[:,columnas])))
error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
print(error)

47316459550.5


#### Los resultados obtenidos con standard scaler son peores que los obtenidos sin standard scaler

# Ahora uso lat y lon en lugar de place name

In [4]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull())\
                              & (propiedades.lat.notnull()) & (propiedades.lon.notnull()),\
                             ['lat', 'lon', 'property_type_encoded','price_aprox_usd','superficie',\
                             'Year','Month','seguridad','aire','gimnasio','cochera','pileta']]

In [5]:
columnas = ['superficie','lat', 'lon','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [6]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [7]:
cat = CatBoostRegressor()
cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
print(error)

46624066719.2


In [12]:
lf = 'RMSE'
lem = 'Newton'
d = 9
l = 0.7

cat = CatBoostRegressor(loss_function = lf, depth = d, learning_rate = l, leaf_estimation_method = lem)
cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
print(error)

21465356596.3


In [13]:
res = []

In [14]:
loss_function = ['RMSE', 'LogLinQuantile', 'MAE', 'MAPE', 'Poisson', 'Quantile', ]
depth = [1, 3, 7]
learning = [0.1, 0.3, 0.5]

for d in depth:
    for l in learning:
        for lf in loss_function:
            try:
                cat = CatBoostRegressor(loss_function = lf, depth = d, learning_rate = l)
                cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
                set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
                error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
                res.append((d, l, lf, error))
                print(d, l, lf)
            except:
                continue

(1, 0.1, 'RMSE')
(1, 0.1, 'LogLinQuantile')
(1, 0.1, 'MAE')
(1, 0.1, 'MAPE')
(1, 0.1, 'Quantile')
(1, 0.3, 'RMSE')
(1, 0.3, 'MAE')
(1, 0.3, 'MAPE')
(1, 0.3, 'Quantile')
(1, 0.5, 'RMSE')
(1, 0.5, 'MAE')
(1, 0.5, 'MAPE')
(1, 0.5, 'Quantile')
(3, 0.1, 'RMSE')
(3, 0.1, 'LogLinQuantile')
(3, 0.1, 'MAE')
(3, 0.1, 'MAPE')
Training has stopped (degenerate solution on iteration 307, probably too small l2-regularization, try to increase it)
(3, 0.1, 'Quantile')
(3, 0.3, 'RMSE')
(3, 0.3, 'MAE')
(3, 0.3, 'MAPE')
Training has stopped (degenerate solution on iteration 25, probably too small l2-regularization, try to increase it)
(3, 0.3, 'Quantile')
(3, 0.5, 'RMSE')
(3, 0.5, 'MAE')
(3, 0.5, 'MAPE')
Training has stopped (degenerate solution on iteration 61, probably too small l2-regularization, try to increase it)
(3, 0.5, 'Quantile')
(7, 0.1, 'RMSE')
(7, 0.1, 'LogLinQuantile')
(7, 0.1, 'MAE')
(7, 0.1, 'MAPE')
Training has stopped (degenerate solution on iteration 223, probably too small l2-regulariz

In [15]:
for r in res:
    print("depth = {}, learning_rate = {}, loss_function = {},  error = {}".format(r[0],r[1],r[2],r[3]))

depth = 1, learning_rate = 0.1, loss_function = RMSE,  error = 68395823536.0
depth = 1, learning_rate = 0.1, loss_function = LogLinQuantile,  error = 2.45760139419e+11
depth = 1, learning_rate = 0.1, loss_function = MAE,  error = 1.76633662069e+11
depth = 1, learning_rate = 0.1, loss_function = MAPE,  error = 1.76645996248e+11
depth = 1, learning_rate = 0.1, loss_function = Quantile,  error = 1.7663366207e+11
depth = 1, learning_rate = 0.3, loss_function = RMSE,  error = 67909222571.9
depth = 1, learning_rate = 0.3, loss_function = MAE,  error = 1.76608997162e+11
depth = 1, learning_rate = 0.3, loss_function = MAPE,  error = 1.76645995947e+11
depth = 1, learning_rate = 0.3, loss_function = Quantile,  error = 1.76608997139e+11
depth = 1, learning_rate = 0.5, loss_function = RMSE,  error = 67969670620.1
depth = 1, learning_rate = 0.5, loss_function = MAE,  error = 1.76584337257e+11
depth = 1, learning_rate = 0.5, loss_function = MAPE,  error = 1.76645995645e+11
depth = 1, learning_rate =

In [16]:
min_error = float('inf')
tupla_min_error = ()
for r in res:
    if r[3] < min_error:
        min_error = r[3]
        tupla_min_error = r
        
print("depth = {}, learning_rate = {}, loss_function = {},  error = {}".\
    format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3]))

depth = 7, learning_rate = 0.5, loss_function = RMSE,  error = 28964960487.9


In [17]:
lf = 'RMSE'
leaf_estimation_method = ['Newton', 'Gradient']
depth = [3, 7, 9]
learning = [0.3, 0.5, 0.7]

for d in depth:
    for l in learning:
        for lem in leaf_estimation_method:
            cat = CatBoostRegressor(loss_function = lf, depth = d, learning_rate = l, leaf_estimation_method = lem)
            cat.fit(Pool(set_entrenamiento_datos,set_entrenamiento_resultado))
            set_pruebas.loc[:,'resultado'] = cat.predict(Pool(set_pruebas.loc[:,columnas]))
            error = mean_squared_error(set_pruebas.price_aprox_usd, set_pruebas.resultado)
            res.append((d, l, lf, lem, error))
            print(d, l, lem)

(3, 0.3, 'Newton')
(3, 0.3, 'Gradient')
(3, 0.5, 'Newton')
(3, 0.5, 'Gradient')
(3, 0.7, 'Newton')
(3, 0.7, 'Gradient')
(7, 0.3, 'Newton')
(7, 0.3, 'Gradient')
(7, 0.5, 'Newton')
(7, 0.5, 'Gradient')
(7, 0.7, 'Newton')
(7, 0.7, 'Gradient')
(9, 0.3, 'Newton')
(9, 0.3, 'Gradient')
(9, 0.5, 'Newton')
(9, 0.5, 'Gradient')
(9, 0.7, 'Newton')
(9, 0.7, 'Gradient')


In [18]:
for r in res[43:]:
    print("depth = {}, learning_rate = {}, loss_function = {}, leaf_estimation_method = {}, error = {}".format(r[0],r[1],r[2],r[3], r[4]))

depth = 3, learning_rate = 0.7, loss_function = RMSE, leaf_estimation_method = Newton, error = 49463450903.8
depth = 3, learning_rate = 0.7, loss_function = RMSE, leaf_estimation_method = Gradient, error = 47187635004.9
depth = 7, learning_rate = 0.3, loss_function = RMSE, leaf_estimation_method = Newton, error = 32389837138.2
depth = 7, learning_rate = 0.3, loss_function = RMSE, leaf_estimation_method = Gradient, error = 33197632566.7
depth = 7, learning_rate = 0.5, loss_function = RMSE, leaf_estimation_method = Newton, error = 28746005800.9
depth = 7, learning_rate = 0.5, loss_function = RMSE, leaf_estimation_method = Gradient, error = 28999671682.2
depth = 7, learning_rate = 0.7, loss_function = RMSE, leaf_estimation_method = Newton, error = 28137251040.1
depth = 7, learning_rate = 0.7, loss_function = RMSE, leaf_estimation_method = Gradient, error = 28065297773.6
depth = 9, learning_rate = 0.3, loss_function = RMSE, leaf_estimation_method = Newton, error = 26177239354.3
depth = 9, 

In [19]:
min_error = float('inf')
tupla_min_error = ()
for r in res[43:]:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
        
print("depth = {}, learning_rate = {}, loss_function = {},  leaf_estimation_method = {},error = {}".\
    format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

depth = 9, learning_rate = 0.7, loss_function = RMSE,  leaf_estimation_method = Newton,error = 24098199566.8


## Calculo de los verdaderos datos a analizar

In [20]:
analizar = pd.read_csv("/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/tpDatos/properati_dataset_modificado.csv")

In [22]:
analizar.loc[:,'price_usd'] = cat.predict(Pool(analizar.loc[:,columnas]))

In [23]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.710392e+05
std      4.407396e+05
min     -1.693840e+06
25%      1.066302e+05
50%      1.562886e+05
75%      2.721992e+05
max      1.888102e+07
Name: price_usd, dtype: float64

In [26]:
analizar.loc[:, 'price_usd'] = analizar.loc[:, 'price_usd'].apply(lambda x: abs(x))

In [27]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.744049e+05
std      4.386519e+05
min      5.672223e+02
25%      1.070302e+05
50%      1.569190e+05
75%      2.734567e+05
max      1.888102e+07
Name: price_usd, dtype: float64

In [28]:
resultado = analizar.loc[:,['id','price_usd']]
resultado.to_csv('resultados/Catboost_lat_lon_resultados.csv', index = False)