In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
import time

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)
#https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html

In [2]:
#propiedades = pd.read_csv('/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/datos para el tp2/set_datos_propiedades.csv')
propiedades = pd.read_csv('../../set_datos_propiedades.csv')

In [3]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull()),\
                             ['place_name_encoded', 'property_type_encoded','price_aprox_usd','superficie',\
                             'Year','Month','seguridad','aire','gimnasio','cochera','pileta']]

In [4]:
propiedades.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132495 entries, 0 to 1413024
Data columns (total 11 columns):
place_name_encoded       1132495 non-null int64
property_type_encoded    1132495 non-null int64
price_aprox_usd          1132495 non-null float64
superficie               1132495 non-null float64
Year                     1132495 non-null int64
Month                    1132495 non-null int64
seguridad                1132495 non-null bool
aire                     1132495 non-null bool
gimnasio                 1132495 non-null bool
cochera                  1132495 non-null bool
pileta                   1132495 non-null bool
dtypes: bool(5), float64(2), int64(4)
memory usage: 65.9 MB


# LightGBM

In [9]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [10]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [29]:
lgbm = lgb.LGBMRegressor()
lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
print("Precision = {}%, error = {}".format(precision, error))

Precision = 0.230676213694%, error = 1.37641955262e+11


### Ahora que tenemos una intuicion, probamos variando los parametros

In [30]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [33]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

res = []

In [34]:
boosting = ['gbdt', 'dart', 'goss', 'rf']
estimators = [10, 20, 50]
learning_rate = [0.1, 0.3, 0.5]

for b in boosting:
    for e in estimators:
        for lr in learning_rate:
            try:
                lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = e, learning_rate = lr)
                lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
                set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
                precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
                error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
                res.append((b, e, lr, precision, error))
                print(b, '-', e, '-', lr)
            except:
                continue

('gbdt', '-', 10, '-', 0.1)
('gbdt', '-', 10, '-', 0.3)
('gbdt', '-', 10, '-', 0.5)
('gbdt', '-', 20, '-', 0.1)
('gbdt', '-', 20, '-', 0.3)
('gbdt', '-', 20, '-', 0.5)
('gbdt', '-', 50, '-', 0.1)
('gbdt', '-', 50, '-', 0.3)
('gbdt', '-', 50, '-', 0.5)
('dart', '-', 10, '-', 0.1)
('dart', '-', 10, '-', 0.3)
('dart', '-', 10, '-', 0.5)
('dart', '-', 20, '-', 0.1)
('dart', '-', 20, '-', 0.3)
('dart', '-', 20, '-', 0.5)
('dart', '-', 50, '-', 0.1)
('dart', '-', 50, '-', 0.3)
('dart', '-', 50, '-', 0.5)
('goss', '-', 10, '-', 0.1)
('goss', '-', 10, '-', 0.3)
('goss', '-', 10, '-', 0.5)
('goss', '-', 20, '-', 0.1)
('goss', '-', 20, '-', 0.3)
('goss', '-', 20, '-', 0.5)
('goss', '-', 50, '-', 0.1)
('goss', '-', 50, '-', 0.3)
('goss', '-', 50, '-', 0.5)


In [36]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.1, precision = 0.23 % , error = 1.37641955262e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.3, precision = 0.23 % , error = 1.37641955262e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.5, precision = 0.23 % , error = 1.37641955262e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.1, precision = 0.46 % , error = 1.37318841383e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.3, precision = 0.46 % , error = 1.37318841383e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.5, precision = 0.46 % , error = 1.37318841383e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.17 % , error = 1.36348628327e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.3, precision = 1.17 % , error = 1.3634861732e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.5, precision = 1.17 % , error = 1.36348603281e+11
boosting_type = dart, n_estim

In [37]:
min_error = float('inf')
max_precision = 0
tupla_min_error = ()
tupla_max_precision = ()
for r in res:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
    if r[3] > max_precision:
        max_precision = r[3]
        tupla_max_precision = r
        
print("Mayor precision = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_max_precision[0],tupla_max_precision[1],tupla_max_precision[2],tupla_max_precision[3], tupla_max_precision[4]))
print("Menor error = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

Mayor precision = boosting_type = goss, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11
Menor error = boosting_type = goss, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11


# Modifico max_depth

In [38]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio','aire','pileta','cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [39]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [40]:
max_depth = [1, 5, 10, 15]

In [41]:
for d in max_depth:
    lgbm = lgb.LGBMRegressor(boosting_type = 'goss', n_estimators = 50, learning_rate = 0.5, max_depth = d)
    lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
    set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
    precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
    error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
    res.append((b, e, lr, precision, error))
    print(b, '-', e, '-', lr)

('rf', '-', 50, '-', 0.5)
('rf', '-', 50, '-', 0.5)
('rf', '-', 50, '-', 0.5)
('rf', '-', 50, '-', 0.5)


In [42]:
for r in res[27:]:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.09 % , error = 1.36452039915e+11
boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.18 % , error = 1.36326866928e+11
boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11
boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11


## Aumentamos los valores de max depth y n_estimators para ver si mejoramos la precisión

In [47]:
lgbm = lgb.LGBMRegressor(boosting_type = 'goss', n_estimators = 5000, learning_rate = 2.9, max_depth = 1000)
lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
print("Precision = {}%, error = {}".format(precision, error))

Precision = 38.9816250348%, error = 84181070076.1


In [48]:
n_estimators = [5000, 10000, 50000, 100000]
b = 'goss'
lr = 2.9
d = 1000
res = []

for n in n_estimators:
    lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = n, learning_rate = lr, max_depth = d)
    lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
    set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
    precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
    error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
    res.append((b, n, lr, precision, error))
    print(n)

5000
10000
50000
100000


In [52]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = goss, n_estimators = 5000, learnig_rate = 2.9, precision = 38.98 % , error = 84181070076.1
boosting_type = goss, n_estimators = 10000, learnig_rate = 2.9, precision = 47.01 % , error = 73107356570.7
boosting_type = goss, n_estimators = 50000, learnig_rate = 2.9, precision = 54.98 % , error = 62112893806.8
boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 58.98 % , error = 56587427148.9


#### Se puede ver que al aumentar la cantidad de estimadores mejora la precision por lo que tomamos un número más grande

# Calculamos los verdaderos datos

In [53]:
analizar = pd.read_csv("/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/tpDatos/properati_dataset_modificado.csv")

In [55]:
analizar.loc[:,'price_usd'] = analizar.loc[:,columnas].apply(lambda x: lgbm.predict([x])[0],axis = 1)

In [56]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.471407e+05
std      2.493879e+05
min     -5.653215e+05
25%      1.145887e+05
50%      1.621351e+05
75%      2.909809e+05
max      8.571289e+06
Name: price_usd, dtype: float64

In [57]:
analizar.loc[:, 'price_usd'] = analizar.loc[:, 'price_usd'].apply(lambda x: abs(x))

In [58]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.477323e+05
std      2.488002e+05
min      6.880918e+03
25%      1.147296e+05
50%      1.624316e+05
75%      2.914919e+05
max      8.571289e+06
Name: price_usd, dtype: float64

In [59]:
resultado = analizar.loc[:,['id','price_usd']]

In [60]:
resultado.to_csv('resultados/lightgbm_resultados.csv', index = False)

## Ahora uso lat y lon en lugar de place name

In [3]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull())\
                              & (propiedades.lat.notnull()) & (propiedades.lon.notnull()),\
                             ['lat', 'lon', 'property_type_encoded','price_aprox_usd','superficie',\
                             'Year','Month','seguridad','aire','gimnasio','cochera','pileta']]

In [4]:
columnas = ['superficie','lat', 'lon','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [5]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [66]:
lgbm = lgb.LGBMRegressor()
lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
print("Precision = {}%, error = {}".format(precision, error))

Precision = 0.225789592842%, error = 1.15522485637e+11


In [67]:
res = []

In [68]:
boosting = ['gbdt', 'dart', 'goss', 'rf']
estimators = [10, 20, 50]
learning_rate = [0.1, 0.3, 0.5]

for b in boosting:
    for e in estimators:
        for lr in learning_rate:
            try:
                lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = e, learning_rate = lr)
                lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
                set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
                precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
                error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
                res.append((b, e, lr, precision, error))
                print(b, '-', e, '-', lr)
            except:
                continue

('gbdt', '-', 10, '-', 0.1)
('gbdt', '-', 10, '-', 0.3)
('gbdt', '-', 10, '-', 0.5)
('gbdt', '-', 20, '-', 0.1)
('gbdt', '-', 20, '-', 0.3)
('gbdt', '-', 20, '-', 0.5)
('gbdt', '-', 50, '-', 0.1)
('gbdt', '-', 50, '-', 0.3)
('gbdt', '-', 50, '-', 0.5)
('dart', '-', 10, '-', 0.1)
('dart', '-', 10, '-', 0.3)
('dart', '-', 10, '-', 0.5)
('dart', '-', 20, '-', 0.1)
('dart', '-', 20, '-', 0.3)
('dart', '-', 20, '-', 0.5)
('dart', '-', 50, '-', 0.1)
('dart', '-', 50, '-', 0.3)
('dart', '-', 50, '-', 0.5)
('goss', '-', 10, '-', 0.1)
('goss', '-', 10, '-', 0.3)
('goss', '-', 10, '-', 0.5)
('goss', '-', 20, '-', 0.1)
('goss', '-', 20, '-', 0.3)
('goss', '-', 20, '-', 0.5)
('goss', '-', 50, '-', 0.1)
('goss', '-', 50, '-', 0.3)
('goss', '-', 50, '-', 0.5)


In [69]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.1, precision = 0.23 % , error = 1.15522485637e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.3, precision = 0.23 % , error = 1.15522485637e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.5, precision = 0.23 % , error = 1.15522485637e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.1, precision = 0.48 % , error = 1.15225738506e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.3, precision = 0.48 % , error = 1.15225738506e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.5, precision = 0.48 % , error = 1.15225738506e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.24 % , error = 1.14345223916e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.3, precision = 1.24 % , error = 1.14345223916e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.5, precision = 1.24 % , error = 1.14345223916e+11
boosting_type = dart, n_esti

In [70]:
min_error = float('inf')
max_precision = 0
tupla_min_error = ()
tupla_max_precision = ()
for r in res:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
    if r[3] > max_precision:
        max_precision = r[3]
        tupla_max_precision = r
        
print("Mayor precision = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_max_precision[0],tupla_max_precision[1],tupla_max_precision[2],tupla_max_precision[3], tupla_max_precision[4]))
print("Menor error = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

Mayor precision = boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.24 % , error = 1.14345223916e+11
Menor error = boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.24 % , error = 1.14345223916e+11


In [71]:
n_estimators = [5000, 10000, 50000, 100000]
b = 'goss'
lr = 2.9
res = []

for n in n_estimators:
    lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = n, learning_rate = lr)
    lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
    set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
    precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
    error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
    res.append((b, n, lr, precision, error))
    print(n)

5000
10000
50000
100000


In [72]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = goss, n_estimators = 5000, learnig_rate = 2.9, precision = 39.90 % , error = 69586432759.9
boosting_type = goss, n_estimators = 10000, learnig_rate = 2.9, precision = 49.19 % , error = 58834615136.1
boosting_type = goss, n_estimators = 50000, learnig_rate = 2.9, precision = 62.36 % , error = 43585312292.7
boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 68.84 % , error = 36082874388.6


In [73]:
min_error = float('inf')
max_precision = 0
tupla_min_error = ()
tupla_max_precision = ()
for r in res:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
    if r[3] > max_precision:
        max_precision = r[3]
        tupla_max_precision = r
        
print("Mayor precision = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_max_precision[0],tupla_max_precision[1],tupla_max_precision[2],tupla_max_precision[3], tupla_max_precision[4]))
print("Menor error = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

Mayor precision = boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 68.84 % , error = 36082874388.6
Menor error = boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 68.84 % , error = 36082874388.6


# Calculamos los verdaderos datos

In [74]:
analizar = pd.read_csv("/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/tpDatos/properati_dataset_modificado.csv")

In [75]:
analizar.loc[:,'price_usd'] = analizar.loc[:,columnas].apply(lambda x: lgbm.predict([x])[0],axis = 1)

In [76]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.389744e+05
std      2.538136e+05
min     -1.128486e+06
25%      1.092698e+05
50%      1.590292e+05
75%      2.817779e+05
max      5.876935e+06
Name: price_usd, dtype: float64

In [77]:
analizar.loc[:, 'price_usd'] = analizar.loc[:, 'price_usd'].apply(lambda x: abs(x))

In [78]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.396352e+05
std      2.531898e+05
min      3.814273e+03
25%      1.093145e+05
50%      1.591093e+05
75%      2.818319e+05
max      5.876935e+06
Name: price_usd, dtype: float64

In [79]:
resultado = analizar.loc[:,['id','price_usd']]

In [80]:
resultado.to_csv('resultados/lightgbm_latlon_resultados.csv', index = False)

# Usando Grid Search y Cross Validation

In [3]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull())\
                              & (propiedades.lat.notnull()) & (propiedades.lon.notnull()) & (propiedades.Year >=2016)]

columnas = ['superficie','lat', 'lon','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

set_entrenamiento_datos = propiedades.loc[:,columnas]
set_entrenamiento_resultado = propiedades.loc[:,'price_aprox_usd']

In [4]:
boosting = ['gbdt', 'dart', 'goss'] # rf no se tiene en cuenta pues levanta excepcion
estimators = [10000]
learning_rate = [0.3, 0.5, 0.9]

parametros = {"n_estimators" : estimators, "learning_rate" : learning_rate, "boosting_type" : boosting}

iteraciones_cross_validation = 15
lgbm = lgb.LGBMRegressor()

In [None]:
inicio = time.strftime("%X")

grid = GridSearchCV( estimator = lgbm, param_grid = parametros, n_jobs = -1, cv = iteraciones_cross_validation)

grid.fit(set_entrenamiento_datos, set_entrenamiento_resultado)

score = grid.best_score_ * 100
mejores_parametros = grid.best_params_
fin = time.strftime("%X")

print("Tiempo: {} --- {} \n Precision: {:.2f} \n Parametros = {}".format(inicio,fin,score,mejores_parametros))

In [24]:
# Vario unicamente la cantidad de estimadores
boosting = ['goss'] # rf no se tiene en cuenta pues levanta excepcion
estimators = [100, 1000, 5000]
learning_rate = [0.5, 0.9]

parametros = {"n_estimators" : estimators, "learning_rate" : learning_rate, "boosting_type" : boosting}

iteraciones_cross_validation = 25
lgbm = lgb.LGBMRegressor()

In [25]:
inicio = time.strftime("%X")

grid = GridSearchCV( estimator = lgbm, param_grid = parametros, n_jobs = -1, cv = iteraciones_cross_validation)

grid.fit(set_entrenamiento_datos, set_entrenamiento_resultado)

score = grid.best_score_ * 100
mejores_parametros = grid.best_params_
fin = time.strftime("%X")

print("Tiempo: {} --- {} \n Precision: {:.2f} \n Parametros = {}".format(inicio,fin,score,mejores_parametros))

Tiempo: 12:35:18 --- 19:20:59 
 Precision: 41.32 
 Parametros = {'boosting_type': 'goss', 'learning_rate': 0.5, 'n_estimators': 5000}


In [27]:
analizar = pd.read_csv("../properati_dataset_modificado.csv")

analizar.loc[:,'price_usd'] = analizar.loc[:,columnas].apply(lambda x: grid.predict([x])[0],axis = 1)

analizar.price_usd.describe()

count     14166.000000
mean     233809.103726
std      161628.517309
min       52303.525426
25%      116187.346107
50%      162430.635225
75%      314994.565201
max      753801.382929
Name: price_usd, dtype: float64

In [28]:
resultado = analizar.loc[:,['id','price_usd']]

resultado.to_csv('resultados/Lightgbm_GridSearch.csv', index = False)

joblib.dump(grid, 'algoritmos/Lightgbm.pkl')

['algoritmos/Lightgbm.pkl']

# Hago una especie de cross validation con el mejor algoritmo

In [None]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull())\
                              & (propiedades.lat.notnull()) & (propiedades.lon.notnull()) & (propiedades.Year >=2016)]

columnas = ['superficie','lat', 'lon','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

set_pruebas = analizar = pd.read_csv("../properati_dataset_modificado.csv")
set_pruebas.loc[:,'price_usd'] = 0.0

In [None]:
cant = 10

for i in range(cant):
    datos = propiedades.sample(frac = 0.4)
    set_entrenamiento_datos = datos.loc[:,columnas]
    set_entrenamiento_resultado = datos.loc[:,'price_aprox_usd']
    
    lgbm = lgb.LGBMRegressor() # Mejor resultado
    
    lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
    set_pruebas.loc[:,'res'] = set_pruebas.loc[:,columnas].apply(lambda x: lgbm.predict(x)[0],axis = 1)
    set_pruebas.loc[:,'price_usd'] = set_pruebas.loc[:,'price_usd'] + set_pruebas.loc[:,'res']

In [None]:
set_pruebas.loc[:,'price_usd'] = set_pruebas.loc[:,'price_usd'] / cant

resultado = set_pruebas.loc[:,['id','price_usd']]

resultado.to_csv('resultados/Lightgbm_Cross_Validation.csv', index = False)

In [7]:
inicio = time.strftime("%X")

grid = GridSearchCV( estimator = lgbm, param_grid = parametros, n_jobs = -1, cv = iteraciones_cross_validation)

grid.fit(set_entrenamiento_datos, set_entrenamiento_resultado)

score = grid.best_score_ * 100
mejores_parametros = grid.best_params_
fin = time.strftime("%X")

print("Tiempo: {} --- {} \n Precision: {:.2f} \n Parametros = {}".format(inicio,fin,score,mejores_parametros))

JoblibLightGBMError: JoblibLightGBMError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
C:\Users\Usuario\Anaconda3\lib\runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    179         sys.exit(msg)
    180     main_globals = sys.modules["__main__"].__dict__
    181     if alter_argv:
    182         sys.argv[0] = mod_spec.origin
    183     return _run_code(code, main_globals, None,
--> 184                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py')
    185 
    186 def run_module(mod_name, init_globals=None,
    187                run_name=None, alter_sys=False):
    188     """Execute a module's code without importing it

...........................................................................
C:\Users\Usuario\Anaconda3\lib\runpy.py in _run_code(code=<code object <module> at 0x000000000128B930, fil...lib\site-packages\ipykernel\__main__.py", line 1>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\__pycache__\__main__.cpython-35.pyc', '__doc__': None, '__file__': r'C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\U...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x000000000128B930, fil...lib\site-packages\ipykernel\__main__.py", line 1>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\__pycache__\__main__.cpython-35.pyc', '__doc__': None, '__file__': r'C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...da3\\lib\\site-packages\\ipykernel\\__main__.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\U...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\__main__.py in <module>()
      1 if __name__ == '__main__':
      2     from ipykernel import kernelapp as app
----> 3     app.launch_new_instance()

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    648 
    649         If a global instance already exists, this reinitializes and starts it
    650         """
    651         app = cls.instance(**kwargs)
    652         app.initialize(argv)
--> 653         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    654 
    655 #-----------------------------------------------------------------------------
    656 # utility functions, for convenience
    657 #-----------------------------------------------------------------------------

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    469             return self.subapp.start()
    470         if self.poller is not None:
    471             self.poller.start()
    472         self.kernel.start()
    473         try:
--> 474             ioloop.IOLoop.instance().start()
    475         except KeyboardInterrupt:
    476             pass
    477 
    478 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\tornado\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    882                 self._events.update(event_pairs)
    883                 while self._events:
    884                     fd, events = self._events.popitem()
    885                     try:
    886                         fd_obj, handler_func = self._handlers[fd]
--> 887                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    888                     except (OSError, IOError) as e:
    889                         if errno_from_exception(e) == errno.EPIPE:
    890                             # Happens when the client closes the connection
    891                             pass

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2017-10-29T12:52:10.563776', 'msg_id': '05455D3843714B4B90FB773940CD6149', 'msg_type': 'execute_request', 'session': '7D3FC2E10E5C48FDBF6C3316C2BBF837', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '05455D3843714B4B90FB773940CD6149', 'msg_type': 'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'7D3FC2E10E5C48FDBF6C3316C2BBF837']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2017-10-29T12:52:10.563776', 'msg_id': '05455D3843714B4B90FB773940CD6149', 'msg_type': 'execute_request', 'session': '7D3FC2E10E5C48FDBF6C3316C2BBF837', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '05455D3843714B4B90FB773940CD6149', 'msg_type': 'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'7D3FC2E10E5C48FDBF6C3316C2BBF837'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2017-10-29T12:52:10.563776', 'msg_id': '05455D3843714B4B90FB773940CD6149', 'msg_type': 'execute_request', 'session': '7D3FC2E10E5C48FDBF6C3316C2BBF837', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '05455D3843714B4B90FB773940CD6149', 'msg_type': 'execute_request', 'parent_header': {}})
    385         if not silent:
    386             self.execution_count += 1
    387             self._publish_execute_input(code, parent, self.execution_count)
    388 
    389         reply_content = self.do_execute(code, silent, store_history,
--> 390                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    391 
    392         # Flush output before sending the reply.
    393         sys.stdout.flush()
    394         sys.stderr.flush()

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))'
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\ipykernel\zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))',), **kwargs={'silent': False, 'store_history': True})
    496             )
    497         self.payload_manager.write_payload(payload)
    498 
    499     def run_cell(self, *args, **kwargs):
    500         self._last_traceback = None
--> 501         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))',)
        kwargs = {'silent': False, 'store_history': True}
    502 
    503     def _showtraceback(self, etype, evalue, stb):
    504         # try to preserve ordering of tracebacks and print statements
    505         sys.stdout.flush()

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))', store_history=True, silent=False, shell_futures=True)
   2712                 self.displayhook.exec_result = result
   2713 
   2714                 # Execute the user code
   2715                 interactivity = "none" if silent else self.ast_node_interactivity
   2716                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2717                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2718                 
   2719                 self.last_execution_succeeded = not has_raised
   2720 
   2721                 # Reset this so later displayed values do not modify the

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-7-febf5e4d777f>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7aed358, execution_co..._before_exec=None error_in_exec=None result=None>)
   2816 
   2817         try:
   2818             for i, node in enumerate(to_run_exec):
   2819                 mod = ast.Module([node])
   2820                 code = compiler(mod, cell_name, "exec")
-> 2821                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x0000000004BE7780, file "<ipython-input-7-febf5e4d777f>", line 5>
        result = <ExecutionResult object at 7aed358, execution_co..._before_exec=None error_in_exec=None result=None>
   2822                     return True
   2823 
   2824             for i, node in enumerate(to_run_interactive):
   2825                 mod = ast.Interactive([node])

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x0000000004BE7780, file "<ipython-input-7-febf5e4d777f>", line 5>, result=<ExecutionResult object at 7aed358, execution_co..._before_exec=None error_in_exec=None result=None>)
   2876         outflag = 1  # happens in more places, so it's easier as default
   2877         try:
   2878             try:
   2879                 self.hooks.pre_run_code_hook()
   2880                 #rprint('Running code', repr(code_obj)) # dbg
-> 2881                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x0000000004BE7780, file "<ipython-input-7-febf5e4d777f>", line 5>
        self.user_global_ns = {'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nfrom skle....readthedocs.io/en/latest/Installation-Guide.html', 'import pandas as pd\nimport numpy as np\nfrom skle....readthedocs.io/en/latest/Installation-Guide.html', "#propiedades = pd.read_csv('/home/agustin/Escrit... = pd.read_csv('../../set_datos_propiedades.csv')", 'propiedades.head()', "propiedades = propiedades.loc[(propiedades.price..._resultado = propiedades.loc[:,'price_aprox_usd']", "boosting = ['gbdt', 'dart', 'goss', 'rf']\nestima..._cross_validation = 25\nlgbm = lgb.LGBMRegressor()", 'inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))'], 'Out': {4:   description  expenses fecha_de_publicacion    ... 
3                 465  
4                 629  }, '_':   description  expenses fecha_de_publicacion    ... 
3                 465  
4                 629  , '_4':   description  expenses fecha_de_publicacion    ... 
3                 465  
4                 629  , '__': '', '___': '', '__builtin__': <module 'builtins' (built-in)>, '__builtins__': <module 'builtins' (built-in)>, '__doc__': 'Automatically created module for IPython interactive environment', ...}
        self.user_ns = {'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nfrom skle....readthedocs.io/en/latest/Installation-Guide.html', 'import pandas as pd\nimport numpy as np\nfrom skle....readthedocs.io/en/latest/Installation-Guide.html', "#propiedades = pd.read_csv('/home/agustin/Escrit... = pd.read_csv('../../set_datos_propiedades.csv')", 'propiedades.head()', "propiedades = propiedades.loc[(propiedades.price..._resultado = propiedades.loc[:,'price_aprox_usd']", "boosting = ['gbdt', 'dart', 'goss', 'rf']\nestima..._cross_validation = 25\nlgbm = lgb.LGBMRegressor()", 'inicio = time.strftime("%X")\n\ngrid = GridSearchC... {}".format(inicio,fin,score,mejores_parametros))'], 'Out': {4:   description  expenses fecha_de_publicacion    ... 
3                 465  
4                 629  }, '_':   description  expenses fecha_de_publicacion    ... 
3                 465  
4                 629  , '_4':   description  expenses fecha_de_publicacion    ... 
3                 465  
4                 629  , '__': '', '___': '', '__builtin__': <module 'builtins' (built-in)>, '__builtins__': <module 'builtins' (built-in)>, '__doc__': 'Automatically created module for IPython interactive environment', ...}
   2882             finally:
   2883                 # Reset our crash handler in place
   2884                 sys.excepthook = old_excepthook
   2885         except SystemExit as e:

...........................................................................
C:\Users\Usuario\Desktop\Organizacion de datos\tpDatos\tp2\<ipython-input-7-febf5e4d777f> in <module>()
      1 inicio = time.strftime("%X")
      2 
      3 grid = GridSearchCV( estimator = lgbm, param_grid = parametros, n_jobs = -1, cv = iteraciones_cross_validation)
      4 
----> 5 grid.fit(set_entrenamiento_datos, set_entrenamiento_resultado)
      6 
      7 score = grid.best_score_ * 100
      8 mejores_parametros = grid.best_params_
      9 fin = time.strftime("%X")
     10 

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self=GridSearchCV(cv=25, error_score='raise',
       ...train_score=True,
       scoring=None, verbose=0), X=         superficie        lat        lon  prope... True   True   False  

[519031 rows x 9 columns], y=677740     490000.00
677741     810000.00
677742...   87000.00
Name: price_aprox_usd, dtype: float64, groups=None, **fit_params={})
    633                                   return_train_score=self.return_train_score,
    634                                   return_n_test_samples=True,
    635                                   return_times=True, return_parameters=False,
    636                                   error_score=self.error_score)
    637           for parameters, (train, test) in product(candidate_params,
--> 638                                                    cv.split(X, y, groups)))
        cv.split = <bound method _BaseKFold.split of KFold(n_splits=25, random_state=None, shuffle=False)>
        X =          superficie        lat        lon  prope... True   True   False  

[519031 rows x 9 columns]
        y = 677740     490000.00
677741     810000.00
677742...   87000.00
Name: price_aprox_usd, dtype: float64
        groups = None
    639 
    640         # if one choose to see train score, "out" will contain train score info
    641         if self.return_train_score:
    642             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
LightGBMError                                      Sun Oct 29 14:12:09 2017
PID: 7424               Python 3.5.2: C:\Users\Usuario\Anaconda3\python.exe
...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (LGBMRegressor(boosting_type='rf', colsample_bytr...subsample_for_bin=50000,
       subsample_freq=1),          superficie        lat        lon  prope... True   True   False  

[519031 rows x 9 columns], 677740     490000.00
677741     810000.00
677742...   87000.00
Name: price_aprox_usd, dtype: float64, {'score': <function _passthrough_scorer>}, memmap([ 20762,  20763,  20764, ..., 519028, 519029, 519030]), array([    0,     1,     2, ..., 20759, 20760, 20761]), 0, {'boosting_type': 'rf', 'learning_rate': 0.1, 'n_estimators': 10}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (LGBMRegressor(boosting_type='rf', colsample_bytr...subsample_for_bin=50000,
       subsample_freq=1),          superficie        lat        lon  prope... True   True   False  

[519031 rows x 9 columns], 677740     490000.00
677741     810000.00
677742...   87000.00
Name: price_aprox_usd, dtype: float64, {'score': <function _passthrough_scorer>}, memmap([ 20762,  20763,  20764, ..., 519028, 519029, 519030]), array([    0,     1,     2, ..., 20759, 20760, 20761]), 0, {'boosting_type': 'rf', 'learning_rate': 0.1, 'n_estimators': 10})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator=LGBMRegressor(boosting_type='rf', colsample_bytr...subsample_for_bin=50000,
       subsample_freq=1), X=         superficie        lat        lon  prope... True   True   False  

[519031 rows x 9 columns], y=677740     490000.00
677741     810000.00
677742...   87000.00
Name: price_aprox_usd, dtype: float64, scorer={'score': <function _passthrough_scorer>}, train=memmap([ 20762,  20763,  20764, ..., 519028, 519029, 519030]), test=array([    0,     1,     2, ..., 20759, 20760, 20761]), verbose=0, parameters={'boosting_type': 'rf', 'learning_rate': 0.1, 'n_estimators': 10}, fit_params={}, return_train_score=True, return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    432 
    433     try:
    434         if y_train is None:
    435             estimator.fit(X_train, **fit_params)
    436         else:
--> 437             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method LGBMRegressor.fit of LGBMRegressor...ubsample_for_bin=50000,
       subsample_freq=1)>
        X_train =          superficie        lat        lon  prope... True   True   False  

[498269 rows x 9 columns]
        y_train = 708612     415000.00
708613     155000.00
708614...   87000.00
Name: price_aprox_usd, dtype: float64
        fit_params = {}
    438 
    439     except Exception as e:
    440         # Note fit time as time until error
    441         fit_time = time.time() - start_time

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\lightgbm\sklearn.py in fit(self=LGBMRegressor(boosting_type='rf', colsample_bytr...subsample_for_bin=50000,
       subsample_freq=1), X=         superficie        lat        lon  prope... True   True   False  

[498269 rows x 9 columns], y=708612     415000.00
708613     155000.00
708614...   87000.00
Name: price_aprox_usd, dtype: float64, sample_weight=None, init_score=None, eval_set=None, eval_names=None, eval_sample_weight=None, eval_init_score=None, eval_metric='l2', early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
    607                                        eval_init_score=eval_init_score,
    608                                        eval_metric=eval_metric,
    609                                        early_stopping_rounds=early_stopping_rounds,
    610                                        verbose=verbose, feature_name=feature_name,
    611                                        categorical_feature=categorical_feature,
--> 612                                        callbacks=callbacks)
        callbacks = None
    613         return self
    614 
    615     base_doc = LGBMModel.fit.__doc__
    616     fit.__doc__ = (base_doc[:base_doc.find('eval_metric :')] +

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\lightgbm\sklearn.py in fit(self=LGBMRegressor(boosting_type='rf', colsample_bytr...subsample_for_bin=50000,
       subsample_freq=1), X=         superficie        lat        lon  prope... True   True   False  

[498269 rows x 9 columns], y=708612     415000.00
708613     155000.00
708614...   87000.00
Name: price_aprox_usd, dtype: float64, sample_weight=None, init_score=None, group=None, eval_set=None, eval_names=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='l2', early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
    454                               self.n_estimators, valid_sets=valid_sets, valid_names=eval_names,
    455                               early_stopping_rounds=early_stopping_rounds,
    456                               evals_result=evals_result, fobj=self._fobj, feval=feval,
    457                               verbose_eval=verbose, feature_name=feature_name,
    458                               categorical_feature=categorical_feature,
--> 459                               callbacks=callbacks)
        callbacks = None
    460 
    461         if evals_result:
    462             self._evals_result = evals_result
    463 

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\lightgbm\engine.py in train(params={'boosting_type': 'rf', 'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': -1, 'metric': 'l2', 'min_child_samples': 10, 'min_child_weight': 5, 'min_split_gain': 0.0, 'nthread': -1, ...}, train_set=<lightgbm.basic.Dataset object>, num_boost_round=10, valid_sets=[], valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, evals_result={}, verbose_eval=True, learning_rates=None, keep_training_booster=False, callbacks={<function print_evaluation.<locals>.callback>, <function record_evaluation.<locals>.callback>})
    173     callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
    174     callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
    175 
    176     """construct booster"""
    177     try:
--> 178         booster = Booster(params=params, train_set=train_set)
        booster = undefined
        params = {'boosting_type': 'rf', 'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': -1, 'metric': 'l2', 'min_child_samples': 10, 'min_child_weight': 5, 'min_split_gain': 0.0, 'nthread': -1, ...}
        train_set = <lightgbm.basic.Dataset object>
    179         if is_valid_contain_train:
    180             booster.set_train_data_name(train_data_name)
    181         for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
    182             booster.add_valid(valid_set, name_valid_set)

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\lightgbm\basic.py in __init__(self=<lightgbm.basic.Booster object>, params={'boosting_type': 'rf', 'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': -1, 'metric': 'l2', 'min_child_samples': 10, 'min_child_weight': 5, 'min_split_gain': 0.0, 'nthread': -1, ...}, train_set=<lightgbm.basic.Dataset object>, model_file=None, silent=False)
   1265             """construct booster object"""
   1266             self.handle = ctypes.c_void_p()
   1267             _safe_call(_LIB.LGBM_BoosterCreate(
   1268                 train_set.construct().handle,
   1269                 c_str(params_str),
-> 1270                 ctypes.byref(self.handle)))
        self.handle = c_void_p(None)
   1271             """save reference to data"""
   1272             self.train_set = train_set
   1273             self.valid_sets = []
   1274             self.name_valid_sets = []

...........................................................................
C:\Users\Usuario\Anaconda3\lib\site-packages\lightgbm\basic.py in _safe_call(ret=-1)
     43     ----------
     44     ret : int
     45         return value from API calls
     46     """
     47     if ret != 0:
---> 48         raise LightGBMError(_LIB.LGBM_GetLastError())
     49 
     50 
     51 def is_numeric(obj):
     52     """Check is a number or not, include numpy number etc."""

LightGBMError: b'Check failed: config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f at c:\\projects\\lightgbm\\python-package\\compile\\src\\boosting\\rf.hpp, line 29 .\n'
___________________________________________________________________________