In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)
#https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html

In [2]:
propiedades = pd.read_csv('/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/datos para el tp2/set_datos_propiedades.csv')

In [3]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull()),\
                             ['place_name_encoded', 'property_type_encoded','price_aprox_usd','superficie',\
                             'Year','Month','seguridad','aire','gimnasio','cochera','pileta']]

In [4]:
propiedades.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1132495 entries, 0 to 1413024
Data columns (total 11 columns):
place_name_encoded       1132495 non-null int64
property_type_encoded    1132495 non-null int64
price_aprox_usd          1132495 non-null float64
superficie               1132495 non-null float64
Year                     1132495 non-null int64
Month                    1132495 non-null int64
seguridad                1132495 non-null bool
aire                     1132495 non-null bool
gimnasio                 1132495 non-null bool
cochera                  1132495 non-null bool
pileta                   1132495 non-null bool
dtypes: bool(5), float64(2), int64(4)
memory usage: 65.9 MB


# LightGBM

In [26]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [27]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [29]:
lgbm = lgb.LGBMRegressor()
lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
print("Precision = {}%, error = {}".format(precision, error))

Precision = 0.230676213694%, error = 1.37641955262e+11


### Ahora que tenemos una intuicion, probamos variando los parametros

In [30]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [33]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

res = []

In [34]:
boosting = ['gbdt', 'dart', 'goss', 'rf']
estimators = [10, 20, 50]
learning_rate = [0.1, 0.3, 0.5]

for b in boosting:
    for e in estimators:
        for lr in learning_rate:
            try:
                lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = e, learning_rate = lr)
                lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
                set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
                precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
                error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
                res.append((b, e, lr, precision, error))
                print(b, '-', e, '-', lr)
            except:
                continue

('gbdt', '-', 10, '-', 0.1)
('gbdt', '-', 10, '-', 0.3)
('gbdt', '-', 10, '-', 0.5)
('gbdt', '-', 20, '-', 0.1)
('gbdt', '-', 20, '-', 0.3)
('gbdt', '-', 20, '-', 0.5)
('gbdt', '-', 50, '-', 0.1)
('gbdt', '-', 50, '-', 0.3)
('gbdt', '-', 50, '-', 0.5)
('dart', '-', 10, '-', 0.1)
('dart', '-', 10, '-', 0.3)
('dart', '-', 10, '-', 0.5)
('dart', '-', 20, '-', 0.1)
('dart', '-', 20, '-', 0.3)
('dart', '-', 20, '-', 0.5)
('dart', '-', 50, '-', 0.1)
('dart', '-', 50, '-', 0.3)
('dart', '-', 50, '-', 0.5)
('goss', '-', 10, '-', 0.1)
('goss', '-', 10, '-', 0.3)
('goss', '-', 10, '-', 0.5)
('goss', '-', 20, '-', 0.1)
('goss', '-', 20, '-', 0.3)
('goss', '-', 20, '-', 0.5)
('goss', '-', 50, '-', 0.1)
('goss', '-', 50, '-', 0.3)
('goss', '-', 50, '-', 0.5)


In [36]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.1, precision = 0.23 % , error = 1.37641955262e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.3, precision = 0.23 % , error = 1.37641955262e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.5, precision = 0.23 % , error = 1.37641955262e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.1, precision = 0.46 % , error = 1.37318841383e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.3, precision = 0.46 % , error = 1.37318841383e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.5, precision = 0.46 % , error = 1.37318841383e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.17 % , error = 1.36348628327e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.3, precision = 1.17 % , error = 1.3634861732e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.5, precision = 1.17 % , error = 1.36348603281e+11
boosting_type = dart, n_estim

In [37]:
min_error = float('inf')
max_precision = 0
tupla_min_error = ()
tupla_max_precision = ()
for r in res:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
    if r[3] > max_precision:
        max_precision = r[3]
        tupla_max_precision = r
        
print("Mayor precision = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_max_precision[0],tupla_max_precision[1],tupla_max_precision[2],tupla_max_precision[3], tupla_max_precision[4]))
print("Menor error = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

Mayor precision = boosting_type = goss, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11
Menor error = boosting_type = goss, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11


# Modifico max_depth

In [38]:
columnas = ['superficie','place_name_encoded','property_type_encoded','seguridad','gimnasio','aire','pileta','cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [39]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [40]:
max_depth = [1, 5, 10, 15]

In [41]:
for d in max_depth:
    lgbm = lgb.LGBMRegressor(boosting_type = 'goss', n_estimators = 50, learning_rate = 0.5, max_depth = d)
    lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
    set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
    precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
    error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
    res.append((b, e, lr, precision, error))
    print(b, '-', e, '-', lr)

('rf', '-', 50, '-', 0.5)
('rf', '-', 50, '-', 0.5)
('rf', '-', 50, '-', 0.5)
('rf', '-', 50, '-', 0.5)


In [42]:
for r in res[27:]:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.09 % , error = 1.36452039915e+11
boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.18 % , error = 1.36326866928e+11
boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11
boosting_type = rf, n_estimators = 50, learnig_rate = 0.5, precision = 1.19 % , error = 1.36317175083e+11


## Aumentamos los valores de max depth y n_estimators para ver si mejoramos la precisión

In [47]:
lgbm = lgb.LGBMRegressor(boosting_type = 'goss', n_estimators = 5000, learning_rate = 2.9, max_depth = 1000)
lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
print("Precision = {}%, error = {}".format(precision, error))

Precision = 38.9816250348%, error = 84181070076.1


In [48]:
n_estimators = [5000, 10000, 50000, 100000]
b = 'goss'
lr = 2.9
d = 1000
res = []

for n in n_estimators:
    lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = n, learning_rate = lr, max_depth = d)
    lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
    set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
    precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
    error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
    res.append((b, n, lr, precision, error))
    print(n)

5000
10000
50000
100000


In [52]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = goss, n_estimators = 5000, learnig_rate = 2.9, precision = 38.98 % , error = 84181070076.1
boosting_type = goss, n_estimators = 10000, learnig_rate = 2.9, precision = 47.01 % , error = 73107356570.7
boosting_type = goss, n_estimators = 50000, learnig_rate = 2.9, precision = 54.98 % , error = 62112893806.8
boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 58.98 % , error = 56587427148.9


#### Se puede ver que al aumentar la cantidad de estimadores mejora la precision por lo que tomamos un número más grande

# Calculamos los verdaderos datos

In [53]:
analizar = pd.read_csv("/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/tpDatos/properati_dataset_modificado.csv")

In [55]:
analizar.loc[:,'price_usd'] = analizar.loc[:,columnas].apply(lambda x: lgbm.predict([x])[0],axis = 1)

In [56]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.471407e+05
std      2.493879e+05
min     -5.653215e+05
25%      1.145887e+05
50%      1.621351e+05
75%      2.909809e+05
max      8.571289e+06
Name: price_usd, dtype: float64

In [57]:
analizar.loc[:, 'price_usd'] = analizar.loc[:, 'price_usd'].apply(lambda x: abs(x))

In [58]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.477323e+05
std      2.488002e+05
min      6.880918e+03
25%      1.147296e+05
50%      1.624316e+05
75%      2.914919e+05
max      8.571289e+06
Name: price_usd, dtype: float64

In [59]:
resultado = analizar.loc[:,['id','price_usd']]

In [60]:
resultado.to_csv('resultados/lightgbm_resultados.csv', index = False)

## Ahora uso lat y lon en lugar de place name

In [3]:
propiedades = propiedades.loc[(propiedades.price_aprox_usd.notnull()) & (propiedades.superficie.notnull())\
                              & (propiedades.lat.notnull()) & (propiedades.lon.notnull()),\
                             ['lat', 'lon', 'property_type_encoded','price_aprox_usd','superficie',\
                             'Year','Month','seguridad','aire','gimnasio','cochera','pileta']]

In [4]:
columnas = ['superficie','lat', 'lon','property_type_encoded','seguridad','gimnasio', 'aire', 'pileta', 'cochera']
columnas_precio = columnas + ['price_aprox_usd']

In [5]:
set_entrenamiento = propiedades.loc[(propiedades.Year >= 2016) &((propiedades.Year < 2017) | (propiedades.Month < 6))\
                                    ,columnas_precio]
set_pruebas = propiedades.loc[(propiedades.Year == 2017) & (propiedades.Month == 6),columnas_precio].head(20000)

set_entrenamiento_datos = set_entrenamiento.loc[:,columnas]
set_entrenamiento_resultado = set_entrenamiento.loc[:,'price_aprox_usd']

In [66]:
lgbm = lgb.LGBMRegressor()
lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
print("Precision = {}%, error = {}".format(precision, error))

Precision = 0.225789592842%, error = 1.15522485637e+11


In [67]:
res = []

In [68]:
boosting = ['gbdt', 'dart', 'goss', 'rf']
estimators = [10, 20, 50]
learning_rate = [0.1, 0.3, 0.5]

for b in boosting:
    for e in estimators:
        for lr in learning_rate:
            try:
                lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = e, learning_rate = lr)
                lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
                set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
                precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
                error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
                res.append((b, e, lr, precision, error))
                print(b, '-', e, '-', lr)
            except:
                continue

('gbdt', '-', 10, '-', 0.1)
('gbdt', '-', 10, '-', 0.3)
('gbdt', '-', 10, '-', 0.5)
('gbdt', '-', 20, '-', 0.1)
('gbdt', '-', 20, '-', 0.3)
('gbdt', '-', 20, '-', 0.5)
('gbdt', '-', 50, '-', 0.1)
('gbdt', '-', 50, '-', 0.3)
('gbdt', '-', 50, '-', 0.5)
('dart', '-', 10, '-', 0.1)
('dart', '-', 10, '-', 0.3)
('dart', '-', 10, '-', 0.5)
('dart', '-', 20, '-', 0.1)
('dart', '-', 20, '-', 0.3)
('dart', '-', 20, '-', 0.5)
('dart', '-', 50, '-', 0.1)
('dart', '-', 50, '-', 0.3)
('dart', '-', 50, '-', 0.5)
('goss', '-', 10, '-', 0.1)
('goss', '-', 10, '-', 0.3)
('goss', '-', 10, '-', 0.5)
('goss', '-', 20, '-', 0.1)
('goss', '-', 20, '-', 0.3)
('goss', '-', 20, '-', 0.5)
('goss', '-', 50, '-', 0.1)
('goss', '-', 50, '-', 0.3)
('goss', '-', 50, '-', 0.5)


In [69]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.1, precision = 0.23 % , error = 1.15522485637e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.3, precision = 0.23 % , error = 1.15522485637e+11
boosting_type = gbdt, n_estimators = 10, learnig_rate = 0.5, precision = 0.23 % , error = 1.15522485637e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.1, precision = 0.48 % , error = 1.15225738506e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.3, precision = 0.48 % , error = 1.15225738506e+11
boosting_type = gbdt, n_estimators = 20, learnig_rate = 0.5, precision = 0.48 % , error = 1.15225738506e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.24 % , error = 1.14345223916e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.3, precision = 1.24 % , error = 1.14345223916e+11
boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.5, precision = 1.24 % , error = 1.14345223916e+11
boosting_type = dart, n_esti

In [70]:
min_error = float('inf')
max_precision = 0
tupla_min_error = ()
tupla_max_precision = ()
for r in res:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
    if r[3] > max_precision:
        max_precision = r[3]
        tupla_max_precision = r
        
print("Mayor precision = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_max_precision[0],tupla_max_precision[1],tupla_max_precision[2],tupla_max_precision[3], tupla_max_precision[4]))
print("Menor error = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

Mayor precision = boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.24 % , error = 1.14345223916e+11
Menor error = boosting_type = gbdt, n_estimators = 50, learnig_rate = 0.1, precision = 1.24 % , error = 1.14345223916e+11


In [71]:
n_estimators = [5000, 10000, 50000, 100000]
b = 'goss'
lr = 2.9
res = []

for n in n_estimators:
    lgbm = lgb.LGBMRegressor(boosting_type = b, n_estimators = n, learning_rate = lr)
    lgbm.fit(set_entrenamiento_datos,set_entrenamiento_resultado)
    set_pruebas.loc[:,'resultado'] = lgbm.predict(set_pruebas.loc[:,columnas])
    precision = lgbm.score(set_pruebas.loc[:,columnas],set_pruebas.price_aprox_usd) * 100
    error = mean_squared_error(set_pruebas.price_aprox_usd,set_pruebas.resultado)
    res.append((b, n, lr, precision, error))
    print(n)

5000
10000
50000
100000


In [72]:
for r in res:
    print ("boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".format(r[0],r[1],r[2],r[3], r[4]))

boosting_type = goss, n_estimators = 5000, learnig_rate = 2.9, precision = 39.90 % , error = 69586432759.9
boosting_type = goss, n_estimators = 10000, learnig_rate = 2.9, precision = 49.19 % , error = 58834615136.1
boosting_type = goss, n_estimators = 50000, learnig_rate = 2.9, precision = 62.36 % , error = 43585312292.7
boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 68.84 % , error = 36082874388.6


In [73]:
min_error = float('inf')
max_precision = 0
tupla_min_error = ()
tupla_max_precision = ()
for r in res:
    if r[4] < min_error:
        min_error = r[4]
        tupla_min_error = r
    if r[3] > max_precision:
        max_precision = r[3]
        tupla_max_precision = r
        
print("Mayor precision = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_max_precision[0],tupla_max_precision[1],tupla_max_precision[2],tupla_max_precision[3], tupla_max_precision[4]))
print("Menor error = boosting_type = {}, n_estimators = {}, learnig_rate = {}, precision = {:.2f} % , error = {}".\
              format(tupla_min_error[0],tupla_min_error[1],tupla_min_error[2],tupla_min_error[3], tupla_min_error[4]))

Mayor precision = boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 68.84 % , error = 36082874388.6
Menor error = boosting_type = goss, n_estimators = 100000, learnig_rate = 2.9, precision = 68.84 % , error = 36082874388.6


# Calculamos los verdaderos datos

In [74]:
analizar = pd.read_csv("/home/agustin/Escritorio/escritorio/fiuba/Organizacion de datos/tpDatos/properati_dataset_modificado.csv")

In [75]:
analizar.loc[:,'price_usd'] = analizar.loc[:,columnas].apply(lambda x: lgbm.predict([x])[0],axis = 1)

In [76]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.389744e+05
std      2.538136e+05
min     -1.128486e+06
25%      1.092698e+05
50%      1.590292e+05
75%      2.817779e+05
max      5.876935e+06
Name: price_usd, dtype: float64

In [77]:
analizar.loc[:, 'price_usd'] = analizar.loc[:, 'price_usd'].apply(lambda x: abs(x))

In [78]:
analizar.price_usd.describe()

count    1.416600e+04
mean     2.396352e+05
std      2.531898e+05
min      3.814273e+03
25%      1.093145e+05
50%      1.591093e+05
75%      2.818319e+05
max      5.876935e+06
Name: price_usd, dtype: float64

In [79]:
resultado = analizar.loc[:,['id','price_usd']]

In [80]:
resultado.to_csv('resultados/lightgbm_latlon_resultados.csv', index = False)