In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

In [29]:
train_df = pd.read_csv("train.csv", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)
data = pd.read_csv("data_set.csv", low_memory = False)

In [40]:
data.columns

Index(['id', 'days_to_today', 'ano', 'antiguedad',
       'centroscomercialescercanos', 'escuelascercanas', 'habitaciones',
       'garages', 'banos', 'anio', 'Apartamento', 'Bodega comercial', 'Casa',
       'Casa en condominio', 'Casa uso de suelo', 'Departamento Compartido',
       'Duplex', 'Edificio', 'Garage', 'Hospedaje', 'Huerta',
       'Inmuebles productivos urbanos', 'Local Comercial',
       'Local en centro comercial', 'Lote', 'Nave industrial',
       'Oficina comercial', 'Otros', 'Quinta Vacacional', 'Rancho', 'Terreno',
       'Terreno comercial', 'Terreno industrial', 'Villa', 'gimnasio',
       'usosmultiples', 'piscina', 'qty_of_extras', 'metroscubiertos',
       'metrostotales', 'popular_desc_1', 'popular_desc_2', 'popular_desc_3',
       'popular_desc_4', 'idzona', 'lat', 'lng', 'avenue', 'Aguascalientes',
       'Baja California Norte', 'Baja California Sur', 'Campeche', 'Chiapas',
       'Chihuahua', 'Coahuila', 'Colima', 'Distrito Federal', 'Durango',
       'Ed

In [41]:
data.rename(columns={'año':'ano', 'Yucatán':'Yucatan', 'Querétaro':'Queretaro','Michoacán':'Michoacan',
                     'Nuevo León':'Nuevo Leon',
                     'San luis Potosí':'San luis Potosi', 'Edo. de México':'Edo. de Mexico'}, inplace=True)

In [42]:
train_df = train_df[['id','precio']]
train_df['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [43]:
features = pd.merge(train_df, data, on='id', how='inner')
features = features.fillna(0)
features['precio'].value_counts()

1500000.0    3041
2500000.0    3036
1200000.0    2909
3500000.0    2726
1300000.0    2480
             ... 
2704600.0       1
1352250.0       1
938260.0        1
2704460.0       1
2327000.0       1
Name: precio, Length: 15520, dtype: int64

In [44]:
labels = features['precio']
features = features.drop(['id','precio'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)
print("Train: ",len(x_train),"Test: ",len(x_test))

Train:  180000 Test:  60000


In [45]:
train_data=lgb.Dataset(x_train, label=y_train)

In [69]:
#Select Hyper-Parameters
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'nthread': 5,
          'num_leaves': 64,
          'learning_rate': 0.07,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 1.2,
          'reg_lambda': 1.2,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1
          }


In [70]:
# Create parameters to search
gridParams = {
    'learning_rate': [0.07],
    'n_estimators': [8,16],
    'num_leaves': [20, 24, 27],
    'boosting_type' : ['gbdt'],
    'random_state' : [501], 
    'colsample_bytree' : [0.64, 0.65],
    'subsample' : [0.7,0.75],
    #'reg_alpha' : [1, 1.2],
    #'reg_lambda' : [ 1.2, 1.4],
    }

In [71]:
# Create classifier to use
mdl = lgb.LGBMRegressor(boosting_type= 'gbdt',
          objective = 'regression',
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

In [72]:
grid = GridSearchCV(mdl, gridParams, verbose=2, cv=4, n_jobs=-1)
grid.fit(x_train, y_train)

Fitting 4 folds for each of 24 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:   26.2s finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_bin=512, max_depth=-1,
                                     min_child_samples=5, min_child_weight=1,
                                     min_split_gain=0.5, n_estimators=100,
                                     n_jobs=5, num_leaves=31,
                                     objective='regression', random_state=None,
                                     reg_alpha=...
                                     scale_pos_weight=1, silent=True,
                                     subsample=1, subsample_for_bin=200,
                                     subsample_freq=1),
             iid='warn', n_jobs=-1,
             param_grid={'boosting_type': ['gbdt'],
                         'colsamp

In [73]:
# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

{'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.07, 'n_estimators': 16, 'num_leaves': 27, 'random_state': 501, 'subsample': 0.7}
0.5337104851169531


In [74]:
lgbm = lgb.train(params,
                 train_data,
                 280,
                 #early_stopping_rounds= 40,
                 verbose_eval= 4
                 )


In [75]:
pred = lgbm.predict(x_test)


In [76]:
sqrt(mean_absolute_error(y_test,pred))

786.8285405754838

In [77]:
set_test = pd.merge(data, test_df[['id']],on = "id", how = "inner")
ids = set_test["id"]
set_test = set_test.drop(columns=['id'])

In [78]:
set_test = set_test.fillna(0)

In [79]:
set_test.head()

Unnamed: 0,days_to_today,ano,antiguedad,centroscomercialescercanos,escuelascercanas,habitaciones,garages,banos,anio,Apartamento,...,Quintana Roo,San luis Potosi,Sinaloa,Sonora,Tabasco,Tamaulipas,Tlaxcala,Veracruz,Yucatan,Zacatecas
0,2320,2013,29.0,0.0,0.0,3.0,0.0,4.0,2013,0,...,0,0,0,0,0,0,0,0,0,0
1,1494,2015,0.0,0.0,0.0,1.0,1.0,1.0,2015,1,...,0,0,0,0,0,0,0,0,1,0
2,1641,2015,0.0,1.0,0.0,2.0,1.0,2.0,2015,1,...,0,0,0,0,0,0,0,0,0,0
3,1699,2015,2.0,0.0,0.0,2.0,2.0,2.0,2015,1,...,0,0,0,0,0,0,0,0,0,0
4,2294,2013,10.0,1.0,1.0,2.0,1.0,1.0,2013,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
preds_kaggle = lgbm.predict(set_test)

In [81]:
df_kaggel = pd.DataFrame(preds_kaggle)
df_kaggel.head()

Unnamed: 0,0
0,6793130.0
1,876615.7
2,2062642.0
3,1281588.0
4,521954.0


In [82]:
df_kaggel = pd.DataFrame(df_kaggel)                               
df_kaggel["id"] = ids
df_kaggel.rename(columns = {0: 'target'},inplace = True) 
df_kaggel = df_kaggel[['id','target']]

In [83]:
df_kaggel.head()

Unnamed: 0,id,target
0,4941,6793130.0
1,51775,876615.7
2,115253,2062642.0
3,299321,1281588.0
4,173570,521954.0


In [84]:
df_kaggel.to_csv('ResultadosLightGBM.csv',index=False)

In [85]:
df_kaggel.shape

(60000, 2)

In [86]:
df_kaggel.head()

Unnamed: 0,id,target
0,4941,6793130.0
1,51775,876615.7
2,115253,2062642.0
3,299321,1281588.0
4,173570,521954.0
