In [1]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv( "../data/apts.csv")

In [3]:
df = df.fillna(0.0)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ages,Area,Baths,Condition,Floor,Garages,Location3,Stratum,Rooms,FormatedPrice
0,0,1 a 8 años,68,2,Excelente,4º,2,Norte,5.0,1,430000.0
1,1,16 a 30 años,60,1,Bueno,5º,1,Occidente,3.0,3,950000.0
2,2,16 a 30 años,259,5,Bueno,0,3,Norte,6.0,4,1190000.0
3,3,1 a 8 años,175,5,Excelente,2º,2,Norte,6.0,3,1700000.0
4,4,Más de 30 años,429,4,Bueno,0,3,Chapinero,6.0,3,2872000.0


In [5]:
df["Location3"].value_counts()

 Norte           4654
 Noroccidente    2484
 Occidente       1759
 Sur              674
 Chapinero        473
 Centro           191
Name: Location3, dtype: int64

In [6]:
df["Condition"] = df["Condition"].astype( str )
df["Stratum"] = df["Stratum"].astype( int ).astype( str )
df["Floor"] = df["Floor"].astype( str )

df["Location3"] = df["Location3"].apply( lambda x : x.replace(" " , ""))
df["Floor"] = df["Floor"].apply( lambda x : x.replace("º" , ""))

In [7]:
df["Area"].max()

48023

In [8]:
def prepare( df ):
    
    encoders = dict()
    transformed = dict()
    
    le = preprocessing.LabelEncoder()
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
    #df["Ages"] = le.fit_transform( df["Ages"] )
    df1 = le.fit_transform( df["Ages"] )
    #encoders.append( enc )
    encoders["age"] = le
    transformed["Ages"] = df1 
    
    #df1 = pd.get_dummies( df["Ages"] )
    
    
    le = preprocessing.LabelEncoder()
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
    df2 = le.fit_transform( df["Condition"] )
    #encoders.append( enc )
    encoders["condition"] = le
    transformed["Condition"] = df2
    
    le = preprocessing.LabelEncoder()
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
    df3 = le.fit_transform( df["Location3"] )
    encoders["location"] = le
    transformed["Location3"] = df3 
    
    
    
    le = preprocessing.LabelEncoder()
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
    #df["FloorId"] = le.fit_transform( df["FloorId"] )
    df4 = le.fit_transform( df["Floor"] )
    encoders["floor"] = le
    transformed["Floor"] = df4 
    
    le = preprocessing.LabelEncoder()
    enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
    df5 = le.fit_transform( df["Stratum"] )
    encoders["stratum"] = le
    transformed["Stratum"] = df5
    
    df_transformed = pd.DataFrame( transformed )
    
    ls = [ df_transformed , df["Area"] , df["Baths"]  , df["Rooms"] ]
    
    X = pd.concat( ls , axis = 1 )
    y = np.log( df["FormatedPrice"]/1e6 )
    
    return X.values , y.values , encoders 

In [9]:
X , y , encoders = prepare( df )

In [10]:
X.shape

(10235, 8)

In [11]:
learning_rate = 0.01
num_leaves = 15
min_data_in_leaf = 2000
feature_fraction = 0.8
num_boost_round = 5000
params = {"objective": "regression",
          "boosting_type": "gbdt",
          "learning_rate": learning_rate,
          "metric":["mse"] , 
          "num_leaves": num_leaves,
           "max_bin": 256,
          "feature_fraction": feature_fraction,
          "verbosity": 0,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9 , 
}

In [12]:
NFOLDS = 5
kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=218)


final_cv_train = np.zeros(len(y))
x_score = []

best_trees = []
for s in range(10):
    cv_train = np.zeros(len(y))
    params['seed'] = s
    
    if True:
        kf = kfold.split( X ,   y )
        fold_scores = []
        
        for i, (train_fold, validate) in enumerate(kf):
            X_train, X_validate, label_train, label_validate = X[train_fold, :], X[validate, :], y[train_fold], y[validate]
            dtrain = lgb.Dataset(X_train, label_train , categorical_feature = [ 0 , 1, 2, 3 , 4 ] )
            dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain , categorical_feature = [ 0 , 1, 2, 3 , 4 ])
            bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid , verbose_eval=100,early_stopping_rounds=100)
            best_trees.append(bst)
            
            cv_train[validate] += bst.predict(X_validate)
            
            
            score = mean_squared_error( label_validate, cv_train[validate] )
            print( score )
            fold_scores.append(score)

        final_cv_train += cv_train

        print("cv score:")
        print( mean_squared_error(y, cv_train))
        print( "current score:", mean_squared_error( y , final_cv_train / (s + 1.)), s+1)
        print(fold_scores)
        #print(best_trees, np.mean(best_trees))

        x_score.append(mean_squared_error( y , cv_train))



Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.172564
[200]	valid_0's l2: 0.0937846
[300]	valid_0's l2: 0.0767779
[400]	valid_0's l2: 0.0720791
[500]	valid_0's l2: 0.0703134
[600]	valid_0's l2: 0.0693602
[700]	valid_0's l2: 0.0688342
[800]	valid_0's l2: 0.0684545
[900]	valid_0's l2: 0.0682127
[1000]	valid_0's l2: 0.0680571
[1100]	valid_0's l2: 0.0678437
[1200]	valid_0's l2: 0.0677342
[1300]	valid_0's l2: 0.0676183
[1400]	valid_0's l2: 0.0675127
[1500]	valid_0's l2: 0.0673955
[1600]	valid_0's l2: 0.0672363
[1700]	valid_0's l2: 0.0671742
[1800]	valid_0's l2: 0.0670502
[1900]	valid_0's l2: 0.0669296
[2000]	valid_0's l2: 0.0668152
[2100]	valid_0's l2: 0.0666988
[2200]	valid_0's l2: 0.0665933
[2300]	valid_0's l2: 0.0665132
[2400]	valid_0's l2: 0.0664886
[2500]	valid_0's l2: 0.0664967
[2600]	valid_0's l2: 0.0664321
[2700]	valid_0's l2: 0.0664378
Early stopping, best iteration is:
[2646]	valid_0's l2: 0.0664088
0.06640880791373761
Training until validatio

0.14399033343769188
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.12576
[200]	valid_0's l2: 0.0595853
[300]	valid_0's l2: 0.0458711
[400]	valid_0's l2: 0.0421901
[500]	valid_0's l2: 0.0407412
[600]	valid_0's l2: 0.0399982
[700]	valid_0's l2: 0.0394781
[800]	valid_0's l2: 0.0392001
[900]	valid_0's l2: 0.0390167
[1000]	valid_0's l2: 0.0388641
[1100]	valid_0's l2: 0.0387387
[1200]	valid_0's l2: 0.0387071
[1300]	valid_0's l2: 0.0386035
[1400]	valid_0's l2: 0.0385117
[1500]	valid_0's l2: 0.0384829
[1600]	valid_0's l2: 0.0384475
[1700]	valid_0's l2: 0.0384124
[1800]	valid_0's l2: 0.0383797
[1900]	valid_0's l2: 0.0383697
[2000]	valid_0's l2: 0.0383551
[2100]	valid_0's l2: 0.0383383
[2200]	valid_0's l2: 0.0382882
[2300]	valid_0's l2: 0.0382812
[2400]	valid_0's l2: 0.0382428
[2500]	valid_0's l2: 0.0382261
[2600]	valid_0's l2: 0.0382113
[2700]	valid_0's l2: 0.038207
[2800]	valid_0's l2: 0.0381823
[2900]	valid_0's l2: 0.0381707
[3000]	valid_0's l2: 0.0381859

Early stopping, best iteration is:
[4299]	valid_0's l2: 0.065372
0.06537198453674523
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.22587
[200]	valid_0's l2: 0.153904
[300]	valid_0's l2: 0.139476
[400]	valid_0's l2: 0.135607
[500]	valid_0's l2: 0.134203
[600]	valid_0's l2: 0.133414
[700]	valid_0's l2: 0.133059
[800]	valid_0's l2: 0.132982
[900]	valid_0's l2: 0.132845
[1000]	valid_0's l2: 0.132748
[1100]	valid_0's l2: 0.13263
[1200]	valid_0's l2: 0.132585
[1300]	valid_0's l2: 0.132529
[1400]	valid_0's l2: 0.132479
[1500]	valid_0's l2: 0.132495
[1600]	valid_0's l2: 0.132428
[1700]	valid_0's l2: 0.132322
[1800]	valid_0's l2: 0.132314
[1900]	valid_0's l2: 0.132175
[2000]	valid_0's l2: 0.132056
[2100]	valid_0's l2: 0.131963
[2200]	valid_0's l2: 0.131911
[2300]	valid_0's l2: 0.131885
[2400]	valid_0's l2: 0.131846
[2500]	valid_0's l2: 0.1318
[2600]	valid_0's l2: 0.131758
Early stopping, best iteration is:
[2592]	valid_0's l2: 0.131751
0.1317507054567162
T

[900]	valid_0's l2: 0.0685925
[1000]	valid_0's l2: 0.0684166
[1100]	valid_0's l2: 0.0682448
[1200]	valid_0's l2: 0.0680703
[1300]	valid_0's l2: 0.06789
[1400]	valid_0's l2: 0.0677101
[1500]	valid_0's l2: 0.0675578
[1600]	valid_0's l2: 0.0674472
[1700]	valid_0's l2: 0.0673766
[1800]	valid_0's l2: 0.0672467
[1900]	valid_0's l2: 0.0671541
[2000]	valid_0's l2: 0.067074
[2100]	valid_0's l2: 0.0669826
[2200]	valid_0's l2: 0.0668578
[2300]	valid_0's l2: 0.0667379
[2400]	valid_0's l2: 0.0666915
[2500]	valid_0's l2: 0.0666439
[2600]	valid_0's l2: 0.0666404
[2700]	valid_0's l2: 0.0666238
[2800]	valid_0's l2: 0.0665861
[2900]	valid_0's l2: 0.0665413
[3000]	valid_0's l2: 0.0665203
[3100]	valid_0's l2: 0.0665069
[3200]	valid_0's l2: 0.0664822
[3300]	valid_0's l2: 0.0664693
[3400]	valid_0's l2: 0.0664315
[3500]	valid_0's l2: 0.0664224
[3600]	valid_0's l2: 0.0663968
[3700]	valid_0's l2: 0.0663745
[3800]	valid_0's l2: 0.0663886
Early stopping, best iteration is:
[3754]	valid_0's l2: 0.0663616
0.066361

[1700]	valid_0's l2: 0.0384325
[1800]	valid_0's l2: 0.0384105
[1900]	valid_0's l2: 0.0383506
[2000]	valid_0's l2: 0.0383318
[2100]	valid_0's l2: 0.0382954
[2200]	valid_0's l2: 0.0382506
[2300]	valid_0's l2: 0.0382211
[2400]	valid_0's l2: 0.0382165
[2500]	valid_0's l2: 0.0382018
[2600]	valid_0's l2: 0.0382024
Early stopping, best iteration is:
[2580]	valid_0's l2: 0.0381928
0.03819281208937638
cv score:
0.08925003060923895
current score: 0.0891806487346206 7
[0.06681686371455038, 0.06563367071015905, 0.1318135179323028, 0.1437932885998061, 0.03819281208937638]
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.172733
[200]	valid_0's l2: 0.0942412
[300]	valid_0's l2: 0.0770116
[400]	valid_0's l2: 0.0723427
[500]	valid_0's l2: 0.0704749
[600]	valid_0's l2: 0.0695029
[700]	valid_0's l2: 0.0689409
[800]	valid_0's l2: 0.068543
[900]	valid_0's l2: 0.0683255
[1000]	valid_0's l2: 0.0681195
[1100]	valid_0's l2: 0.0679427
[1200]	valid_0's l2: 0.0677957
[1300]	val

[700]	valid_0's l2: 0.0393236
[800]	valid_0's l2: 0.0390737
[900]	valid_0's l2: 0.0389065
[1000]	valid_0's l2: 0.0387699
[1100]	valid_0's l2: 0.0386495
[1200]	valid_0's l2: 0.0385736
[1300]	valid_0's l2: 0.038516
[1400]	valid_0's l2: 0.0384182
[1500]	valid_0's l2: 0.0383537
[1600]	valid_0's l2: 0.038343
[1700]	valid_0's l2: 0.0383206
[1800]	valid_0's l2: 0.0383004
[1900]	valid_0's l2: 0.0382905
[2000]	valid_0's l2: 0.038292
Early stopping, best iteration is:
[1916]	valid_0's l2: 0.0382817
0.038281700988002175
cv score:
0.08934454885262162
current score: 0.08917413978764507 9
[0.06642777019992892, 0.06562924905487665, 0.1325264436883684, 0.14385758033193194, 0.038281700988002175]
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.173614
[200]	valid_0's l2: 0.0948406
[300]	valid_0's l2: 0.0774712
[400]	valid_0's l2: 0.0725573
[500]	valid_0's l2: 0.0705724
[600]	valid_0's l2: 0.0696137
[700]	valid_0's l2: 0.0690889
[800]	valid_0's l2: 0.068745
[900]	valid

In [13]:
np.array ( x_score).mean()

0.08934710281117901

In [14]:
len( x_score )

10

In [15]:
len( best_trees[::5] )

10

In [16]:
len( encoders )

5

In [17]:
import pickle 

In [18]:

pickle.dump( encoders, open( "../data/encoders.p", "wb" ) , protocol = 2 )

In [19]:
pickle.dump( best_trees[::5], open( "../data/models.p", "wb" ) , protocol = 2 )

In [20]:
X.shape

(10235, 8)

In [21]:
encoders["stratum"].classes_

array(['0', '1', '2', '3', '4', '5', '6'], dtype=object)

In [22]:
encoders["floor"].classes_

array(['0.0', '1', '10', '11', '12', '13', '14', '15', '16ª', '2', '3',
       '4', '5', '6', '7', '8', '9', 'Otros'], dtype=object)

In [23]:
encoders["age"].classes_

array(['1 a 8 años', '16 a 30 años', '9 a 15 años', 'Menos de 1 año',
       'Más de 30 años'], dtype=object)