In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# plots
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
%matplotlib inline

## Partimos de los datos genereados por Notebook creacionFeatures

In [2]:
df_train = pd.read_csv('../data/train_con_features.csv', index_col='Unnamed: 0')
df_test = pd.read_csv('../data/test_con_features.csv', index_col='Unnamed: 0')

In [3]:
df_train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'cantidad_amenities',
       'cantidad_servicios', 'relacion_metros', 'metros_por_habitacion',
       'metros_por_bano', 'metros_por_amenities', 'bano_por_habitacion',
       'garages_por_habitacion', 'precio_prom_ciudad', 'precio_prom_propiedad',
       'provincia_top5', 'es_Apartamento', 'es_Casa_en_condominio', 'es_Casa',
       'es_Terreno', 'es_Terreno_comercial', 'es_Local_Comercial',
       'es_Quinta_Vacacional', 'es_Oficina_comercial', 'es_Edificio',
       'es_Casa_uso_de_suelo', 'es_Local_en_centro_comercial',
       'es_Bodega_comercial', 'es_Otros', 'es_Villa', 'es_Duplex',
       'es_Inmuebles_productivos_urbanos', 'es_Departamento_Compar

### Eliminamos los features generados relacionados con TipoPropiedad

In [4]:
lst_tipos_propiedades = ['es_Apartamento', 'es_Casa_en_condominio', 'es_Casa', 'es_Terreno',
                         'es_Terreno_comercial', 'es_Local_Comercial', 'es_Quinta_Vacacional',
                         'es_Oficina_comercial', 'es_Edificio', 'es_Casa_uso_de_suelo',
                         'es_Local_en_centro_comercial', 'es_Bodega_comercial', 'es_Otros',
                         'es_Villa', 'es_Duplex', 'es_Inmuebles_productivos_urbanos',
                         'es_Departamento_Compartido', 'es_Nave_industrial', 'es_Rancho',
                         'es_Terreno_industrial', 'es_Huerta', 'es_Lote', 'es_Hospedaje', 'es_Garage']

df_train = df_train.drop(lst_tipos_propiedades, axis=1)
df_test = df_test.drop(lst_tipos_propiedades, axis=1)
df_train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'cantidad_amenities',
       'cantidad_servicios', 'relacion_metros', 'metros_por_habitacion',
       'metros_por_bano', 'metros_por_amenities', 'bano_por_habitacion',
       'garages_por_habitacion', 'precio_prom_ciudad', 'precio_prom_propiedad',
       'provincia_top5', 'seguridad_descripcion', 'moderno_descripcion',
       'hermoso_descripcion', 'estado_descripcion', 'seguridad_titulo',
       'moderno_titulo', 'hermoso_titulo', 'estado_titulo',
       'cantidad_palabras_descripcion', 'relacion_palabras_descripcion',
       'es_avenida', 'es_ciudad_capital', 'ciudad_turistica_top15', 'anio',
       'mes'],
      dtype='object')

### Eliminamos features repetidos y complejos

In [5]:
drop = ['titulo', 'descripcion', 'direccion', 'fecha']

df_train = df_train.drop(drop, axis=1)
df_test = df_test.drop(drop, axis=1)
df_train.head()

Unnamed: 0,id,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,...,moderno_titulo,hermoso_titulo,estado_titulo,cantidad_palabras_descripcion,relacion_palabras_descripcion,es_avenida,es_ciudad_capital,ciudad_turistica_top15,anio,mes
0,254099,Apartamento,Benito Juárez,Distrito Federal,8.116114,2.0,1.0,2.0,80.0,80.0,...,0,0,0,0,0.0,1,0,0,2015,8
1,53461,Casa en condominio,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,2.0,268.0,180.0,...,0,0,0,1,0.007968,1,0,0,2013,6
2,247984,Casa,Tonalá,Jalisco,5.0,3.0,2.0,2.0,144.0,166.0,...,0,0,0,10,0.046957,0,0,0,2015,10
3,209067,Casa,Zinacantepec,Edo. de México,1.0,2.0,1.0,1.0,63.0,67.0,...,0,0,0,0,0.0,0,0,0,2012,3
4,185997,Apartamento,Zapopan,Jalisco,10.0,2.0,1.0,1.0,95.0,95.0,...,0,0,0,1,0.074108,0,0,0,2016,6


In [6]:
df_train.shape

(240000, 45)

## Aplicamos encodings a los features categoricos

In [7]:
import category_encoders as ce

cat_features = ['tipodepropiedad', 'ciudad', 'provincia']

cb_features = ['ciudad']
oh_features = ['tipodepropiedad', 'provincia']

#Elegimos un metodo de encode para estos features, y comentamos el resto de los metodos
#O elegimos distintos metodos dependiendo del feature

### CatBoost Encoding

In [8]:
catb_enc = ce.CatBoostEncoder(cols=cat_features)
catb_enc.fit(df_train[cat_features], df_train['precio'])

df_train = df_train.join(catb_enc.transform(df_train[cat_features]).add_suffix('_cb'))
df_test = df_test.join(catb_enc.transform(df_test[cat_features]).add_suffix('_cb'))

### Target Encoding

In [9]:
'''target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(df_train[cat_features], df_train['precio'])

df_train = df_train.join(target_enc.transform(df_train[cat_features]).add_suffix('_target'))
df_test = df_test.join(target_enc.transform(df_test[cat_features]).add_suffix('_target'))'''

"target_enc = ce.TargetEncoder(cols=cat_features)\ntarget_enc.fit(df_train[cat_features], df_train['precio'])\n\ndf_train = df_train.join(target_enc.transform(df_train[cat_features]).add_suffix('_target'))\ndf_test = df_test.join(target_enc.transform(df_test[cat_features]).add_suffix('_target'))"

### Count Encoding

In [10]:
#NO USAR!! GENERA OVERFITING
'''count_enc = ce.CountEncoder()
count_encoded = count_enc.fit_transform(df_train[cat_features])

df_train = df_train.join(count_encoded.add_suffix("_count"))
df_test = df_test.join(count_encoded.add_suffix("_count"))'''

'count_enc = ce.CountEncoder()\ncount_encoded = count_enc.fit_transform(df_train[cat_features])\n\ndf_train = df_train.join(count_encoded.add_suffix("_count"))\ndf_test = df_test.join(count_encoded.add_suffix("_count"))'

### Binary Encoding

In [11]:
'''binary_enc = ce.BinaryEncoder()
binary_encoded = binary_enc.fit_transform(df_train[cat_features])

df_train = df_train.join(binary_encoded.add_suffix("_binary"))
df_test = df_test.join(binary_encoded.add_suffix("_binary"))'''

'binary_enc = ce.BinaryEncoder()\nbinary_encoded = binary_enc.fit_transform(df_train[cat_features])\n\ndf_train = df_train.join(binary_encoded.add_suffix("_binary"))\ndf_test = df_test.join(binary_encoded.add_suffix("_binary"))'

### One Hot Encoding

In [12]:
'''one_hot_enc = ce.OneHotEncoder()
one_hot_encoded = one_hot_enc.fit_transform(df_train[oh_features])

df_train = df_train.join(one_hot_encoded.add_suffix("_oh"))
df_test = df_test.join(one_hot_encoded.add_suffix("_oh"))'''

'one_hot_enc = ce.OneHotEncoder()\none_hot_encoded = one_hot_enc.fit_transform(df_train[oh_features])\n\ndf_train = df_train.join(one_hot_encoded.add_suffix("_oh"))\ndf_test = df_test.join(one_hot_encoded.add_suffix("_oh"))'

In [13]:
df_train.shape

(240000, 48)

In [14]:
df_test.shape

(60000, 47)

In [15]:
#Dropeamos los features con strings ya encodeados
#Dejamos los dataset listos para modelar
df_train = df_train.drop(cat_features, axis=1)
df_test = df_test.drop(cat_features, axis=1)

## Feature Selection

In [16]:
from catboost import CatBoostRegressor

feature_cols = df_train.columns.tolist()
feature_cols.remove('precio')

X = df_train[feature_cols]
y = df_train['precio']

CatBoost = CatBoostRegressor(loss_function='MAE')
CatBoost_fit = CatBoost.fit(X, y)

0:	learn: 1490623.6351114	total: 124ms	remaining: 2m 4s
1:	learn: 1464769.0244118	total: 195ms	remaining: 1m 37s
2:	learn: 1438271.4563704	total: 268ms	remaining: 1m 29s
3:	learn: 1413165.8561171	total: 347ms	remaining: 1m 26s
4:	learn: 1389467.6304808	total: 421ms	remaining: 1m 23s
5:	learn: 1366070.6569391	total: 495ms	remaining: 1m 21s
6:	learn: 1342229.4118884	total: 574ms	remaining: 1m 21s
7:	learn: 1320036.7122093	total: 654ms	remaining: 1m 21s
8:	learn: 1299420.6922933	total: 730ms	remaining: 1m 20s
9:	learn: 1278535.1897723	total: 813ms	remaining: 1m 20s
10:	learn: 1259489.6793438	total: 895ms	remaining: 1m 20s
11:	learn: 1239847.1230370	total: 973ms	remaining: 1m 20s
12:	learn: 1221367.5898744	total: 1.06s	remaining: 1m 20s
13:	learn: 1203233.9643008	total: 1.13s	remaining: 1m 19s
14:	learn: 1186716.6545049	total: 1.21s	remaining: 1m 19s
15:	learn: 1171221.0454911	total: 1.29s	remaining: 1m 19s
16:	learn: 1155703.4861286	total: 1.37s	remaining: 1m 19s
17:	learn: 1140803.669668

144:	learn: 732747.4240909	total: 11.2s	remaining: 1m 6s
145:	learn: 732247.6458405	total: 11.3s	remaining: 1m 6s
146:	learn: 731664.4494773	total: 11.4s	remaining: 1m 5s
147:	learn: 731156.2287782	total: 11.4s	remaining: 1m 5s
148:	learn: 730817.9783195	total: 11.5s	remaining: 1m 5s
149:	learn: 729966.0622744	total: 11.6s	remaining: 1m 5s
150:	learn: 729619.4047282	total: 11.7s	remaining: 1m 5s
151:	learn: 728905.7610799	total: 11.8s	remaining: 1m 5s
152:	learn: 727780.9796615	total: 11.8s	remaining: 1m 5s
153:	learn: 727290.8352105	total: 11.9s	remaining: 1m 5s
154:	learn: 726900.9264025	total: 12s	remaining: 1m 5s
155:	learn: 726566.8422946	total: 12.1s	remaining: 1m 5s
156:	learn: 726155.4926530	total: 12.1s	remaining: 1m 5s
157:	learn: 725113.2948898	total: 12.2s	remaining: 1m 5s
158:	learn: 724790.4697044	total: 12.3s	remaining: 1m 4s
159:	learn: 724460.4843715	total: 12.4s	remaining: 1m 4s
160:	learn: 724028.4334628	total: 12.4s	remaining: 1m 4s
161:	learn: 723563.5191669	total:

290:	learn: 677918.9408812	total: 22.7s	remaining: 55.4s
291:	learn: 677778.3680482	total: 22.8s	remaining: 55.3s
292:	learn: 677588.4238140	total: 22.9s	remaining: 55.3s
293:	learn: 677325.6826297	total: 23s	remaining: 55.2s
294:	learn: 676831.6397861	total: 23.1s	remaining: 55.1s
295:	learn: 676573.3545831	total: 23.1s	remaining: 55s
296:	learn: 676314.2909167	total: 23.2s	remaining: 55s
297:	learn: 676050.4600607	total: 23.3s	remaining: 54.9s
298:	learn: 675715.9378730	total: 23.4s	remaining: 54.8s
299:	learn: 675546.2682651	total: 23.5s	remaining: 54.7s
300:	learn: 675282.0869982	total: 23.5s	remaining: 54.7s
301:	learn: 675224.3976321	total: 23.6s	remaining: 54.6s
302:	learn: 675049.6949584	total: 23.7s	remaining: 54.5s
303:	learn: 674896.2342581	total: 23.8s	remaining: 54.4s
304:	learn: 674339.2646300	total: 23.8s	remaining: 54.3s
305:	learn: 674058.7472898	total: 23.9s	remaining: 54.2s
306:	learn: 673863.5857901	total: 24s	remaining: 54.2s
307:	learn: 673758.4937187	total: 24.1s

437:	learn: 650733.2314020	total: 35.9s	remaining: 46s
438:	learn: 650684.7490156	total: 35.9s	remaining: 45.9s
439:	learn: 650497.5322136	total: 36s	remaining: 45.8s
440:	learn: 650352.2682554	total: 36.1s	remaining: 45.8s
441:	learn: 650275.2531982	total: 36.2s	remaining: 45.7s
442:	learn: 649861.2573231	total: 36.2s	remaining: 45.6s
443:	learn: 649754.4137538	total: 36.3s	remaining: 45.5s
444:	learn: 649540.1108607	total: 36.4s	remaining: 45.4s
445:	learn: 649384.0090398	total: 36.5s	remaining: 45.3s
446:	learn: 649241.3248520	total: 36.6s	remaining: 45.2s
447:	learn: 648941.2643691	total: 36.6s	remaining: 45.1s
448:	learn: 648840.5516752	total: 36.7s	remaining: 45.1s
449:	learn: 648608.8371355	total: 36.8s	remaining: 45s
450:	learn: 648518.5472394	total: 36.9s	remaining: 44.9s
451:	learn: 648439.1511441	total: 37s	remaining: 44.8s
452:	learn: 648324.2583815	total: 37.1s	remaining: 44.8s
453:	learn: 648149.2018694	total: 37.1s	remaining: 44.7s
454:	learn: 648047.0883364	total: 37.2s

584:	learn: 632203.7532989	total: 47.5s	remaining: 33.7s
585:	learn: 632084.0731784	total: 47.6s	remaining: 33.6s
586:	learn: 631987.4539507	total: 47.7s	remaining: 33.5s
587:	learn: 631927.1022083	total: 47.8s	remaining: 33.5s
588:	learn: 631854.1967083	total: 47.8s	remaining: 33.4s
589:	learn: 631779.0215720	total: 47.9s	remaining: 33.3s
590:	learn: 631703.2283488	total: 48s	remaining: 33.2s
591:	learn: 631623.9354348	total: 48.1s	remaining: 33.1s
592:	learn: 631587.6875850	total: 48.1s	remaining: 33s
593:	learn: 631493.6118474	total: 48.2s	remaining: 33s
594:	learn: 631405.8617715	total: 48.3s	remaining: 32.9s
595:	learn: 631324.1559768	total: 48.4s	remaining: 32.8s
596:	learn: 631244.2195942	total: 48.5s	remaining: 32.7s
597:	learn: 631093.1860614	total: 48.5s	remaining: 32.6s
598:	learn: 630987.0431444	total: 48.6s	remaining: 32.5s
599:	learn: 630914.1197218	total: 48.7s	remaining: 32.5s
600:	learn: 630815.4446574	total: 48.8s	remaining: 32.4s
601:	learn: 630780.3802346	total: 48.

730:	learn: 619681.0661345	total: 59s	remaining: 21.7s
731:	learn: 619632.5830106	total: 59.1s	remaining: 21.6s
732:	learn: 619483.7977087	total: 59.2s	remaining: 21.6s
733:	learn: 619465.4590934	total: 59.3s	remaining: 21.5s
734:	learn: 619396.9787856	total: 59.3s	remaining: 21.4s
735:	learn: 619357.9516380	total: 59.4s	remaining: 21.3s
736:	learn: 619270.8608907	total: 59.5s	remaining: 21.2s
737:	learn: 619226.1348647	total: 59.6s	remaining: 21.2s
738:	learn: 619111.4309469	total: 59.7s	remaining: 21.1s
739:	learn: 619024.9010019	total: 59.7s	remaining: 21s
740:	learn: 618977.6933149	total: 59.8s	remaining: 20.9s
741:	learn: 618927.0104442	total: 59.9s	remaining: 20.8s
742:	learn: 618883.2344022	total: 60s	remaining: 20.7s
743:	learn: 618866.2673319	total: 1m	remaining: 20.7s
744:	learn: 618738.5107079	total: 1m	remaining: 20.6s
745:	learn: 618678.3943830	total: 1m	remaining: 20.5s
746:	learn: 618602.6650015	total: 1m	remaining: 20.4s
747:	learn: 618522.0674659	total: 1m	remaining: 2

876:	learn: 610347.6902591	total: 1m 10s	remaining: 9.92s
877:	learn: 610330.9086327	total: 1m 10s	remaining: 9.84s
878:	learn: 610264.6693514	total: 1m 10s	remaining: 9.76s
879:	learn: 610225.9533847	total: 1m 10s	remaining: 9.68s
880:	learn: 610186.1133977	total: 1m 11s	remaining: 9.6s
881:	learn: 610165.1869104	total: 1m 11s	remaining: 9.52s
882:	learn: 610138.2201531	total: 1m 11s	remaining: 9.44s
883:	learn: 610076.8136706	total: 1m 11s	remaining: 9.36s
884:	learn: 610036.2338546	total: 1m 11s	remaining: 9.28s
885:	learn: 609964.5146277	total: 1m 11s	remaining: 9.2s
886:	learn: 609924.7496205	total: 1m 11s	remaining: 9.11s
887:	learn: 609853.0272531	total: 1m 11s	remaining: 9.03s
888:	learn: 609786.6942535	total: 1m 11s	remaining: 8.95s
889:	learn: 609700.9105945	total: 1m 11s	remaining: 8.87s
890:	learn: 609666.2023498	total: 1m 11s	remaining: 8.79s
891:	learn: 609627.4048821	total: 1m 11s	remaining: 8.71s
892:	learn: 609536.9347468	total: 1m 12s	remaining: 8.63s
893:	learn: 6094

In [17]:
features = pd.DataFrame(index=feature_cols)
features['imp'] = CatBoost_fit.feature_importances_
features = features.sort_values(['imp'], ascending = False)

'''plt.style.use('default')
plt.rcParams['figure.figsize'] = (10, 10)
sns.set(style="whitegrid")

g = sns.barplot(y=features.index, x=features.imp, \
                palette=sns.color_palette("Reds_d", 10));

g.set_title('Importancia de Features de CATBoost', fontsize=15);
g.set_xlabel('Valor');
g.set_ylabel('Nombre del Feature');'''

'plt.style.use(\'default\')\nplt.rcParams[\'figure.figsize\'] = (10, 10)\nsns.set(style="whitegrid")\n\ng = sns.barplot(y=features.index, x=features.imp,                 palette=sns.color_palette("Reds_d", 10));\n\ng.set_title(\'Importancia de Features de CATBoost\', fontsize=15);\ng.set_xlabel(\'Valor\');\ng.set_ylabel(\'Nombre del Feature\');'

### Nos quedamos con los k features mas importantes

In [18]:
k=30

features_to_drop = features.nsmallest((df_train.columns.size - k), 'imp').index.tolist()
if 'id' in features_to_drop: features_to_drop.remove('id')
features_to_drop


['moderno_titulo',
 'seguridad_titulo',
 'es_avenida',
 'estado_descripcion',
 'cantidad_servicios',
 'estado_titulo',
 'usosmultiples',
 'seguridad_descripcion',
 'hermoso_titulo',
 'provincia_top5',
 'es_ciudad_capital',
 'escuelascercanas',
 'hermoso_descripcion',
 'moderno_descripcion']

In [19]:
#Dropeamos los features menos importantes
df_train = df_train.drop(features_to_drop, axis=1)
df_test = df_test.drop(features_to_drop, axis=1)

## Guardamos features

In [20]:
df_train.to_csv('../data/train_con_features_encoded.csv')
df_test.to_csv('../data/test_con_features_encoded.csv')