In [1]:
import pandas as pd
import math

from sklearn import metrics
from collections import OrderedDict, defaultdict
from scipy import sparse
from sklearn.preprocessing import LabelBinarizer
import gc
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords

from keras.models import Model
from keras.layers import *
from keras.layers.merge import concatenate, dot, multiply, add
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from keras.optimizers import *
from keras import backend as K
from keras import losses
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from skopt import gp_minimize
from hyperopt import *

from tqdm import tqdm

from pathlib import PurePath

from fastai.structured import *
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Evaluation metrics
def RMSLE(actual, pred):
    """Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    actual, pred : array-like
        Observed and predicted values with matching (or broadcastable)
        shapes. Values are compared positionally. Every value must be
        greater than -1 for the logarithm to be defined.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((log(1 + actual) - log(1 + pred)) ** 2))``.
    """
    # Coerce to float arrays so plain lists/tuples are accepted as well.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # log1p(x) == log(1 + x) but stays accurate when x is close to zero.
    log_diff = np.log1p(actual) - np.log1p(pred)
    return np.sqrt(np.mean(log_diff ** 2))

def RMSE(actual, pred):
    """Root mean squared error between targets and predictions.

    Parameters
    ----------
    actual, pred : array-like
        Observed and predicted values with matching (or broadcastable)
        shapes. Values are compared positionally.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((actual - pred) ** 2))``.
    """
    # Coerce to float arrays so plain lists/tuples are accepted as well.
    residuals = np.asarray(actual, dtype=float) - np.asarray(pred, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

In [12]:
def tokenizer_fit(col, df_train, df_test):
    """Fit a Keras ``Tokenizer`` on one training column and encode train and test.

    Parameters
    ----------
    col : str
        Name of the column to tokenize; must exist in both frames.
    df_train, df_test : pandas.DataFrame
        Training and test frames. Only ``df_train[col]`` is used for fitting.

    Returns
    -------
    tuple
        ``(train_sequences, test_sequences, tokenizer)``: the ``np.array`` of
        token-id sequences for each frame and the fitted tokenizer.

    Notes
    -----
    Values are cast to ``str`` before tokenizing, so numeric or label-encoded
    columns are accepted and missing values become the token ``'nan'``.
    NOTE(review): no ``oov_token`` is configured, so test values never seen in
    training presumably yield empty sequences -- confirm downstream handling.
    """
    # The tokenizer lower-cases each entry via ``str.lower`` and therefore
    # fails on non-string values; normalise to strings first.
    train_texts = df_train[col].astype(str)
    test_texts = df_test[col].astype(str)

    # filters='' keeps every character, so entries are only split on spaces.
    tokenizer = Tokenizer(filters='', lower=True)
    tokenizer.fit_on_texts(train_texts)
    train_sequences = np.array(tokenizer.texts_to_sequences(train_texts))
    test_sequences = np.array(tokenizer.texts_to_sequences(test_texts))

    return train_sequences, test_sequences, tokenizer

In [3]:
def objective_nn():
    """Build the (uncompiled) Keras model over categorical, numeric and text inputs.

    The Keras session is cleared first. The network consists of:

    * eight single-id categorical inputs, each passed through its own
      ``Embedding`` and flattened with ``Reshape``;
    * two scalar inputs (``inp_price`` and ``inp_itemseq``) fed in unchanged;
    * one token-sequence input (``inp_descr_1``) embedded and summarised by a
      50-unit ``GRU``;
    * a dense head (500 -> 64 -> 1) with dropout after each hidden layer.

    The following module-level names are read when the function is called:
    ``len_*`` / ``size_emb_*`` for region, parentcatname, catname, usertype,
    city, week, day_month and imgt1, plus ``vocabulary_size``,
    ``size_emb_title``, ``maxlen_description`` and ``dropout``.

    Returns
    -------
    keras.models.Model
        Inputs, in order: region, parent category, category, user type, city,
        week, day of month, imgt1, price, itemseq, description tokens.
        Output: a single sigmoid unit named ``'output'``.

    NOTE(review): the sigmoid output is bounded to (0, 1) -- confirm the
    training target is scaled to that range.
    """
    K.clear_session()

    def categorical_branch(tag, cardinality, emb_dim):
        """Return ``(input, flattened embedding)`` for one single-id feature."""
        inp = Input(shape=(1, ), name='inp_' + tag)
        emb = Embedding(cardinality, emb_dim, name='emb_' + tag)(inp)
        flat = Reshape(target_shape=(emb_dim,))(emb)
        return inp, flat

    # Categorical features (created in the same order as the model inputs).
    inp_region, emb_region = categorical_branch('region', len_region, size_emb_region)
    inp_parentcatname, emb_parentcatname = categorical_branch(
        'parent_category_name', len_parentcatname, size_emb_parentcatname)
    inp_catname, emb_catname = categorical_branch('category_name', len_catname, size_emb_catname)
    inp_usertype, emb_usertype = categorical_branch('user_type', len_usertype, size_emb_usertype)
    inp_city, emb_city = categorical_branch('city', len_city, size_emb_city)
    inp_week, emb_week = categorical_branch('week', len_week, size_emb_week)
    inp_day, emb_day = categorical_branch('day_of_month', len_day_month, size_emb_day_month)
    inp_imgt1, emb_imgt1 = categorical_branch('imgt1', len_imgt1, size_emb_imgt1)

    # Scalar features are concatenated as-is (no noise / dense projection).
    inp_price = Input(shape=(1, ), name='inp_price')
    inp_itemseq = Input(shape=(1, ), name='inp_itemseq')

    tabular_parts = [
        emb_region, emb_parentcatname, emb_catname, emb_usertype, emb_city,
        emb_week, emb_day, emb_imgt1, inp_price, inp_itemseq,
    ]
    conc_cat_and_cont = concatenate(tabular_parts, axis=-1, name='concat_cat_and_cont')

    # Text channel: the embedding layer is named 'emb_title_1' but is applied
    # to the description tokens. The title channel, Conv1D/pooling variants and
    # engineered-feature input that were disabled experiments are omitted here.
    embedding = Embedding(vocabulary_size, size_emb_title, name='emb_title_1')
    input_descr_1 = Input(shape=(maxlen_description, ), name='inp_descr_1')
    embedding_for_descr = embedding(input_descr_1)
    descr_layer = GRU(50)(embedding_for_descr)

    conc_all = concatenate([conc_cat_and_cont, descr_layer], axis=-1)

    # Dense head: each hidden layer is followed by dropout.
    hidden = conc_all
    for units in (500, 64):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(dropout)(hidden)
    y = Dense(1, activation='sigmoid', name='output')(hidden)

    model_inputs = [
        inp_region, inp_parentcatname, inp_catname, inp_usertype, inp_city,
        inp_week, inp_day, inp_imgt1, inp_price, inp_itemseq,
        input_descr_1,
    ]
    return Model(inputs=model_inputs, outputs=y)

In [4]:
# Load the raw train and test sets; `fecha` is parsed as a datetime in both.
# The training frame is indexed by `id`; the test frame keeps the default
# integer index, so any `id` column in test.csv stays a regular column.
df = pd.read_csv('train.csv', index_col='id', parse_dates=['fecha'])
df_test = pd.read_csv('test.csv', parse_dates=['fecha'])

In [5]:
# Free-text columns: replace missing values with '' and lower-case the text.
text_features = ["titulo", "descripcion", "direccion"]

for col in text_features:
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: that chained in-place form warns in recent pandas and
    # does not modify the parent frame under Copy-on-Write.
    df[col] = df[col].fillna('').str.lower()
    df_test[col] = df_test[col].fillna('').str.lower()

In [6]:
# Derive year ('año') and month ('mes') features from the `fecha` timestamp
# in both frames, then drop the raw timestamp column.
df['año'] = df['fecha'].dt.year
df['mes'] = df['fecha'].dt.month

df_test['año'] = df_test['fecha'].dt.year
df_test['mes'] = df_test['fecha'].dt.month

df.drop(columns=['fecha'], inplace=True)
df_test.drop(columns=['fecha'], inplace=True)

In [7]:
# Lower-case the location columns so that the same city / province spelled
# with different casing ends up as a single category.
df[['ciudad', 'provincia']] = df[['ciudad', 'provincia']].apply(lambda s: s.str.lower())

df_test[['ciudad', 'provincia']] = df_test[['ciudad', 'provincia']].apply(lambda s: s.str.lower())

In [8]:
# Label-encode every object-dtype column except the free-text ones.
# Each column's encoder is fitted on train and test together, so categories
# that only occur in the test set still receive a code.
le = LabelEncoder()
# sort=False: the two frames do not share exactly the same columns; stating
# the column-sorting behaviour explicitly avoids pandas' FutureWarning. The
# column order of df_all does not affect the encodings produced below.
df_all = pd.concat([df, df_test], axis=0, sort=False)
for col in df_all:
    # Free-text columns are tokenized separately, not label-encoded.
    if col in ('titulo', 'descripcion', 'direccion'):
        continue

    if df_all[col].dtype == 'object':
        # astype(str) turns missing values into the literal category 'nan'.
        le.fit(df_all[col].astype(str))
        # Transform both training and testing data
        df[col] = le.transform(df[col].astype(str))
        df_test[col] = le.transform(df_test[col].astype(str))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


In [11]:
# Shared word-level tokenizer for the three free-text columns, fitted on the
# training texts only and limited to `vocabulary_size` word indices.
vocabulary_size = 10000

# `lower` must be the boolean True: any non-empty string (even 'False') is
# truthy, so passing a string here hides the intent.
tokenizer = Tokenizer(num_words=vocabulary_size, lower=True)
tokenizer.fit_on_texts(np.concatenate([df['titulo'].values,
                                       df['descripcion'].values,
                                       df['direccion'].values]))

# Convert each text column to sequences of word indices.
title_train = tokenizer.texts_to_sequences(df['titulo'].values)
title_test = tokenizer.texts_to_sequences(df_test['titulo'].values)

descr_train = tokenizer.texts_to_sequences(df['descripcion'].values)
descr_test = tokenizer.texts_to_sequences(df_test['descripcion'].values)

dir_train = tokenizer.texts_to_sequences(df['direccion'].values)
dir_test = tokenizer.texts_to_sequences(df_test['direccion'].values)

# Pad / truncate every sequence to a fixed length per column.
maxlen_title = 30
title_train_pad = pad_sequences(title_train, maxlen=maxlen_title)
title_test_pad = pad_sequences(title_test, maxlen=maxlen_title)

maxlen_description = 50
descr_train_pad = pad_sequences(descr_train, maxlen=maxlen_description)
descr_test_pad = pad_sequences(descr_test, maxlen=maxlen_description)

# `maxlen_params` is the padding length used for the `direccion` sequences.
maxlen_params = 30
dir_train_pad = pad_sequences(dir_train, maxlen=maxlen_params)
dir_test_pad = pad_sequences(dir_test, maxlen=maxlen_params)

In [None]:
# Encode each categorical column with its own tokenizer via `tokenizer_fit`,
# keeping the train sequences, test sequences and fitted tokenizer per column.
# NOTE(review): by this point `ciudad` and `provincia` have been label-encoded
# to integers and `año` / `mes` are integers taken from `fecha.dt`; a Keras
# Tokenizer expects strings -- confirm `tokenizer_fit` copes with non-string
# values (this cell has no execution count, so it may never have been run).
train_tipodepropiedad, test_tipodepropiedad, tokenizer_tipodepropiedad = tokenizer_fit('tipodepropiedad', df, df_test)
train_idzona, test_idzona, tokenizer_idzona = tokenizer_fit('idzona', df, df_test)
train_ciudad, test_ciudad, tokenizer_ciudad = tokenizer_fit('ciudad', df, df_test)
train_provincia, test_provincia, tokenizer_provincia = tokenizer_fit('provincia', df, df_test)
train_año, test_año, tokenizer_año = tokenizer_fit('año', df, df_test)
train_mes, test_mes, tokenizer_mes = tokenizer_fit('mes', df, df_test)

In [None]:
## categorical
# Embedding-size constants, one per feature.
# NOTE(review): none of these names is read by `objective_nn` as defined in
# this notebook (it reads size_emb_region, size_emb_city, ...); presumably a
# later model definition consumes them -- confirm.
# NOTE(review): 19719 (idzona) and 876 (ciudad) look like category counts
# rather than embedding widths -- verify they are intended as output dims.
size_emb_tipodepropiedad = 25
size_emb_idzona = 19719
size_emb_ciudad = 876
size_emb_provincia = 33

# Date parts.
size_emb_año = 5
size_emb_mes = 12

size_emb_antiguedad = 1
size_emb_habitaciones = 1

# The remaining features all use size 2.
size_emb_garages = 2
size_emb_banos = 2
size_emb_gimnasio = 2
size_emb_usosmultiples = 2
size_emb_piscina = 2
size_emb_escuelascercanas = 2
size_emb_centroscomercialescercanos = 2