In [1]:
import numpy as np
import pandas as pd
import csv
from time import time

from sklearn.model_selection import GridSearchCV
from datetime import datetime
import category_encoders as ce

In [2]:
train = pd.read_csv('train.csv')

In [3]:
#Capitales de los Estados de México
capitales=['Aguascalientes','Mexicali','La Paz','Campeche','Tuxtla Gutiérrez','Chihuahua','Saltillo','Colima','Durango','Guanajuato','Chilpancingo de los Bravo','Pachuca','Guadalajara','Toluca','Morelia','Cuernavaca','Tepic','Monterrey','Oaxaca de Juárez','Puebla','Querétaro','Chetumal','San Luis Potosí','Culiacán','Hermosillo','Villahermosa','Victoria','Tlaxcala','Xalapa','Mérida','Zacatecas','Cuauhtémoc']

In [4]:
#Función que establece si una ciudad se encuentra en la capital del estado
def EstaEnCapital(Ciudad):
    for capital in capitales:
        if(Ciudad==capital):
            return 1
    return 0

In [5]:
train['Esta_Capital']=train['ciudad'].apply(EstaEnCapital)

In [6]:
train = train.drop('id', axis=1)
train = train.drop('direccion', axis=1)
train = train.drop('lat', axis=1)
train = train.drop('lng', axis=1)

In [7]:
#Reemplazo los valores nulos por el valor promedio de ese featuer
train[['antiguedad']] = train[['antiguedad']].fillna(value=8.116114)
train[['habitaciones']] = train[['habitaciones']].fillna(value=2.902326)
train[['garages']] = train[['garages']].fillna(value=1.546874)
train[['banos']] = train[['banos']].fillna(value=2.132417)
train[['metroscubiertos']] = train[['metroscubiertos']].fillna(value=174.016774)
train[['metrostotales']] = train[['metrostotales']].fillna(value=176.765145)
train[['idzona']] = train[['idzona']].fillna(value=2141183.096329)

In [8]:
def oneHotEncoding(df, columna):
    one_hot = pd.get_dummies(df[columna])
    df.drop(columna, axis=1, inplace=True)
    df = pd.concat([df, one_hot], axis=1)
    return df

In [9]:
train = oneHotEncoding(train,'provincia')
train = oneHotEncoding(train,'tipodepropiedad')

train = train.drop('Garage', axis=1)
train = train.drop('Hospedaje', axis=1)

In [10]:
#Hacemos la conversión de fechas
train['Fecha']=pd.to_datetime(train['fecha'])
train.drop(columns={'fecha'},inplace=True)

train['Anio']=train['Fecha'].dt.year
train = train.drop('Fecha', axis=1)

In [11]:
target_enc = ce.TargetEncoder(train['ciudad'])

# Fit the encoder using the categorical features and target
target_enc.fit(train['ciudad'], train['precio'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train['ciudad']).add_suffix('_target'))

train = train.drop('ciudad', axis=1)

In [12]:
# Palabras del feature titulo que voy a usar para construir mas features. Fueron sacadas en parte a "ojo".
# Por un lado busqué las 100 palabras mas comunes de las propiedaades mas caras (mayor que 3350000 pesos)(para ver
# el codigo con el que busqué las palabras mas comunes, ver mi otro notebook), luego busqué las 100 palabras mas
# comunes de las propiedades mas baratas (menor a 940000 pesos). Ignoré las palabras que aparecian en ambas listas
# con mas o menos la misma cantidad de veces, y dejé el resto. Luego repetí lo mismo para la descripcion.
columnas_titulo = ['vill','remat','cerc','haciend','plant','infonavit','bosqu','credit','bancari','adjudic','puebl','ecatepec','morel','buen',
'izcalli','cuautitl','guadalup','coacalc','hipotecari','ciud','esquin','iztapalap','apodac','bañ','minut','plaz','juan','acept','vist','mision',
'toluc','precios','tecamac','baj','mexic','ubicacion','mader','colin','habit','francisc','lind','jos','dor','chihuahu','unid','aprovech','pachuc',
'rincon','pis','bien','terren','cot','cuautitlan','fovissst','centric','tlajomulc','resident','estren','club','luj','polanc','cumbr','prevent',
'satelit','jeronim','exclus','interlom','condes','pedregal','huixquiluc','angelopolis','benit','herradur','naucalp','federal','distrit','pedr',
'golf','tlalp','torr','juriquill','verd','modern','alvar','acab','obregon','rom','country','palm','cuajimalp','estil','horizontal','seccion','vigil',
'coyoac','inversion','monterrey','angel','espaci','coyoacan','magnif','campestr','oficin']

In [13]:
for columna in columnas_titulo:
    train[columna+'_titulo']=train.titulo.str.contains(columna,case=False)

In [14]:
columnas_descripcion = ['credit','cerc','acept','infonavit','lav','minut','closet','principal','espaci','escuel','strong','/strong','jardin','bancari',
'cont','comercial','tip','cit','inform','equip','vigil','cas','proteccion','verd','ceram','pag','plaz','parqu','puert','inmuebl','tramit','cuart',
'bien','remat','loset','propied','acab','aven','closets','ciud','residencial','condicion','opcion','traser','list','tranquil','vestidor','terraz',
'vist','estudi','salon','hermos','alberc','visit','cad','ilumin','jueg','mader','desayun','dobl','comun','luj','granit','resident','gard','jacuzzi',
'gimnasi','precios','asador','roof','independient','cubiert','remodel','balcon','cerr','elev','marmol','exclus','lugar','grand','cistern','altur',
'electr','hor']

In [15]:
for columna in columnas_descripcion:
    train[columna+'_descripcion']=train.descripcion.str.contains(columna,case=False)

In [16]:
train = train.fillna(value=0)
train = train.drop('titulo', axis=1)
train = train.drop('descripcion', axis=1)

In [17]:
train_label = []
train_data = []

train_label = train["precio"]
train_data = train.iloc[:, train.columns != 'precio']

del train

In [18]:
test = pd.read_csv('test.csv')

In [19]:
test['Esta_Capital']=test['ciudad'].apply(EstaEnCapital)

In [20]:
test = test.drop('id', axis=1)
test = test.drop('direccion', axis=1)
test = test.drop('lat', axis=1)
test = test.drop('lng', axis=1)

In [21]:
test[['antiguedad']] = test[['antiguedad']].fillna(value=8.116114)
test[['habitaciones']] = test[['habitaciones']].fillna(value=2.902326)
test[['garages']] = test[['garages']].fillna(value=1.546874)
test[['banos']] = test[['banos']].fillna(value=2.132417)
test[['metroscubiertos']] = test[['metroscubiertos']].fillna(value=174.016774)
test[['metrostotales']] = test[['metrostotales']].fillna(value=176.765145)
test[['idzona']] = test[['idzona']].fillna(value=2141183.096329)

In [22]:
test = oneHotEncoding(test,'provincia')
test = oneHotEncoding(test,'tipodepropiedad')

In [23]:
#Hacemos la conversión de fechas
test['Fecha']=pd.to_datetime(test['fecha'])
test.drop(columns={'fecha'},inplace=True)

test['Anio'] = test['Fecha'].dt.year
test = test.drop('Fecha', axis=1)

In [24]:
test = test.join(target_enc.transform(test['ciudad']).add_suffix('_target'))
test = test.drop('ciudad', axis=1)
del target_enc

In [25]:
for columna in columnas_titulo:
    test[columna+'_titulo']=test.titulo.str.contains(columna,case=False)

In [26]:
for columna in columnas_descripcion:
    test[columna+'_descripcion']=test.descripcion.str.contains(columna,case=False)

In [27]:
test = test.fillna(value=0)
test = test.drop('titulo', axis=1)
test = test.drop('descripcion', axis=1)

In [28]:
def write_submission(test_data, prediction, file_output):
    
    archivo_entrada = open(test_data)
    entrada_csv = csv.reader(archivo_entrada)
    next(entrada_csv, None)  # skip the headers

    archivo_salida = open(file_output, 'w')
    submit_csv = csv.writer(archivo_salida)
    submit_csv.writerow(['id', 'target'])

    for reg1, reg2  in zip(entrada_csv, prediction):
        linea = [reg1[0], round(reg2, 2)]
        submit_csv.writerow(linea)
    archivo_salida.close()

In [29]:
#-----------------------------------------------------------------
# Modelo Random Forest Regressor
#-----------------------------------------------------------------
from sklearn.ensemble import RandomForestRegressor

In [30]:
modeloRFR=RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=150)

In [31]:
modeloRFR.fit(train_data, train_label)

result = modeloRFR.predict(test)

In [32]:
write_submission("test.csv", result, "RFR_decripYtitulo.csv")

In [42]:
params = [ {'n_estimators': [10, 50, 100]},]

modeloRFR = RandomForestRegressor(random_state=0, n_jobs=-1)
grid = GridSearchCV(modeloRFR, params, n_jobs=-1)
grid.fit(train_data, train_label)

acc = grid.score(train_data, train_label)
print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
print("[INFO] grid search best parameters: {}".format(grid.best_params_))

[INFO] grid search accuracy: 96.91%
[INFO] grid search best parameters: {'n_estimators': 100}


In [84]:
#-----------------------------------------------------------------
# Modelo GradientBoostingRegressor
#-----------------------------------------------------------------
from sklearn.ensemble import GradientBoostingRegressor

In [15]:
params = [ {'n_estimators': [50, 100, 150]},]

modeloGBR = GradientBoostingRegressor(random_state=0)
grid = GridSearchCV(modeloGBR, params, n_jobs=-1)
grid.fit(train_data, train_label)

acc = grid.score(train_data, train_label)
print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
print("[INFO] grid search best parameters: {}".format(grid.best_params_))

[INFO] grid search accuracy: 66.77%
[INFO] grid search best parameters: {'n_estimators': 150}


In [16]:
modeloGBR=GradientBoostingRegressor(random_state=0, n_estimators=150)

In [17]:
modeloGBR.fit(train_data, train["precio"])

result = modeloGBR.predict(test)

In [18]:
write_submission("test.csv", result, "GBR.csv")

In [19]:
del modeloGBR

In [20]:
#-----------------------------------------------------------------
# Modelo Bagging Regressor
#-----------------------------------------------------------------
from sklearn.ensemble import BaggingRegressor

In [None]:
params = [ {'n_estimators': [10, 50, 100]},]

modeloBG = BaggingRegressor(random_state=0)
grid = GridSearchCV(modeloBG, params, n_jobs=-1)
grid.fit(train_data, train_label)

acc = grid.score(train_data, train_label)
print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
print("[INFO] grid search best parameters: {}".format(grid.best_params_))

In [None]:
modeloBG=BaggingRegressor(random_state=0, n_jobs=-1, n_estimators=150)

In [None]:
modeloBG.fit(train_data, train["precio"])

result = modeloBG.predict(test)

In [None]:
write_submission("test.csv", result, "BG.csv")

In [None]:
del modeloBG

In [11]:
#-----------------------------------------------------------------
# Modelo Multi-layer Perceptron regressor
#-----------------------------------------------------------------
from sklearn.neural_network import MLPRegressor

In [None]:
params = [
    {
        'activation' : ('logistic', 'tanh', 'relu',),
        'solver' : ('lbfgs','adam','sgd',),
        'learning_rate' : ('constant', 'invscaling', 'adaptive',),
    },
]

modeloMLPR = MLPRegressor(random_state=0)
grid = GridSearchCV(modeloMLPR, params, n_jobs=-1)
grid.fit(train_data, train_label)

acc = grid.score(train_data, train_label)
print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
print("[INFO] grid search best parameters: {}".format(grid.best_params_))

In [None]:
modeloMLPR=MLPRegressor(random_state=0, n_estimators=150)

In [None]:
modeloMLPR.fit(train_data, train["precio"])

result = modeloMLPR.predict(test)

In [None]:
write_submission("test.csv", result, "MLPR.csv")