In [2]:
import numpy as np
import pandas as pd
import csv
from time import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from datetime import datetime
import matplotlib.pyplot as plt
import category_encoders as ce
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [3]:
train = pd.read_csv('train.csv')

In [4]:
#Capitales de los Estados de México
capitales=['Aguascalientes','Mexicali','La Paz','Campeche','Tuxtla Gutiérrez','Chihuahua','Saltillo','Colima','Durango','Guanajuato','Chilpancingo de los Bravo','Pachuca','Guadalajara','Toluca','Morelia','Cuernavaca','Tepic','Monterrey','Oaxaca de Juárez','Puebla','Querétaro','Chetumal','San Luis Potosí','Culiacán','Hermosillo','Villahermosa','Victoria','Tlaxcala','Xalapa','Mérida','Zacatecas','Cuauhtémoc']

In [5]:
#Función que establece si una ciudad se encuentra en la capital del estado
def EstaEnCapital(Ciudad):
    for capital in capitales:
        if(Ciudad==capital):
            return 1
    return 0

In [6]:
train['Esta_Capital']=train['ciudad'].apply(EstaEnCapital)

In [7]:
train = train.drop('id', axis=1)
train = train.drop('descripcion', axis=1)
train = train.drop('direccion', axis=1)
train = train.drop('lat', axis=1)
train = train.drop('lng', axis=1)

In [8]:
train[['antiguedad']] = train[['antiguedad']].fillna(value=8.116114)
train[['habitaciones']] = train[['habitaciones']].fillna(value=2.902326)
train[['garages']] = train[['garages']].fillna(value=1.546874)
train[['banos']] = train[['banos']].fillna(value=2.132417)
train[['metroscubiertos']] = train[['metroscubiertos']].fillna(value=174.016774)
train[['metrostotales']] = train[['metrostotales']].fillna(value=176.765145)
train[['idzona']] = train[['idzona']].fillna(value=2141183.096329)

In [9]:
def oneHotEncoding(df, columna):
    one_hot = pd.get_dummies(df[columna])
    df.drop(columna, axis=1, inplace=True)
    df = pd.concat([df, one_hot], axis=1)
    return df

In [10]:
train = oneHotEncoding(train,'provincia')
train = oneHotEncoding(train,'tipodepropiedad')

train = train.drop('Garage', axis=1)
train = train.drop('Hospedaje', axis=1)

In [11]:
#Hacemos la conversión de fechas
train['Fecha']=pd.to_datetime(train['fecha'])
train.drop(columns={'fecha'},inplace=True)

train['Anio']=train['Fecha'].dt.year
train = train.drop('Fecha', axis=1)

In [12]:
target_enc = ce.TargetEncoder(train['ciudad'])

# Fit the encoder using the categorical features and target
target_enc.fit(train['ciudad'], train['precio'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train['ciudad']).add_suffix('_target'))

train = train.drop('ciudad', axis=1)

In [13]:
trainAux = train.copy()
trainAux.dropna(subset=["titulo"],inplace=True)

titulos = trainAux["titulo"]
del trainAux

In [14]:
spanish_stopwords = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')

tipos = ['casa','apartamento','condominio','terreno','local','oficina','bodega','edificio','comercial','quinta','duplex','rancho','departamento','nave','industrial','lote','hospedaje','garage','venta','vende','mexico','dept','departamento','Casa','dept.','departamentos','Mexico','vendo','depto','depto.']
spanish_stopwords.extend(tipos)

In [15]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [16]:
palabras=[]
for row in titulos:
    row =row.lower()
    row=filter(None, re.split("[, \-!?:._¡¿'`<>;()+]", row))
    for palabra in row:
        if len(palabra)<4: #saco conectores
            continue
        if palabra not in spanish_stopwords:
            palabras.append(palabra)

In [17]:
stems = stem_tokens(palabras, stemmer)
contador = Counter(stems)
palabrasComunes = contador.most_common(60)

In [18]:
columnas_nuevas = []
for elemento in palabrasComunes:
    columnas_nuevas.append(elemento[0])

In [19]:
del titulos
del palabras
del contador
del palabrasComunes

In [20]:
for columna in columnas_nuevas:
    train[columna]=train.titulo.str.lower().str.contains(columna)

In [21]:
train = train.fillna(value=0)
train = train.drop('titulo', axis=1)

In [22]:
train_label = []
train_data = []

train_label = train.iloc[:, 12]
train_data = train.iloc[:, train.columns != 'precio']

In [23]:
test = pd.read_csv('test.csv')
IDS=test[['id']].copy()

In [24]:
test['Esta_Capital']=test['ciudad'].apply(EstaEnCapital)

In [25]:
test = test.drop('id', axis=1)
test = test.drop('descripcion', axis=1)
test = test.drop('direccion', axis=1)
test = test.drop('lat', axis=1)
test = test.drop('lng', axis=1)

In [26]:
test[['antiguedad']] = test[['antiguedad']].fillna(value=8.116114)
test[['habitaciones']] = test[['habitaciones']].fillna(value=2.902326)
test[['garages']] = test[['garages']].fillna(value=1.546874)
test[['banos']] = test[['banos']].fillna(value=2.132417)
test[['metroscubiertos']] = test[['metroscubiertos']].fillna(value=174.016774)
test[['metrostotales']] = test[['metrostotales']].fillna(value=176.765145)
test[['idzona']] = test[['idzona']].fillna(value=2141183.096329)

In [27]:
test = oneHotEncoding(test,'provincia')
test = oneHotEncoding(test,'tipodepropiedad')

In [28]:
#Hacemos la conversión de fechas
test['Fecha']=pd.to_datetime(test['fecha'])
test.drop(columns={'fecha'},inplace=True)

test['Anio'] = test['Fecha'].dt.year
test = test.drop('Fecha', axis=1)

In [29]:
test = test.join(target_enc.transform(test['ciudad']).add_suffix('_target'))
test = test.drop('ciudad', axis=1)

In [30]:
for columna in columnas_nuevas:
    test[columna]=test.titulo.str.lower().str.contains(columna)

In [31]:
test = test.fillna(value=0)
test = test.drop('titulo', axis=1)

In [32]:
def write_submission(test_data, prediction, file_output):
    
    archivo_entrada = open(test_data)
    entrada_csv = csv.reader(archivo_entrada)
    next(entrada_csv, None)  # skip the headers

    archivo_salida = open(file_output, 'w')
    submit_csv = csv.writer(archivo_salida)
    submit_csv.writerow(['id', 'target'])

    for reg1, reg2  in zip(entrada_csv, prediction):
        linea = [reg1[0], round(reg2, 2)]
        submit_csv.writerow(linea)
    archivo_salida.close()

In [33]:
#DECISION TREE

In [34]:
tree = DecisionTreeRegressor(criterion='mse',random_state=1)

In [35]:
tree.fit(train_data, train_label)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

In [36]:
prediccion_tree_validacion=tree.predict(test)

#print('Mean Absolute Error: ',mean_absolute_error(test['precio'],prediccion_tree_validacion))

In [37]:
entregable_DTR=IDS[['id']].copy()
entregable_DTR['target']=prediccion_tree_validacion
entregable_DTR.set_index('id',inplace=True)
entregable_DTR['target']=entregable_DTR['target'].round()
entregable_DTR.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,6900000.0
51775,949000.0
115253,2615000.0
299321,1150000.0
173570,580000.0


In [38]:
entregable_DTR.to_csv('DTR01.csv')

In [39]:
params = [ {'n_estimators': [250, 280, 300, 350]},]

modeloRF = RandomForestRegressor(random_state=0, n_jobs=-1)
grid = GridSearchCV(modeloRF, params, n_jobs=-1)
grid.fit(train_data, train_label)

acc = grid.score(train_data, train_label)
print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
print("[INFO] grid search best parameters: {}".format(grid.best_params_))



[INFO] grid search accuracy: 97.35%
[INFO] grid search best parameters: {'n_estimators': 300}


In [40]:
random = RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=300)

In [41]:
random.fit(train_data, train_label)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
                      oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [42]:
prediccion_random_validacion=random.predict(test)

#print('Mean Absolute Error: ',mean_absolute_error(precio_test,prediccion_random_validacion))

In [43]:
entregable_random=IDS[['id']].copy()
entregable_random['target']=prediccion_random_validacion
entregable_random.set_index('id',inplace=True)
entregable_random['target']=entregable_random['target'].round()
entregable_random.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,6641800.0
51775,950275.0
115253,2446691.0
299321,1430394.0
173570,574946.0


In [44]:
entregable_random.to_csv('RANDOMF_01.csv')