In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('../data/train.csv')

print('Total Original: ', df_train.shape[0])

Total Original:  240000


### Feature Engineering

In [3]:
# Agrego datos de densidad poblacional y criminalidad a las provincias

provincia_info = {
    'provincia': ["Aguascalientes", "Baja California Norte", "Baja California Sur", "Campeche", "Coahuila", "Colima", "Chiapas", "Chihuahua", "Distrito Federal", "Durango", "Guanajuato", "Guerrero", "Hidalgo", "Jalisco", "Edo. de México", "Michoacán", "Morelos", "Nayarit", "Nuevo León", "Oaxaca", "Puebla", "Querétaro", "Quintana Roo", "San luis Potosí", "Sinaloa", "Sonora", "Tabasco", "Tamaulipas", "Tlaxcala", "Veracruz", "Yucatán", "Zacatecas"],
    'densidadpoblacional': [233.7, 46.4, 9.6, 15.6, 19.5, 126.4, 71.2, 14.4, 5967.3, 14.2, 191.3, 55.6, 137.3, 99.8, 724.2, 78.2, 390.2, 42.4, 79.8, 42.3, 179.8, 174.4, 33.6, 44.5, 51.7, 15.9, 96.9, 42.9, 318.4, 113.0, 53.1, 21.0],
    'criminalidad': [35.457, 32.758, 25.577, 22.114, 24.800, 27.045, 16.687, 31.274, 52.718, 25.640, 33.154, 53.875, 21.159, 49.317, 56.835, 23.876, 43.419, 21.288, 26.221, 24.961, 27.530, 30.991, 35.639, 25.838, 22.750, 40.466, 30.409, 21.363, 30.699, 22.157, 25.862, 21.501]
}

df_provincia_info = pd.DataFrame(provincia_info, columns = ['provincia', 'densidadpoblacional', 'criminalidad'])

df_train = pd.merge(df_train, df_provincia_info, on = "provincia")


# Creo la columna comodidades usando el campo descripcion

#from collections import Counter
#Counter(" ".join(df_train["descripcion"].apply(lambda x: str(x))).split()).most_common(100)

def amenitiesCounter(descripcion):
    words = descripcion.lower().split()
    
    amenities = [
        'estufa' in words or 'estufas' in words,
        'aire' in words and 'acondicionado' in words,
        'balcón' in words or 'balcon' in words,
        'asador' in words,
        'jardin' in words or 'jardín' in words,
        'jacuzzi' in words,
        'amplio' in words or 'amplia' in words,
        'sol' in words,
        'vista' in words,
        'mueble' in words or 'muebles' in words,
        'vestidor' in words,
        'vigilancia' in words,
        'cisterna' in words
    ]
        
    return len([ x for x in amenities if x])
    
df_train['comodidades'] = df_train['descripcion'].apply(lambda x: amenitiesCounter(str(x)))


print("Porcentaje de inmuebles con alguna comodidad: ", df_train[df_train['comodidades'] != 0].dropna().shape[0] / df_train.shape[0])

Porcentaje de inmuebles con alguna comodidad:  0.09301423836227564


### Segmentacion

In [4]:
# Filtro por tipo de propiedad
df_train = df_train[df_train['tipodepropiedad'] == 'Casa'].dropna()

print('Solo casas: ', df_train.shape[0])

Solo casas:  32816


In [5]:
#df_train.provincia.value_counts()

In [6]:
# Filtro por provincias"
#provincias = ["Aguascalientes", "Baja California Norte", "Baja California Sur", "Campeche", "Coahuila", "Colima", "Chiapas", "Chihuahua", "Distrito Federal", "Durango", "Guanajuato", "Guerrero", "Hidalgo", "Jalisco", "Edo. de México", "Michoacán", "Morelos", "Nayarit", "Nuevo León", "Oaxaca", "Puebla", "Querétaro", "Quintana Roo", "San luis Potosí", "Sinaloa", "Sonora", "Tabasco", "Tamaulipas", "Tlaxcala", "Veracruz", "Yucatán", "Zacatecas"]
provincias = ['Colima', 'Michoacán', 'Guerrero', 'Oaxaca', 'Chiapas']
df_train = df_train[df_train['provincia'].isin(provincias)]

print('Algunas provincias: ', df_train.shape[0])

Algunas provincias:  899


In [7]:
# Filtro por precio
lower = np.percentile(df_train['precio'], 10)
upper = np.percentile(df_train['precio'], 90)

df_train = df_train[df_train['precio'].between(lower, upper)]

print('Precios acotados: ', df_train.shape[0])

Precios acotados:  727


In [8]:
# Filtro por metros
lower = np.percentile(df_train['metrostotales'], 10)
upper = np.percentile(df_train['metrostotales'], 90)

df_train = df_train[df_train['metrostotales'].between(lower, upper)]

print('Metros totales acotados: ', df_train.shape[0])

Metros totales acotados:  596


In [9]:
# Filtro por habitaciones
df_train = df_train[df_train['habitaciones'].between(1, 4)]

print('Densidad poblacional acotada: ', df_train.shape[0])

Densidad poblacional acotada:  569


In [10]:
# Filtro por antiguedad
df_train = df_train[df_train['antiguedad'].between(0, 20)]

print('Antiguedad acotada: ', df_train.shape[0])

Antiguedad acotada:  558


In [11]:
print("Porcentaje de inmuebles segmentados con alguna comodidad: ", df_train[df_train['comodidades'] != 0].dropna().shape[0] / df_train.shape[0])

Porcentaje de inmuebles segmentados con alguna comodidad:  0.48028673835125446


### Creo CSVs con los datos a usar en los experimentos

In [19]:
#Segmento por antiguedad
middle = np.percentile(df_train['antiguedad'], 50)

df_train_new_houses = df_train[df_train['antiguedad'].between(0, middle)]
df_train_old_houses = df_train[df_train['antiguedad'].between(middle, 20)]

df_train_new_houses.sample(frac=1, random_state=1).to_csv('../data/train-new-houses.csv', index=False)
df_train_old_houses.sample(frac=1, random_state=1).to_csv('../data/train-old-houses.csv', index=False)

In [20]:
df_train.sample(frac=1, random_state=1).to_csv('../data/train-all-houses.csv', index=False)