# Modelo 01

<hr>
## Cleaning and Processing Machine Learning data

In [44]:
import pandas as pd
import os
import json


In [51]:
# Disabled alternative loader: parses the two raw JSON files with the stdlib
# `json` module. The next cell reads the same files with pd.read_json instead.
# houses = json.load(open('../../EDOMEX_houses.json'))
# apartments = json.load(open('../../EDOMEX_apartments.json'))

In [52]:
# Read the two EDOMEX JSON files (houses, then apartments) into separate DataFrames.
df_houses, df_apartments = (
    pd.read_json(json_path)
    for json_path in ('../../EDOMEX_houses.json', '../../EDOMEX_apartments.json')
)

In [77]:
# Stack houses and apartments into a single frame (houses first).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append's defaults, each frame keeps its own
# row labels, so labels may repeat across the two sources.
df_complete = pd.concat([df_houses, df_apartments])
# df_complete

In [78]:
# Keep only the rows whose `currency` is "MXN" so every price is in one currency.
is_mxn = df_complete['currency'].eq("MXN")
df_complete = df_complete.loc[is_mxn]

### Split data into training and validation sets

In [79]:
from sklearn.model_selection import train_test_split

# Start from the combined, currency-filtered table.
X_full = df_complete
# NOTE: no separate test file is loaded in this notebook; only a
# train/validation split is produced below.

# Target: the `price` column.
y = X_full['price']

# Predictor columns handed to the models.
features = [
    'type_of_prop',
    'Estado',
    'Ciudad',
    'Colonia',
    'Superficie total',
    'Superficie construida',
    'Ambientes',
    'Recamaras',
    'Banos',
    'Estacionamientos',
    'Antiguedad',
    'Cantidad de pisos',
    'Cuota mensual de mantenimiento',
    'Bodegas',
]

X = X_full[features]

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [80]:
import sklearn
from sklearn.ensemble import RandomForestRegressor

# The absolute-error split criterion was spelled 'mae' before scikit-learn 1.0.
# It was renamed to 'absolute_error' in 1.0 and the old spelling was removed in
# 1.2, so choose whichever spelling the installed version accepts.
MAE_CRITERION = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mae'

# Candidate random forests to compare; all share random_state=0 for repeatable fits.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion=MAE_CRITERION, random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Order matters: the scoring loop reports results as "Model 1" .. "Model 5".
models = [model_1, model_2, model_3, model_4, model_5]

### Categorical Variables

In [81]:
# Collect the names of the training columns whose dtype is 'object';
# these are treated as the categorical variables.
object_cols = [
    col_name
    for col_name, col_dtype in X_train.dtypes.items()
    if col_dtype == 'object'
]

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['type_of_prop', 'Estado', 'Ciudad', 'Colonia']


### One Hot Encoding

In [82]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns, fitting on the training split only.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the constructor keyword for dense output was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was removed
# in 1.4, so avoiding the keyword keeps this cell working across versions.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_train_dense = OH_encoder.fit_transform(X_train[object_cols]).toarray()
OH_valid_dense = OH_encoder.transform(X_valid[object_cols]).toarray()

# Name every encoded column "<column>_<category>". Leaving pandas' default
# integer labels would mix int and str column names after the concat below,
# which scikit-learn 1.2+ rejects when an estimator is fitted on the frame.
OH_col_names = [
    f"{col}_{category}"
    for col, categories in zip(object_cols, OH_encoder.categories_)
    for category in categories
]

# Rebuild DataFrames on the original row labels so they line up with the
# numeric columns when concatenated side by side.
OH_cols_train = pd.DataFrame(OH_train_dense, index=X_train.index, columns=OH_col_names)
OH_cols_valid = pd.DataFrame(OH_valid_dense, index=X_valid.index, columns=OH_col_names)

# Drop the raw categorical columns; the encoded columns replace them.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Final feature matrices: remaining columns followed by the one-hot columns.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


### Goodbye NAs 

In [83]:
# Replace every missing value with 0 in both splits so neither feature
# matrix contains NaN.
OH_X_train, OH_X_valid = (
    split_frame.fillna(0) for split_frame in (OH_X_train, OH_X_valid)
)

print("No more NAs :D")

No more NAs :D


In [84]:
from sklearn.metrics import mean_absolute_error

# Helper used to compare the candidate models on the validation split
def score_model(model, X_t=OH_X_train, X_v=OH_X_valid, y_t=y_train, y_v=y_valid):
    """Fit `model` on the training data and return its validation MAE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_t, y_t : training features and target.
    X_v, y_v : validation features and target.

    Returns
    -------
    The mean absolute error between ``y_v`` and the model's predictions on ``X_v``.

    Notes
    -----
    The default arguments are bound when this ``def`` is executed, so this
    cell must be re-run after the preprocessing cells for the defaults to
    pick up fresh data. The model is fitted in place.
    """
    model.fit(X_t, y_t)
    validation_predictions = model.predict(X_v)
    return mean_absolute_error(y_v, validation_predictions)

# Fit and score each candidate in list order; %d prints the MAE truncated to an integer.
for model_number, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (model_number, mae))

Model 1 MAE: 1331317
Model 2 MAE: 1315987
Model 3 MAE: 1277496
Model 4 MAE: 1417299
Model 5 MAE: 1348177
