# Modelo 01

<hr>
## Cleaning and Processing Machine Learning data

In [44]:
import pandas as pd
import os
import json

In [51]:
# houses = json.load(open('../../EDOMEX_houses.json'))
# apartments = json.load(open('../../EDOMEX_apartments.json'))

In [52]:
# Read the houses and apartments JSON files into two separate DataFrames
df_houses, df_apartments = (
    pd.read_json(source_path)
    for source_path in ('../../EDOMEX_houses.json', '../../EDOMEX_apartments.json')
)

In [77]:
# Stack the houses and apartments rows into a single frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. Row labels from both inputs are kept unchanged (no
# ignore_index), exactly as the previous append call did.
df_complete = pd.concat([df_houses, df_apartments])

In [78]:
# Keep only the rows whose `currency` column equals "MXN"
df_complete = df_complete.loc[df_complete['currency'].eq("MXN")]

## Split data Train and Test

In [87]:
from sklearn.model_selection import train_test_split

# The combined, currency-filtered frame is the full modelling table
X_full = df_complete
# X_test_full = pd.read_csv("")

# Target column: price
y = X_full['price']

# Columns used as model inputs
features = [
    'type_of_prop',
    'Estado',
    'Ciudad',
    'Colonia',
    'Superficie total',
    'Superficie construida',
    'Ambientes',
    'Recamaras',
    'Banos',
    'Estacionamientos',
    'Antiguedad',
    'Cantidad de pisos',
    'Cuota mensual de mantenimiento',
    'Bodegas',
]

X = X_full.loc[:, features]
# X_test = X_test_full[features]

# Hold out 20% of the rows for validation; the seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [80]:
from sklearn.ensemble import RandomForestRegressor

# Candidate random forests to compare; every one shares random_state=0 so the
# comparison is repeatable.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# The 'mae' criterion name was deprecated in scikit-learn 1.0 and removed in
# 1.2; 'absolute_error' selects the same split criterion.
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

## 1) Cleaning

Categorical variables must be transformed before they can be fed to the model. We will use one-hot encoding because the variables we need to transform have no ordinal ranking; for example, we don't need to establish a hierarchy between houses and apartments.

### Categorical Variables

In [81]:
# Collect the name of every object-dtype column in the training features
object_cols = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']

print("Categorical variables:")
print(object_cols)

Categorical variables:
['type_of_prop', 'Estado', 'Ciudad', 'Colonia']


### One Hot Encoding

In [82]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training split only. With handle_unknown='ignore',
# categories that appear only in the validation split become all-zero rows.
# The keyword that requested dense output was renamed across scikit-learn
# releases (`sparse` -> `sparse_output`), so rely on the default sparse output
# and densify explicitly, which works on every release.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_train = OH_encoder.fit_transform(X_train[object_cols]).toarray()
encoded_valid = OH_encoder.transform(X_valid[object_cols]).toarray()

# Give the encoded columns string labels: leaving the default integer labels
# would mix int and str column names after the concat below, which recent
# scikit-learn releases reject when fitting. The original row index is
# restored at the same time so the concat aligns row by row.
OH_col_names = ["onehot_%d" % position for position in range(encoded_train.shape[1])]
OH_cols_train = pd.DataFrame(encoded_train, index=X_train.index, columns=OH_col_names)
OH_cols_valid = pd.DataFrame(encoded_valid, index=X_valid.index, columns=OH_col_names)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


### Goodbye NAs 

In [146]:
# Replace every remaining missing value with 0 in both encoded splits
OH_X_train, OH_X_valid = (split.fillna(0) for split in (OH_X_train, OH_X_valid))

print("No more NAs, now just 0s :D")

No more NAs, now just 0s :D


In [84]:
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=None, X_v=None, y_t=None, y_v=None):
    """Fit ``model`` on the training data and return its validation MAE.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place.
    X_t, X_v : training / validation features. When omitted, the notebook's
        current ``OH_X_train`` / ``OH_X_valid`` are used.
    y_t, y_v : training / validation targets. When omitted, the notebook's
        current ``y_train`` / ``y_valid`` are used.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, model.predict(X_v))``.
    """
    # Resolve the defaults at call time. Binding the frames in the signature
    # would freeze whatever objects existed when this cell ran, so a later
    # cell that rebinds OH_X_train / OH_X_valid would be silently ignored.
    if X_t is None:
        X_t = OH_X_train
    if X_v is None:
        X_v = OH_X_valid
    if y_t is None:
        y_t = y_train
    if y_v is None:
        y_v = y_valid

    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

# Fit and score each candidate in turn, numbering them from 1 in the report
for position, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (position, mae))

Model 1 MAE: 1331317
Model 2 MAE: 1315987
Model 3 MAE: 1277496
Model 4 MAE: 1417299
Model 5 MAE: 1348177


## 2) Cross Validation

Cross-validation is used to make the score measures of our model more reliable by running the same process on different subsets of the data to get multiple scores; in other words, it designs different scenarios so that 100% of the dataset is eventually used as test or validation data. This method should be used if we have a relatively small dataset.

In [92]:
# Creating a Pipeline

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Single-step pipeline whose only stage is a 50-tree random forest
forest_step = ('model', RandomForestRegressor(n_estimators=50, random_state=0))
my_pipeline = Pipeline(steps=[forest_step])

In [137]:
from sklearn.model_selection import cross_val_score

# Names of the object-dtype columns in the full feature frame
object_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

# Encoding Categorical Variables with OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the whole feature frame in one step with pandas
OH_X_pandas = pd.get_dummies(X)

# OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# OH_cols = pd.DataFrame(OH_encoder.fit_transform(X[object_cols]))
# One-hot encoding removed index; put it back
# OH_cols.index = X.index
# Remove categorical columns (will replace with one-hot encoding)
# num_X = X.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
# OH_X = pd.concat([num_X, OH_cols], axis=1)

# Verify if NA cleaning can be inside the pipeline
# OH_X=OH_X.fillna(0)

def crossed_score_model(model, X, y, cv=5):
    """Return the mean cross-validated MAE of ``model`` on ``X`` / ``y``.

    Parameters
    ----------
    model : estimator placed as the single ``'model'`` step of a Pipeline.
    X, y : features and target handed to ``cross_val_score``.
    cv : forwarded unchanged to ``cross_val_score``; defaults to 5, the
        value that used to be hard-coded here.

    Returns
    -------
    The mean of the per-fold mean absolute errors.
    """
    my_pipeline = Pipeline(steps=[('model', model)])

    # Multiply by -1 since sklearn calculates *negative* MAE
    fold_mae = -1 * cross_val_score(my_pipeline, X, y, cv=cv, scoring='neg_mean_absolute_error')
    return fold_mae.mean()

# Cross-validate each candidate on the pandas-encoded frame, numbering from 1
for position, candidate in enumerate(models, start=1):
    cross_score = crossed_score_model(candidate, OH_X_pandas, y)
    print("Model %d Avg Cross Score: %d" % (position, cross_score))
    

Model 1 Avg Cross Score: 1628782
Model 2 Avg Cross Score: 1639069
Model 3 Avg Cross Score: 1640996
Model 4 Avg Cross Score: 1611346
Model 5 Avg Cross Score: 1644486


## 3) XG Boost

Gradient boosting is a method that goes through cycles to iteratively add models into an ensemble.

It begins by initializing the ensemble with a single model, whose predictions can be pretty naive. (Even if its predictions are wildly inaccurate, subsequent additions to the ensemble will address those errors.)

In [145]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Gradient-boosted regressor trained on the one-hot encoded training split.
# random_state is pinned to 0, like the random-forest models in this notebook.
xg_model = XGBRegressor(random_state=0)
xg_model.fit(OH_X_train, y_train)

# Get predictions
predictions = xg_model.predict(OH_X_valid)

# mean_absolute_error is documented as (y_true, y_pred); MAE is symmetric, so
# the value is unchanged, but the call now follows the documented order.
mae = mean_absolute_error(y_valid, predictions)

print(mae)

1473643.119047619


In [148]:
print("MAE still very high :(")

MAE still very high :(
