In [1]:
import warnings
warnings.filterwarnings('ignore')

## train y test split

In [12]:
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
path_dataset = '../csv/datos_properati_limpios_model.csv'
df = pd.read_csv(path_dataset)
X = df.drop(['price_aprox_usd'], axis=1)
y = df['price_aprox_usd']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
X_train.shape

(4782, 58)

## Eligiendo un modelo

In [6]:
# Desde scikit learn
from sklearn.linear_model import LinearRegression 
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
from sklearn.linear_model import SGDRegressor 
sgd_reg = SGDRegressor()
sgd_reg.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [8]:
from sklearn.tree import DecisionTreeRegressor 
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [9]:
# Calculamos el rmse en el conjunto de entrenamiento 
lin_reg_predict = lin_reg.predict(X_train)
sgd_reg_predict = sgd_reg.predict(X_train)
tree_reg_predict = tree_reg.predict(X_train)

In [10]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_train, lin_reg_predict)
sgd_mse = mean_squared_error(y_train, sgd_reg_predict)
tree_mse = mean_squared_error(y_train, tree_reg_predict)

In [13]:
print("RMSE Entrenamiento: ", np.sqrt(lin_mse), np.sqrt(sgd_mse), np.sqrt(tree_mse))

RMSE Entrenamiento:  27064.519458431754 1.3494792885467549e+17 444.4881391458601


In [14]:
# Calculamos el rmse en el conjunto de test 
lin_reg_predict = lin_reg.predict(X_test)
sgd_reg_predict = sgd_reg.predict(X_test)
tree_reg_predict = tree_reg.predict(X_test)

In [15]:
lin_mse = mean_squared_error(y_test, lin_reg_predict)
sgd_mse = mean_squared_error(y_test, sgd_reg_predict)
tree_mse = mean_squared_error(y_test, tree_reg_predict)

In [16]:
print("RMSE Test: ", np.sqrt(lin_mse), np.sqrt(sgd_mse), np.sqrt(tree_mse))

RMSE Test:  27482.455263162654 1.6598190487961208e+16 26041.29187603573


## Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score
# Indicamos que queremos hacer cross validation con 10 cortes. 
# Es decir vamos a iterar 10 veces y obtener los scores de cada iteración
scores = cross_val_score(tree_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)

In [18]:
rmse_scores = np.sqrt(-scores)

In [19]:
print("Scores: ", rmse_scores)
print("Promedio: ", rmse_scores.mean())
print("Desvío estandar: ", rmse_scores.std())

Scores:  [25339.22453217 27243.32809096 25223.16724183 26732.70099201
 26947.81912392 23558.49916086 25779.91505229 26263.47506927
 24570.37715809 26466.08160685]
Promedio:  25812.4588028271
Desvío estandar:  1095.8690296754773


In [20]:
scores = cross_val_score(lin_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)


In [21]:
print("Scores: ", rmse_scores)
print("Promedio: ", rmse_scores.mean())
print("Desvío estandar: ", rmse_scores.std())

Scores:  [ 27468.93645461 114554.12196814  28443.67000458  26749.37745925
  25923.27791784  26404.37832903  27493.83976055  27135.11648769
  27972.84446872  27882.4484708 ]
Promedio:  36002.80113211845
Desvío estandar:  26193.67912104112
