In [15]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import warnings
# Silence every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')

In [16]:
import pandas as pd
# Show full cell contents (the long 'features' lists further down) without
# truncation. None is the supported "no limit" value since pandas 1.0; the old
# -1 sentinel was deprecated there and is no longer accepted by newer releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # pandas < 1.0 only accepts integers for this option; -1 meant "no limit".
    pd.set_option('display.max_colwidth', -1)
import numpy as np

In [17]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style = "whitegrid")

In [18]:
from sklearn.preprocessing import StandardScaler, LabelBinarizer, MinMaxScaler, PolynomialFeatures 
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
def pred_vs_test(y_test, y_pred, r2):
    """Plot predictions against true values on the current matplotlib axes.

    Draws ``y_test`` against itself as a grey dash-dot reference line (the
    locus of perfect predictions), overlays the ``(y_pred, y_test)`` pairs as
    red ``+`` markers on top of it, and labels both axes.

    Parameters
    ----------
    y_test : true target values (y axis of the scatter, both axes of the line).
    y_pred : predicted values (x axis of the scatter).
    r2 : score embedded in the x-axis label, formatted with three decimals.
    """
    reference_style = dict(c='grey')
    marker_style = dict(s=30, c='r', marker='+', zorder=10)

    # Reference diagonal first, then the predictions (zorder keeps them on top).
    plt.plot(y_test, y_test, '-.', **reference_style)
    plt.scatter(y_pred, y_test, **marker_style)

    x_label = "Predicciones / R2 ={:.3f}".format(r2)
    plt.xlabel(x_label)
    plt.ylabel("Valores reales")

#### Lectura del dataset

In [19]:
# Read the listings CSV and drop the listed columns in a single chained
# expression, then preview the first two rows.
df = (
    pd.read_csv('palermo_deptos_final.csv')
    .drop(columns=['Unnamed: 0', 'id', 'expensas', 'piso', 'descripcion', 'geometry', 'cubierta'])
)
df.head(2)

Unnamed: 0,superficie,precio,ambientes,pileta,seguridad,parrilla,balcon,cochera,dist_comisarias,dist_subte,dist_trenes,dist_metrobus,antiguedad,barrio
0,104.0,3365.384615,3,0,0,0,1,0,719.300376,991.604946,794.459912,1500.489635,usado,Norte
1,68.0,3455.882353,2,1,0,0,0,0,980.716124,906.891789,260.100239,876.358223,estrenar,Las Cañitas


### Agregar variables dummy - Discretización

In [20]:
# One-hot encode the two categorical columns as integer 0/1 indicators.
# drop_first=True omits the first category of each column, which becomes the
# implicit baseline level.
barrio_dummy, antiguedad_dummy = (
    pd.get_dummies(df[column], drop_first=True, dtype='int')
    for column in ('barrio', 'antiguedad')
)

In [21]:
# Append both indicator blocks to the right of the existing columns
# (barrio indicators first, then antiguedad) in one concatenation.
df = pd.concat([df, barrio_dummy, antiguedad_dummy], axis=1)

In [22]:
# The original categorical columns are now represented by their indicator
# columns, so remove them from the frame.
df = df.drop(columns=['barrio', 'antiguedad'])

In [23]:
# Sanity check after encoding: preview two rows, print dtypes / non-null
# counts, and list the final column names.
df.head(2)
df.info()
df.columns

Unnamed: 0,superficie,precio,ambientes,pileta,seguridad,parrilla,balcon,cochera,dist_comisarias,dist_subte,...,Freud,Hollywood,Las Cañitas,Norte,Nuevo,Pacifico,Soho,Zoo,pozo,usado
0,104.0,3365.384615,3,0,0,0,1,0,719.300376,991.604946,...,0,0,0,1,0,0,0,0,0,1
1,68.0,3455.882353,2,1,0,0,0,0,980.716124,906.891789,...,0,0,1,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1770 entries, 0 to 1769
Data columns (total 25 columns):
superficie         1770 non-null float64
precio             1770 non-null float64
ambientes          1770 non-null int64
pileta             1770 non-null int64
seguridad          1770 non-null int64
parrilla           1770 non-null int64
balcon             1770 non-null int64
cochera            1770 non-null int64
dist_comisarias    1770 non-null float64
dist_subte         1770 non-null float64
dist_trenes        1770 non-null float64
dist_metrobus      1770 non-null float64
Botanico           1770 non-null int64
Chico              1770 non-null int64
Falso Belgrano     1770 non-null int64
Freud              1770 non-null int64
Hollywood          1770 non-null int64
Las Cañitas        1770 non-null int64
Norte              1770 non-null int64
Nuevo              1770 non-null int64
Pacifico           1770 non-null int64
Soho               1770 non-null int64
Zoo                1770

Index(['superficie', 'precio', 'ambientes', 'pileta', 'seguridad', 'parrilla',
       'balcon', 'cochera', 'dist_comisarias', 'dist_subte', 'dist_trenes',
       'dist_metrobus', 'Botanico', 'Chico', 'Falso Belgrano', 'Freud',
       'Hollywood', 'Las Cañitas', 'Norte', 'Nuevo', 'Pacifico', 'Soho', 'Zoo',
       'pozo', 'usado'],
      dtype='object')

In [24]:
# Rank every feature by the absolute value of its correlation with the target.
# 'precio' is removed by label rather than by slicing off the first sorted
# entry: positional slicing only works while the target's self-correlation
# sorts strictly first, which a feature tied at |r| = 1.0 could break.
corr_mtx = df.corr()
features = corr_mtx['precio'].drop('precio').abs().sort_values(ascending=False)
features

pileta             0.400525
cochera            0.365194
seguridad          0.301327
Nuevo              0.266726
dist_trenes        0.179184
parrilla           0.158649
Freud              0.152822
dist_comisarias    0.138299
Soho               0.126024
Las Cañitas        0.115540
Falso Belgrano     0.104782
Norte              0.095903
usado              0.095859
Pacifico           0.091958
superficie         0.089334
Chico              0.084194
Zoo                0.070561
ambientes          0.068026
balcon             0.066479
pozo               0.037106
Botanico           0.036671
dist_metrobus      0.031962
Hollywood          0.029600
dist_subte         0.004390
Name: precio, dtype: float64

In [25]:
# Reordeno columnas en orden descendente por corr con 'precio'
X = df.loc[:, features.index]
y = df['precio']

### Evaluación modelo RidgeRegression

In [26]:
def evaluate_rg(X, y, degree=1, scaling='standard', splits=5, alphas=None, random_state=18):
    """
    Fit a cross-validated Ridge regression and score it on a held-out test split.

    Pipeline: train/test split -> feature scaling (fitted on the training part
    only) -> polynomial expansion -> RidgeCV over ``alphas`` using a shuffled
    K-fold -> metrics computed on the test part.

    inputs:
    X = DataFrame of features (its column names are stored in the result)
    y = target values
    degree = degree for the Polynomial Features
    scaling = 'minmax' selects MinMaxScaler; any other value selects StandardScaler
    splits = number of K-fold splits used by RidgeCV
    alphas = iterable of alphas to test. None falls back to scikit-learn's
             documented RidgeCV default grid (0.1, 1.0, 10.0)
    random_state = seed shared by the train/test split and the K-fold shuffle

    output:
    dict with the test scores ('r2', 'mae', 'rsme'), the run settings
    ('features', 'degree', 'scaling'), the selected 'alpha' and the fitted
    'intercept' and 'coefficients'
    """

    # RidgeCV has no "None" mode for alphas, so passing None through would make
    # the fit fail; substitute the library's own default grid instead.
    if alphas is None:
        alphas = (0.1, 1.0, 10.0)

    # split Train-Test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    # scaling: the scaler is fitted on the training rows only and then applied
    # unchanged to the test rows, so no test information leaks into the fit
    scaler = MinMaxScaler() if scaling == 'minmax' else StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_scaled_poly = poly.fit_transform(X_train_scaled)
    X_test_scaled_poly = poly.transform(X_test_scaled)

    # shuffled K-fold used by RidgeCV to choose alpha
    kf = KFold(n_splits=splits, shuffle=True, random_state=random_state)

    # train model
    model = RidgeCV(alphas=alphas, cv=kf)
    model.fit(X_train_scaled_poly, y_train)

    # predict on the held-out rows
    y_pred = model.predict(X_test_scaled_poly)

    # Results of this run. The 'rsme' key (root of the mean squared error) keeps
    # its original spelling so existing consumers of the dict keep working.
    results = {
                'r2': r2_score(y_test, y_pred),
                'mae': mean_absolute_error(y_test, y_pred),
                'rsme': np.sqrt(mean_squared_error(y_test, y_pred)),
                'features': X.columns.values,
                'degree': degree,
                'scaling': scaling,
                'alpha': model.alpha_,
                'intercept': model.intercept_,
                'coefficients': model.coef_,
              }
    return results

In [52]:
# Evaluate every (scaling, degree) combination over the same alpha grid and
# collect one result dict per run.
ridge_models = []
# Alphas 0.1, 0.2, ..., 299.9. Zero is left out on purpose: alpha = 0 means no
# regularisation at all (plain least squares), which is ill-conditioned on the
# degree-2 expansion, and RidgeCV documents its alphas as positive values.
alphas_to_check = [x/10.0 for x in range(1, 3000)]
for scal in ['standard', 'minmax']:
    for deg in [1, 2]:
        res = evaluate_rg(X, y, degree=deg, scaling=scal, alphas=alphas_to_check)
        ridge_models.append(res)
ridge_models_df = pd.DataFrame(ridge_models)

In [53]:
# Keep only the summary columns and order the runs from highest to lowest test R2.
best_ridge_models = (
    ridge_models_df[['r2', 'features', 'degree', 'alpha', 'scaling']]
    .sort_values(by='r2', ascending=False)
)

In [54]:
# Show the top of the ranking (head() returns at most the first five rows).
best_ridge_models.head()

Unnamed: 0,r2,features,degree,alpha,scaling
1,0.468549,"[pileta, cochera, seguridad, Nuevo, dist_trenes, parrilla, Freud, dist_comisarias, Soho, Las Cañitas, Falso Belgrano, Norte, usado, Pacifico, superficie, Chico, Zoo, ambientes, balcon, pozo, Botanico, dist_metrobus, Hollywood, dist_subte]",2,299.9,standard
3,0.418986,"[pileta, cochera, seguridad, Nuevo, dist_trenes, parrilla, Freud, dist_comisarias, Soho, Las Cañitas, Falso Belgrano, Norte, usado, Pacifico, superficie, Chico, Zoo, ambientes, balcon, pozo, Botanico, dist_metrobus, Hollywood, dist_subte]",2,8.2,minmax
0,0.341004,"[pileta, cochera, seguridad, Nuevo, dist_trenes, parrilla, Freud, dist_comisarias, Soho, Las Cañitas, Falso Belgrano, Norte, usado, Pacifico, superficie, Chico, Zoo, ambientes, balcon, pozo, Botanico, dist_metrobus, Hollywood, dist_subte]",1,33.4,standard
2,0.339006,"[pileta, cochera, seguridad, Nuevo, dist_trenes, parrilla, Freud, dist_comisarias, Soho, Las Cañitas, Falso Belgrano, Norte, usado, Pacifico, superficie, Chico, Zoo, ambientes, balcon, pozo, Botanico, dist_metrobus, Hollywood, dist_subte]",1,1.0,minmax
