In [1]:
import pandas as pd
import numpy as np

## Regresión capital

In [2]:
def load_dataset(orig = False):
    """Read one of the two Properati CSV files from the working directory.

    Parameters
    ----------
    orig : bool, default False
        When truthy, read "properati.csv"; otherwise read
        "properati_clean.csv".

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the selected file.
    """
    filename = "properati.csv" if orig else "properati_clean.csv"
    return pd.read_csv(filename)

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

import seaborn as sns
import matplotlib.pyplot as plt

# Two "coolwarm" colors, used by the train-vs-test bar plots in train_test_error.
palette = sns.color_palette(palette = "coolwarm", n_colors = 2)

def train_test_error(dicotomical, categorical, numerical, target, Estimator = LinearRegression, random_state = None):
    """Fit a regression on Capital Federal rows and report train/test errors.

    The cleaned dataset is loaded with ``load_dataset()`` and restricted to
    rows whose ``state_name`` is 'Capital Federal'. Dichotomous and categorical
    columns are one-hot encoded (first level dropped), numerical columns are
    standardized with a scaler fitted on the training rows only, and the
    train/test split is stratified by ``place_name``.

    Parameters
    ----------
    dicotomical : list of str
        Names of two-valued columns to encode as dummies.
    categorical : list of str
        Names of multi-valued columns to encode as dummies.
    numerical : list of str
        Names of numeric columns; they are standardized before fitting.
    target : str
        Name of the column to predict.
    Estimator : callable, default LinearRegression
        Called with no arguments to build the model; the result must expose
        ``fit`` and ``predict``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split (and therefore the same metrics) on every run.

    Returns
    -------
    None
        R2, MAE and MSE for both partitions are printed, and one bar plot per
        metric is shown.

    Notes
    -----
    Rows with a missing value in any feature, the target or ``place_name`` are
    dropped before splitting. The stratified split is expected to fail in
    scikit-learn if some ``place_name`` is left with a single row.
    """
    # Filter to only check for capital federal
    cluster = 'place_name'
    data = load_dataset()
    data = data.loc[data.state_name == 'Capital Federal']

    # Append dummy columns, recording the names get_dummies actually produced.
    # Guessing them afterwards (name + '_True', or a substring search over all
    # columns) breaks for 0/1-coded or single-valued flags and can pick up
    # unrelated or duplicated columns.
    dummy_cols = []
    for column in dicotomical + categorical:
        dummies = pd.get_dummies(data[column], prefix = column, drop_first = True)
        dummy_cols.extend(dummies.columns)
        data = data.join(dummies)

    columns_to_scale = list(numerical)
    feature_cols = dummy_cols + columns_to_scale

    data = data.loc[:, feature_cols + [target, cluster]].dropna()
    index_train, index_test = train_test_split(
        data.index,
        stratify = data[cluster],
        random_state = random_state
    )

    # Fit the scaler on the training rows only so that no test-set statistics
    # leak into the features; then transform every row.
    scaled_columns = [column + '_scaled' for column in columns_to_scale]
    if columns_to_scale:
        scaler = StandardScaler()
        scaler.fit(data.loc[index_train, columns_to_scale])
        scaled_matrix = scaler.transform(data.loc[:, columns_to_scale])
        scaled_dataframe = pd.DataFrame(scaled_matrix, columns = scaled_columns, index = data.index)
        data = data.join(scaled_dataframe)

    feature_cols_scaled = dummy_cols + scaled_columns

    X = data.loc[:, feature_cols_scaled]
    y = data.loc[:, [target]]

    X_train, X_test = X.loc[index_train], X.loc[index_test]
    y_train, y_test = y.loc[index_train], y.loc[index_test]

    model = Estimator()
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    train_mae = mae(y_train, y_pred_train)
    test_mae = mae(y_test, y_pred_test)

    train_mse = mse(y_train, y_pred_train)
    test_mse = mse(y_test, y_pred_test)

    print("Train R2:", train_r2)
    print("Train MAE:", train_mae)
    print("Train MSE:", train_mse)
    print()
    print("Test R2:", test_r2)
    print("Test MAE:", test_mae)
    print("Test MSE:", test_mse)

    # One figure per metric; every figure is shown explicitly so the last one
    # behaves like the others.
    for label, train_value, test_value in (
        ('R2', train_r2, test_r2),
        ('MSE', train_mse, test_mse),
        ('MAE', train_mae, test_mae),
    ):
        sns.barplot(x = [train_value, test_value], y = ['Train ' + label, 'Test ' + label], palette = palette)
        plt.show()

In [4]:
# Default estimator (LinearRegression) with 'price_aprox_usd' as the target.
train_test_error(
    dicotomical = [
        'pileta', 'galeria', 'seguridad', 'sotano', 'amenities',
        'terraza', 'metrobus', 'subte', 'cochera', 'consultorio',
    ],
    categorical = ['property_type', 'place_name'],
    numerical = ['surface_covered_in_m2', 'surface_total_in_m2', 'rooms_3'],
    target = 'price_aprox_usd',
)

NameError: name 'cf' is not defined

In [None]:
# Lasso with 'price_aprox_usd' as the target.
train_test_error(
    dicotomical = [
        'pileta', 'galeria', 'seguridad', 'sotano', 'amenities',
        'terraza', 'metrobus', 'subte', 'cochera', 'consultorio',
    ],
    categorical = ['property_type', 'place_name'],
    numerical = ['surface_covered_in_m2', 'surface_total_in_m2', 'rooms_3'],
    target = 'price_aprox_usd',
    Estimator = Lasso,
)

In [None]:
# Ridge with 'price_aprox_usd' as the target.
train_test_error(
    dicotomical = [
        'pileta', 'galeria', 'seguridad', 'sotano', 'amenities',
        'terraza', 'metrobus', 'subte', 'cochera', 'consultorio',
    ],
    categorical = ['property_type', 'place_name'],
    numerical = ['surface_covered_in_m2', 'surface_total_in_m2', 'rooms_3'],
    target = 'price_aprox_usd',
    Estimator = Ridge,
)

In [None]:
# Default estimator (LinearRegression) with 'price_usd_per_m2' as the target.
train_test_error(
    dicotomical = [
        'pileta', 'galeria', 'seguridad', 'sotano', 'amenities',
        'terraza', 'metrobus', 'subte', 'cochera', 'consultorio',
    ],
    categorical = ['property_type', 'place_name'],
    numerical = ['surface_covered_in_m2', 'surface_total_in_m2', 'rooms_3'],
    target = 'price_usd_per_m2',
)

In [None]:
# Lasso with 'price_usd_per_m2' as the target.
train_test_error(
    dicotomical = [
        'pileta', 'galeria', 'seguridad', 'sotano', 'amenities',
        'terraza', 'metrobus', 'subte', 'cochera', 'consultorio',
    ],
    categorical = ['property_type', 'place_name'],
    numerical = ['surface_covered_in_m2', 'surface_total_in_m2', 'rooms_3'],
    target = 'price_usd_per_m2',
    Estimator = Lasso,
)

In [None]:
# Ridge with 'price_usd_per_m2' as the target.
train_test_error(
    dicotomical = [
        'pileta', 'galeria', 'seguridad', 'sotano', 'amenities',
        'terraza', 'metrobus', 'subte', 'cochera', 'consultorio',
    ],
    categorical = ['property_type', 'place_name'],
    numerical = ['surface_covered_in_m2', 'surface_total_in_m2', 'rooms_3'],
    target = 'price_usd_per_m2',
    Estimator = Ridge,
)