## 4. Predicting Price with Size, Location, and Neighborhood

In [1]:
# ! pip install seaborn

In [2]:
# ! pip install ipywidgets

In [3]:
from glob import glob

import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
from ipywidgets import interact, IntSlider, FloatSlider, Dropdown # Interactive dashboard 
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge 
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

### Import

In [4]:
def wrangle(filepath):
    df = pd.read_excel(filepath)

    # Subset data: houses 
    mask_house = df["property_type"] == "house"

    df = df[mask_house]

    # Remove outliers in "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area]

    # Split "lat-lon"
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand = True).astype(float)
    df.drop(columns = "lat-lon", inplace = True) # Drop the old column 

    # Extract neighborhood
    df["neighborhood"] = df["place_with_parent_names"].str.split("|", expand=True)[3]
    df.drop(columns = "place_with_parent_names", inplace = True)

    # Drop features with high null counts
    df.drop(columns = ["floor","expenses"], inplace = True)

    # Drop low, high cardinality categorical variables
    df.drop(columns = ["operation", "property_type", "currency"], inplace=True)

    # Drop leaky columns 
    df.drop(columns=["price", "price_aprox_local_currency", "price_per_m2", "price_usd_per_m2"],
            inplace = True)
    
    # Drop multicollinearity 
    df.drop(columns = ["surface_total_in_m2", "rooms"], inplace=True)

    # Drop n/a 
    df.dropna(inplace=True)

    return df 

In [5]:
files = glob("data/real-estate-*.xlsx")
files

['data/real-estate-1.xlsx',
 'data/real-estate-5.xlsx',
 'data/real-estate-4.xlsx',
 'data/real-estate-3.xlsx',
 'data/real-estate-2.xlsx']

In [6]:
# List comprehension 
frames = [wrangle(file) for file in files]

In [7]:
df = pd.concat(frames, ignore_index=True)
print(df.shape)
print(df.info())
df.head()

(6713, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6713 entries, 0 to 6712
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   price_aprox_usd        6713 non-null   float64
 1   surface_covered_in_m2  6713 non-null   float64
 2   lat                    6713 non-null   float64
 3   lon                    6713 non-null   float64
 4   neighborhood           6713 non-null   object 
dtypes: float64(4), object(1)
memory usage: 262.4+ KB
None


Unnamed: 0,price_aprox_usd,surface_covered_in_m2,lat,lon,neighborhood
0,410000.0,220.0,-34.6497,-58.658073,Morón
1,180000.0,135.0,-34.595709,-58.56695,Tres de Febrero
2,130000.0,140.0,-34.655209,-58.69732,Ituzaingó
3,230000.0,230.0,-34.652173,-58.577867,La Matanza
4,340000.0,200.0,-34.417629,-58.645755,Tigre


### Split data

In [8]:
target = "price_aprox_usd"
features = df.columns[df.columns != target].tolist()

y_train = df[target]
X_train = df[features]

### Baseline 

In [9]:
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
print("Mean house price:", round(y_mean,2))
print("Baseline MAE:", mean_absolute_error(y_train, y_pred_baseline))

Mean house price: 284032.22
Baseline MAE: 137146.33743033305


### Iterate 

In [10]:
# Instantiate
ohe = OneHotEncoder(use_cat_names=True)
# Fit
ohe.fit(X_train)
# Transform
XT_train = ohe.transform(X_train)
print(XT_train.shape)
XT_train.head()

(6713, 83)


Unnamed: 0,surface_covered_in_m2,lat,lon,neighborhood_Morón,neighborhood_Tres de Febrero,neighborhood_Ituzaingó,neighborhood_La Matanza,neighborhood_Tigre,neighborhood_Moreno,neighborhood_Lanús,...,neighborhood_Barrio Norte,neighborhood_Recoleta,neighborhood_Villa Soldati,neighborhood_Cañuelas,neighborhood_Villa Santa Rita,neighborhood_Pompeya,neighborhood_Monserrat,neighborhood_Parque Avellaneda,neighborhood_Constitución,neighborhood_Parque Centenario
0,220.0,-34.6497,-58.658073,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,135.0,-34.595709,-58.56695,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,140.0,-34.655209,-58.69732,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,230.0,-34.652173,-58.577867,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,200.0,-34.417629,-58.645755,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge(),
)

model.fit(X_train, y_train)
print(model.fit(X_train, y_train))

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['neighborhood'], use_cat_names=True)),
                ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])


### Evaluate

In [12]:
y_pred_training = model.predict(X_train)
print("Training MAE:", mean_absolute_error(y_train, y_pred_training))

Training MAE: 79873.04528994317


In [13]:
X_test = pd.read_excel("data/test-features.xlsx")
y_pred_test = pd.Series(model.predict(X_test))
y_pred_test.head()

0     80216.944973
1    253278.084411
2     31906.806885
3    119468.581377
4    149269.217847
dtype: float64

### Communicate Results

In [14]:
def make_prediction(area, lat, lon, neighborhood):
    data = {
        "surface_covered_in_m2": area,
        "lat": lat,
        "lon": lon,
        "neighborhood": neighborhood
    }
    df = pd.DataFrame(data, index=[0])
    prediction = model.predict(df).round(2)[0]
    return f"Predicted apartment price: ${prediction}"

In [15]:
make_prediction(110, -34.60, -58.46, "Villa Crespo")

'Predicted apartment price: $265556.21'

In [16]:
# Interactive dashboard
interact(
    make_prediction,
    area=IntSlider(
        min=X_train["surface_covered_in_m2"].min(),
        max=X_train["surface_covered_in_m2"].max(),
        value=X_train["surface_covered_in_m2"].mean(),
    ),
    lat=FloatSlider(
        min=X_train["lat"].min(),
        max=X_train["lat"].max(),
        step=0.01,
        value=X_train["lat"].mean(),
    ),
    lon=FloatSlider(
        min=X_train["lon"].min(),
        max=X_train["lon"].max(),
        step=0.01,
        value=X_train["lon"].mean(),
    ),
    neighborhood=Dropdown(options=sorted(X_train["neighborhood"].unique())),
);

interactive(children=(IntSlider(value=192, description='area', max=400, min=70), FloatSlider(value=-34.3918867…