In [1]:
# Im port libraries
import pandas as pd
# Plot Data
import matplotlib.pyplot as plt
# Heatmap
import seaborn as sns
# Import Metrics - Mean Absolute Error
from sklearn.metrics import mean_absolute_error
# Find file names that have similar structure
from glob import glob
# Import Imputer aka Transfomer (fill in missing information)
from sklearn.impute import SimpleImputer
# Another Transformer for Catergory data
from category_encoders import OneHotEncoder
# Import Linear Regression model and Ridge model aka Predictors. Select just one!!!
from sklearn.linear_model import LinearRegression, Ridge
# Create pipeline package from Transfomer and Predictor
from sklearn.pipeline import Pipeline, make_pipeline
# Model Fit Check
from sklearn.utils.validation import check_is_fitted
# Interactive dashboard
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact

# Prepare Data

In [2]:
# Build your `wrangle` function
def wrangle(filepath):
    # Import file
    df = pd.read_csv(filepath)
    # Mask the state = {Mexico City}
    mask_state = df["place_with_parent_names"].str.contains("Distrito Federal")
    # Mask only apartment
    mask_apt = df["property_type"] == "apartment"
    # Mask the price < $100k
    mask_price = df["price_aprox_usd"] < 100_000
    df = df[mask_state & mask_apt& mask_price]
    # Remove outliers top and bottom 10%
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    
    df = df[mask_area]
    # Separate lat-lon
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns = ["lat-lon"], inplace=True)
    
    # Create a borough column
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]
    
    # Drop 50% above null columns
    df.drop(columns = ["surface_total_in_m2", "price_usd_per_m2", "floor", "rooms", "expenses"], inplace=True)
    
    # Drop low and high collinearity columns
    df.drop(columns = ["operation", "property_type", "currency", "properati_url"], inplace = True)
    
    # Drop leaky columns
    df.drop(columns = ["price", "price_aprox_local_currency", "price_per_m2"], inplace=True)
    
    # Drop multicollinearity
    df.drop(columns = ["place_with_parent_names"], inplace=True)
    return df

In [3]:
# Create the list of files
files = glob("data/mexico-city-real-estate-*.csv")
files

['data/mexico-city-real-estate-5.csv',
 'data/mexico-city-real-estate-4.csv',
 'data/mexico-city-real-estate-1.csv',
 'data/mexico-city-real-estate-3.csv',
 'data/mexico-city-real-estate-2.csv']

In [4]:
df = pd.concat([wrangle(file) for file in files], ignore_index=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5473 entries, 0 to 5472
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   price_aprox_usd        5473 non-null   float64
 1   surface_covered_in_m2  5473 non-null   float64
 2   lat                    5149 non-null   float64
 3   lon                    5149 non-null   float64
 4   borough                5473 non-null   object 
dtypes: float64(4), object(1)
memory usage: 213.9+ KB
None


In [5]:
# Split data into feature matrix `X_train` and target vector `y_train`.
features = ["surface_covered_in_m2","lat","lon","borough"]
target = "price_aprox_usd"
X_train = df[features]
y_train = df[target]

# Build model

In [6]:
# Build baseline model
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print("Mean apt price:", y_mean)
print("Baseline MAE:", baseline_mae)

Mean apt price: 54246.53149826428
Baseline MAE: 17239.9394758883


In [7]:
# Build Model
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge()
)
# Fit model
model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['borough'], use_cat_names=True)),
                ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])

In [8]:
# Evaluate
y_pred_training = model.predict(X_train)
print("Training MAE:", mean_absolute_error(y_train, y_pred_training))

Training MAE: 14943.165824063904


# Deploy a model

In [9]:
# Predict function
def make_prediction(area, lat, lon, borough):
    data = {
        "surface_covered_in_m2": area,
        "lat": lat,
        "lon": lon,
        "borough": borough
    }
    df = pd.DataFrame(data, index = [0])
    prediction = model.predict(df).round(2)[0]
    return f"Predicted apartment price: ${prediction}"

In [10]:
X_test = pd.read_csv("data/mexico-city-test-features.csv")[features]
y_pred_test = pd.Series(model.predict(X_test))
y_pred_test.head()

0    53538.366480
1    53171.988369
2    34263.884179
3    53488.425607
4    68738.924884
dtype: float64

In [15]:
intercept = model.named_steps["ridge"].intercept_
coefficients = model.named_steps["ridge"].coef_
print("coefficients len:", len(coefficients))
print(coefficients[:5])  # First five coefficients

coefficients len: 18
[  291.65415588   478.90137462 -2492.22181412 13778.18887983
 10319.42980445]


In [16]:
feature_names = model.named_steps["onehotencoder"].get_feature_names()
print("features len:", len(feature_names))
print(feature_names[:5])  # First five feature names

features len: 18
['surface_covered_in_m2', 'lat', 'lon', 'borough_Benito Juárez', 'borough_Tlalpan']


In [17]:
feat_imp = pd.Series(coefficients, index=feature_names)
feat_imp.head()

surface_covered_in_m2      291.654156
lat                        478.901375
lon                      -2492.221814
borough_Benito Juárez    13778.188880
borough_Tlalpan          10319.429804
dtype: float64

In [18]:
print(f"price = {intercept.round(2)}")
for f, c in feat_imp.items():
    print(f"+ ({round(c, 2)} * {f})")

price = -222998.23
+ (291.65 * surface_covered_in_m2)
+ (478.9 * lat)
+ (-2492.22 * lon)
+ (13778.19 * borough_Benito Juárez)
+ (10319.43 * borough_Tlalpan)
+ (-13349.02 * borough_Iztapalapa)
+ (-6637.43 * borough_Gustavo A. Madero)
+ (-5609.92 * borough_Venustiano Carranza)
+ (405.4 * borough_Iztacalco)
+ (3737.56 * borough_Coyoacán)
+ (-350.53 * borough_Cuauhtémoc)
+ (1977.31 * borough_Miguel Hidalgo)
+ (2459.29 * borough_Azcapotzalco)
+ (-14166.87 * borough_Tláhuac)
+ (3275.12 * borough_Álvaro Obregón)
+ (-5925.67 * borough_La Magdalena Contreras)
+ (929.86 * borough_Xochimilco)
+ (9157.27 * borough_Cuajimalpa de Morelos)
