In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

# Ignore every FutureWarning raised from here on so it does not clutter cell output.
warnings.simplefilter(action="ignore", category=FutureWarning)


In [None]:
In the last lesson, we created a model that used location — 
represented by latitude and longitude — to predict price. 
In this lesson, we're going to use a different representation for 
location: neighborhood.

Task 2.3.1: Use glob to create a list that contains the filenames for all the Buenos Aires real estate CSV files in the data directory. Assign this list to the variable name files.

In [None]:
# `glob` returns matches in arbitrary, filesystem-dependent order. Sort the
# paths so the row order of the concatenated DataFrame (and therefore the
# order of the one-hot encoded columns) is the same on every machine.
files = sorted(glob("data/buenos-aires-real-estate-*.csv"))
files

In [None]:
# Check your work: the glob pattern must have matched exactly five CSV files.
expected_file_count = 5
assert (
    len(files) == expected_file_count
), f"`files` should contain 5 items, not {len(files)}"

Task 2.3.2: Use your wrangle function in a for loop to create a list named frames. The list should contain the cleaned DataFrames created from the CSV filenames you collected in files.

In [None]:
# NOTE: `wrangle` is defined in a later cell of this notebook, so that cell
# has to be run before this one (otherwise this raises NameError).
#
# Clean each CSV and collect the results. The cleaned frame is appended
# directly instead of being bound to `df`: `df` is the name used further down
# for the combined DataFrame, and leaving a single-file `df` in the kernel
# would let later cells silently run on partial data.
frames = []
for file in files:
    frames.append(wrangle(file))

len(frames)

In [None]:
# Check your work: five items, each of them a DataFrame.
assert len(frames) == 5, f"`frames` should contain 5 items, not {len(frames)}"
assert all(
    isinstance(frame, pd.DataFrame) for frame in frames
), "The items in `frames` should all be DataFrames."

Task 2.3.3: Use pd.concat to concatenate the items in frames into a single DataFrame df. Make sure you set the ignore_index argument to True

In [None]:
# Stack the per-file frames into a single DataFrame with a fresh 0..n-1 index.
df = pd.concat(frames, ignore_index=True)

# Only the last expression of a cell is displayed, so print the shape
# explicitly and leave `df.head()` as the final expression.
# Shape recorded in the original run: (6582, 17)
print(df.shape)
df.head()

In [None]:
# Check your work: the combined frame should have 6,582 rows.
expected_row_count = 6582
assert len(df) == expected_row_count, f"`df` is the wrong size: {len(df)}."

# Explore

Looking through the output from the df.head() call above, there's a little bit more cleaning we need to do before we can work with the neighborhood information in this dataset. The good news is that, because we're using a wrangle function, we only need to change the function to re-clean all of our CSV files. This is why functions are so useful.

Task 2.3.4: Modify your wrangle function to create a new feature "neighborhood". You can find the neighborhood for each property in the "place_with_parent_names" column. For example, a property with the place name "|Argentina|Capital Federal|Palermo|" is located in the neighborhood "Palermo". Also, your function should drop the "place_with_parent_names" column.

Be sure to rerun all the cells above before you continue.

In [None]:
def wrangle(filepath):
    """Read one real-estate CSV and return a cleaned DataFrame.

    Steps, in order:

    1. Keep only rows whose "place_with_parent_names" contains
       "Capital Federal", whose "property_type" is "apartment" and whose
       "price_aprox_usd" is below 400,000.
    2. Keep only rows whose "surface_covered_in_m2" lies between the 10th and
       90th percentile (inclusive) of the rows that survived step 1.
    3. Split "lat-lon" into float columns "lat" and "lon" and drop "lat-lon".
    4. Create "neighborhood" from the fourth "|"-separated piece of
       "place_with_parent_names" and drop "place_with_parent_names".

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The filtered rows (original index labels preserved) with the new
        "lat", "lon" and "neighborhood" columns appended, in that order.
    """
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal")
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    # Take an explicit copy: the column assignments below would otherwise
    # write to a slice of the original frame (SettingWithCopyWarning).
    df = df[mask_area].copy()

    # Split "lat-lon" column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df = df.drop(columns="lat-lon")

    # Extract neighborhood: "|Argentina|Capital Federal|Palermo|" -> "Palermo"
    # (piece 0 is the empty string before the leading "|").
    df["neighborhood"] = df["place_with_parent_names"].str.split("|", expand=True)[3]
    df = df.drop(columns="place_with_parent_names")

    return df

# Split

In [None]:
# Feature matrix: the single "neighborhood" column; target vector: the USD price.
target = "price_aprox_usd"
features = ["neighborhood"]
X_train, y_train = df[features], df[target]

In [None]:
# Check your work: both objects must cover all 6,582 observations.
n_observations = 6582
assert X_train.shape == (
    n_observations,
    1,
), f"`X_train` is the wrong size: {X_train.shape}."
assert y_train.shape == (
    n_observations,
), f"`y_train` is the wrong size: {y_train.shape}."

# Build Model

Task 2.3.6: Calculate the baseline mean absolute error for your model.

In [None]:
# Naive baseline: predict the mean training price for every observation.
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
print("Mean apt price:", y_mean)

baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print("Baseline MAE:", baseline_mae)

# Iterate

Task 2.3.7: First, instantiate a OneHotEncoder named ohe. Make sure to set the use_cat_names argument to True. Next, fit your transformer to the feature matrix X_train. Finally, use your encoder to transform the feature matrix X_train, and assign the transformed data to the variable XT_train.

In [None]:
# Fit the one-hot encoder on the training features, then encode those same features.
ohe = OneHotEncoder(use_cat_names=True)
XT_train = ohe.fit(X_train).transform(X_train)
print(XT_train.shape)
XT_train.head()

Now that we have an idea for how the OneHotEncoder works, let's bring it into our pipeline.

Task 2.3.8: Create a pipeline named model that contains a OneHotEncoder transformer and a LinearRegression predictor. Then fit your model to the training data.

In [None]:
# Pipeline: one-hot encode the features, then fit an ordinary linear regression.
encoder_step = OneHotEncoder(use_cat_names=True)
regression_step = LinearRegression()
model = make_pipeline(encoder_step, regression_step)
model.fit(X_train, y_train)

In [None]:
# Check your work: the last pipeline step (the predictor) must be fitted.
final_estimator = model[-1]
check_is_fitted(final_estimator)

# Evaluate

Task 2.3.9: First, create a list of predictions for the observations in your feature matrix X_train. Name this list y_pred_training. Then calculate the training mean absolute error for your predictions in y_pred_training as compared to the true targets in y_train.

In [None]:
# Predict on the training data and score the predictions against the true prices.
y_pred_training = model.predict(X_train)
mae_training = mean_absolute_error(y_true=y_train, y_pred=y_pred_training)
print("Training MAE:", round(mae_training, 2))

In [None]:
Task 2.3.11: Extract the intercept and coefficients for your model.

In [None]:
# Pull the fitted parameters out of the pipeline's regression step.
regression_step = model.named_steps['linearregression']
intercept, coefficients = regression_step.intercept_, regression_step.coef_
print("coefficients len:", len(coefficients))
print(coefficients[:5])  # First five coefficients

In [None]:
# Check your work
assert isinstance(
    intercept, float
), f"`intercept` should be a `float`, not {type(intercept)}."
# The message now names the type that is actually checked (np.ndarray); it
# previously claimed `coefficients` should be a `float`.
assert isinstance(
    coefficients, np.ndarray
), f"`coefficients` should be a `np.ndarray`, not {type(coefficients)}."
# One coefficient per one-hot encoded column.
assert coefficients.shape == (
    57,
), f"`coefficients` is wrong shape: {coefficients.shape}."

Task 2.3.12: Extract the feature names of your encoded data from the OneHotEncoder in your model.

In [None]:
# Ask the pipeline's encoder step for the names of the columns it produced.
encoder_step = model.named_steps['onehotencoder']
feature_names = encoder_step.get_feature_names()
print("features len:", len(feature_names))
print(feature_names[:5])  # First five feature names

In [None]:
# Check your work
# The failure message must report the object under test (`feature_names`).
# It previously interpolated `type(features)`, the list of input column
# names, so it would always have reported `list`.
assert isinstance(
    feature_names, list
), f"`feature_names` should be a `list`, not {type(feature_names)}."
assert len(feature_names) == len(
    coefficients
), "You should have the same number of features and coefficients."

Task 2.3.13: Create a pandas Series named feat_imp where the index is your features and the values are your coefficients

In [None]:
# Label each coefficient with the encoded feature it belongs to.
feat_imp = pd.Series(data=coefficients, index=feature_names)
feat_imp.head()

In [None]:
#neighborhood_Recoleta            6.157563e+17
#neighborhood_Monserrat           6.157563e+17
#neighborhood_Belgrano            6.157563e+17
#neighborhood_Villa del Parque    6.157563e+17
#neighborhood_Villa Pueyrredón    6.157563e+17
#dtype: float64

In [None]:
# Check your work
# The messages now name the type and variable that are actually checked:
# `pd.Series` (previously "float") and `feature_names` (previously "features").
assert isinstance(
    feat_imp, pd.Series
), f"`feat_imp` should be a `pd.Series`, not {type(feat_imp)}."
assert feat_imp.shape == (57,), f"`feat_imp` is wrong shape: {feat_imp.shape}."
assert all(
    a == b for a, b in zip(sorted(feature_names), sorted(feat_imp.index))
), "The index of `feat_imp` should be identical to `feature_names`."

Task 2.3.14: Run the cell below to print the equation that your model has determined for predicting apartment price based on neighborhood.

In [None]:
# Build the model equation line by line: the intercept first, then one
# "+ (coefficient * feature)" term per encoded feature, and print it in one go.
equation_lines = [f"price = {intercept.round(2)}"]
equation_lines.extend(f"+ ({round(c, 2)} * {f})" for f, c in feat_imp.items())
print("\n".join(equation_lines))

price = -6.157563196756461e+17
+ (6.157563196758367e+17 * neighborhood_Recoleta)
+ (6.157563196757453e+17 * neighborhood_Monserrat)
+ (6.157563196758118e+17 * neighborhood_Belgrano)
+ (6.157563196757504e+17 * neighborhood_Villa del Parque)
+ (6.157563196757568e+17 * neighborhood_Villa Pueyrredón)
+ (6.15756319675768e+17 * neighborhood_Almagro)
+ (6.15756319675812e+17 * neighborhood_Palermo)
+ (6.157563196757443e+17 * neighborhood_)
+ (6.157563196757554e+17 * neighborhood_Tribunales)
+ (6.157563196757522e+17 * neighborhood_Balvanera)
+ (6.157563196758194e+17 * neighborhood_Barrio Norte)
+ (6.157563196757608e+17 * neighborhood_Once)
+ (6.157563196757691e+17 * neighborhood_San Telmo)
+ (6.157563196757146e+17 * neighborhood_Villa Lugano)
+ (6.157563196757763e+17 * neighborhood_Coghlan)
+ (6.157563196757583e+17 * neighborhood_Barracas)
+ (6.157563196757773e+17 * neighborhood_Villa Urquiza)
+ (6.15756319675769e+17 * neighborhood_Abasto)
+ (6.157563196757709e+17 * neighborhood_Villa Crespo)
+ (6.157563196757437e+17 * neighborhood_Villa Santa Rita)
+ (6.157563196758024e+17 * neighborhood_Colegiales)
+ (6.157563196757573e+17 * neighborhood_Paternal)
+ (6.157563196757725e+17 * neighborhood_Caballito)
+ (6.157563196757573e+17 * neighborhood_Parque Chacabuco)
+ (6.157563196757921e+17 * neighborhood_Retiro)
+ (6.157563196757686e+17 * neighborhood_Villa Devoto)
+ (6.157563196757644e+17 * neighborhood_Villa Luro)
+ (6.157563196757533e+17 * neighborhood_San Nicolás)
+ (6.157563196757787e+17 * neighborhood_Saavedra)
+ (6.157563196757559e+17 * neighborhood_Flores)
+ (6.157563196757564e+17 * neighborhood_Centro / Microcentro)
+ (6.157563196757508e+17 * neighborhood_Liniers)
+ (6.15756319675753e+17 * neighborhood_San Cristobal)
+ (6.157563196757359e+17 * neighborhood_Boca)
+ (6.157563196757568e+17 * neighborhood_Congreso)
+ (6.157563196757583e+17 * neighborhood_Parque Centenario)
+ (6.157563196757299e+17 * neighborhood_Parque Chas)
+ (6.157563196758076e+17 * neighborhood_Nuñez)
+ (6.157563196757482e+17 * neighborhood_Parque Patricios)
+ (6.157563196757573e+17 * neighborhood_Boedo)
+ (6.157563196757496e+17 * neighborhood_Floresta)
+ (6.157563196757427e+17 * neighborhood_Mataderos)
+ (6.157563196758956e+17 * neighborhood_Puerto Madero)
+ (6.157563196757732e+17 * neighborhood_Villa General Mitre)
+ (6.157563196757637e+17 * neighborhood_Agronomía)
+ (6.15756319675753e+17 * neighborhood_Villa Ortuzar)
+ (6.157563196757614e+17 * neighborhood_Chacarita)
+ (6.157563196757316e+17 * neighborhood_Velez Sarsfield)
+ (6.157563196757606e+17 * neighborhood_Monte Castro)
+ (6.157563196758394e+17 * neighborhood_Las Cañitas)
+ (6.157563196757217e+17 * neighborhood_Constitución)
+ (6.157563196757329e+17 * neighborhood_Parque Avellaneda)
+ (6.15756319675691e+17 * neighborhood_Villa Soldati)
+ (6.157563196757588e+17 * neighborhood_Versalles)
+ (6.157563196757551e+17 * neighborhood_Villa Real)
+ (6.157563196757124e+17 * neighborhood_Pompeya)
+ (6.157563196757211e+17 * neighborhood_Catalinas)

In [None]:
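What is happening above is a symptom of the curse of dimensionality: one-hot encoding turned a single column into 57 features, and the unregularized linear regression responds with enormous coefficients (about 6e17) that almost exactly cancel against an equally enormous negative intercept.
To solve this, use a regularized model such as Ridge().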

Task 2.3.15: Scroll up, change the predictor in your model to Ridge, and retrain it. Then evaluate the model's training and test performance. Do you still have an overfitting problem? If not, extract the intercept and coefficients again (you'll need to change your code a little bit) and regenerate the model's equation. Does it look different than before?

In [None]:
# Same pipeline as before, but with a Ridge predictor in place of LinearRegression.
encoder_step = OneHotEncoder(use_cat_names=True)
ridge_step = Ridge()
model = make_pipeline(encoder_step, ridge_step)
model.fit(X_train, y_train)

Task 2.3.16: Create a horizontal bar chart that shows the top 15 coefficients for your model, based on their absolute value.

In [None]:
# Rebuild the importances from the model as it is fitted *now*. `feat_imp`
# was computed from the earlier LinearRegression pipeline, so plotting it
# after `model` has been refit with Ridge would chart the old coefficients.
# `model[-1]` is the final (predictor) step, whatever its class.
feat_imp = pd.Series(
    model[-1].coef_,
    index=model.named_steps['onehotencoder'].get_feature_names(),
)

# Sort by absolute value so the 15 largest effects (positive or negative)
# end up in the tail, then draw them as horizontal bars.
feat_imp.sort_values(key=abs).tail(15).plot(kind="barh")
plt.xlabel("Importance [USD]")
plt.ylabel("Feature")
plt.title("Feature Importance for Apartment Price");