In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# This lesson is part of the DS Lab core curriculum, so the notebook can only be run on a WQU virtual machine.
# For that reason the dataset is not uploaded here; each task is shown below, followed by its answer.

In [1]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted
warnings.simplefilter(action="ignore", category=FutureWarning)

In this lesson, we're going to build on the work we did in the previous lesson. We're going to create a more complex wrangle function, use it to clean more data, and build a model that considers more features when predicting apartment price.

# Prepare Data

In [None]:
def wrangle(filepath):
    """Load one Buenos Aires real-estate CSV and return a cleaned DataFrame.

    Cleaning steps, in order:

    1. Keep only rows that are apartments, whose "place_with_parent_names"
       contains "Capital Federal", and whose "price_aprox_usd" is below
       400,000. Rows with a missing place name are excluded.
    2. Keep only rows whose "surface_covered_in_m2" lies between the 10th
       and 90th percentile (inclusive) of the rows left after step 1.
    3. Replace the "lat-lon" string column with two float columns, "lat"
       and "lon", appended at the end of the frame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels are kept), without the
        "lat-lon" column and with "lat" and "lon" as the last two columns.
    """
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000.
    # na=False keeps the mask strictly boolean when a place name is missing.
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal", na=False)
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    # Take an explicit copy so the column assignment below writes to our own
    # frame instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df[mask_area].copy()

    # Split the "lat-lon" column into separate float columns
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df = df.drop(columns="lat-lon")

    return df

Task 2.2.1: Use your wrangle function to create a DataFrame frame1 from the CSV file data/buenos-aires-real-estate-1.csv.

In [None]:
# Clean the first CSV file with `wrangle`
frame1 = wrangle('data/buenos-aires-real-estate-1.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
frame1.info()
frame1.head()

In [None]:
# Check your work: `frame1` must have 1343 rows and 17 columns
assert (
    frame1.shape[0] == 1343
), f"`frame1` should have 1343 rows, not {frame1.shape[0]}."
# The closing quote of this f-string was missing, which made the whole cell a SyntaxError
assert frame1.shape[1] == 17, f"`frame1` should have 17 columns, not {frame1.shape[1]}."

For our model, we're going to consider apartment location, specifically, latitude and longitude. Looking at the output from frame1.info(), we can see that the location information is in a single column where the data type is object (pandas term for str in this case). In order to build our model, we need latitude and longitude to each be in their own column where the data type is float.

Task 2.2.2: Add to the wrangle function below so that, in the DataFrame it returns, the "lat-lon" column is replaced by separate "lat" and "lon" columns. Don't forget to also drop the "lat-lon" column. Be sure to rerun all the cells above before you continue.

In [None]:
 # Answer to Task 2.2.2: the `wrangle` definition above already performs this
 # split just before its `return df`. The statements are shown here as comments
 # only, because run as a stand-alone cell the indented lines raise an
 # IndentationError and `df` does not exist until Task 2.2.4.
 #
 #     # split the lat-lon column
 #     df[['lat','lon']]=df['lat-lon'].str.split(',',expand=True).astype(float)
 #     df.drop(columns='lat-lon',inplace=True)

Task 2.2.3: Use your revised wrangle function to create a DataFrame frame2 from the file data/buenos-aires-real-estate-2.csv.

In [None]:
# Clean the second CSV file with the same `wrangle` function used for frame1
frame2 = wrangle('data/buenos-aires-real-estate-2.csv')

In [None]:
# Check your work: `frame2` must have 1315 rows and 17 columns.
# The failure messages previously named `frame1`, pointing at the wrong DataFrame.
assert (
    frame2.shape[0] == 1315
), f"`frame2` should have 1315 rows, not {frame2.shape[0]}."
assert frame2.shape[1] == 17, f"`frame2` should have 17 columns, not {frame2.shape[1]}."

Task 2.2.4: Use pd.concat to concatenate frame1 and frame2 into a new DataFrame df. Make sure you set the ignore_index argument to True.

In [None]:
# Stack both cleaned frames; ignore_index=True renumbers the rows 0..n-1
df = pd.concat([frame1,frame2],ignore_index=True)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()
df.head()

In [None]:
# Check your work: 1343 rows from frame1 plus 1315 from frame2 give 2658 rows, still 17 columns
assert df.shape == (2658, 17), f"`df` is the wrong size: {df.shape}"

# Explore

In the last lesson, we built a simple linear model that predicted apartment price based on one feature, "surface_covered_in_m2". In this lesson, we're building a multiple linear regression model that predicts price based on two features, "lon" and "lat". This means that our data visualizations now have to communicate three pieces of information: Longitude, latitude, and price. How can we represent these three attributes on a two-dimensional screen?

One option is to incorporate color into our scatter plot. For example, in the Mapbox scatter plot below, the location of each point represents latitude and longitude, and color represents price.

Task 2.2.5: Complete the code below to create a Mapbox scatter plot that shows the location of the apartments in df

In [None]:
# Plot every apartment on a street map; point colour encodes the price
fig = px.scatter_mapbox(
    data_frame=df,
    lat="lat",
    lon="lon",
    color="price_aprox_usd",
    hover_data=["price_aprox_usd"],  # show the price in the hover tooltip
    width=600,
    height=600,
)

# Use OpenStreetMap tiles as the base map
fig.update_layout(mapbox_style="open-street-map")

fig.show()

Another option is to add a third dimension to our scatter plot. We can plot longitude on the x-axis and latitude on the y-axis (like we do in the map above), and then add a z-axis with price.

Task 2.2.6: Complete the code below to create a 3D scatter plot, with "lon" on the x-axis, "lat" on the y-axis, and "price_aprox_usd" on the z-axis.

In [None]:
# 3D scatter: longitude (x) and latitude (y) against price (z)
fig = px.scatter_3d(
    data_frame=df,
    x="lon",
    y="lat",
    z="price_aprox_usd",
    width=600,
    height=500,
    labels={"lon": "longitude", "lat": "latitude", "price_aprox_usd": "price"},
)

# Smaller markers with a dark outline; applied only to traces drawn in "markers" mode
fig.update_traces(
    selector=dict(mode="markers"),
    marker=dict(size=4, line=dict(width=2, color="DarkSlateGrey")),
)

# Render the figure
fig.show()

# Split

Even though we're building a different model, the steps we follow will be the same. Let's separate our features (latitude and longitude) from our target (price).

Task 2.2.7: Create the feature matrix named X_train. It should contain two features: ["lon", "lat"]

In [None]:
# Feature matrix: longitude and latitude, in this column order
features = ["lon", "lat"]
X_train = df[features]
X_train.shape

Task 2.2.8: Create the target vector named y_train, which you'll use to train your model. Your target should be "price_aprox_usd". Remember that, in most cases, your target vector should be one-dimensional.

In [None]:
# Target vector: selecting a single column by name yields a one-dimensional Series
target = "price_aprox_usd"
y_train = df[target]
y_train.shape

# Build Model

**Baseline**

Again, we need to set a baseline so we can evaluate our model's performance. You'll notice that the value of y_mean is not exactly the same as it was in the previous lesson. That's because we've added more observations to our training data.

Task 2.2.9: Calculate the mean of your target vector y_train and assign it to the variable y_mean.

In [None]:
# Mean apartment price; this is the value the baseline "model" always predicts
y_mean = y_train.mean()
y_mean


Task 2.2.10: Create a list named y_pred_baseline that contains the value of y_mean repeated so that it's the same length as y_train.

In [None]:
# Baseline predictions: the mean price repeated once per observation in y_train
y_pred_baseline = [y_mean]*len(y_train)
y_pred_baseline[:5]

Task 2.2.11: Calculate the baseline mean absolute error for your predictions in y_pred_baseline as compared to the true targets in y_train.

In [None]:
# Mean absolute error obtained by always predicting the mean price
mae_baseline = mean_absolute_error(y_train,y_pred_baseline)

# Report both numbers rounded to two decimals
print("Mean apt price", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

# Iterate

Take a moment to scroll up to the output for df.info() and look at the values in the "Non-Null Count" column. Because of the math it uses, a linear regression model can't handle observations where there are missing values. Do you see any columns where this will be a problem?

In the last project, we simply dropped rows that contained NaN values, but this isn't ideal. Models generally perform better when they have more data to train with, so every row is precious. Instead, we can fill in these missing values using information we get from the whole column — a process called imputation. There are many different strategies for imputing missing values, and one of the most common is filling in the missing values with the mean of the column.

In addition to predictors like LinearRegression, scikit-learn also has transformers that help us deal with issues like missing values. Let's see how one works, and then we'll add it to our model.

Task 2.2.12: Instantiate a SimpleImputer named imputer.

In [None]:
# Imputer with scikit-learn's default settings
imputer = SimpleImputer()

In [None]:
# Check your work: `imputer` must be a SimpleImputer instance
assert isinstance(imputer, SimpleImputer)

Just like a predictor, a transformer has a fit method. In the case of our SimpleImputer, this is the step where it calculates the mean values for each numerical column.

Task 2.2.13: Fit your transformer imputer to the feature matrix X_train.

In [None]:
# Fit the imputer on the training features
imputer.fit(X_train)

In [None]:
# Check your work: check_is_fitted raises if `imputer` has not been fitted yet
check_is_fitted(imputer)

Task 2.2.14: Use your imputer to transform the feature matrix X_train. Assign the transformed data to the variable XT_train.

In [None]:
# Apply the fitted imputer to the training features
XT_train = imputer.transform(X_train)
# Wrap the result in a DataFrame (reusing X_train's column names) only to inspect it with .info()
pd.DataFrame(XT_train, columns=X_train.columns).info()

In [None]:
# Check your work: the shape is still (2658, 2) and no NaN values remain
assert XT_train.shape == (2658, 2), f"`XT_train` is the wrong shape: {XT_train.shape}"
assert (
    np.isnan(XT_train).sum() == 0
), "Your feature matrix still has `NaN` values. Did you forget to transform is using `imputer`?"

Okay! Our data is free of missing values, and we have a good sense for how predictors work in scikit-learn. However, the truth is you'll rarely do data transformations this way. Why? A model may require multiple transformers, and doing all those transformations one-by-one is slow and likely to lead to errors. 🤦‍♀️ Instead, we can combine our transformer and predictor into a single object called a pipeline

Task 2.2.15: Create a pipeline named model that contains a SimpleImputer transformer followed by a LinearRegression predictor.

In [None]:
# Pipeline: impute missing feature values, then fit a linear regression
model = make_pipeline(SimpleImputer(), LinearRegression())

In [None]:
# Check your work: `model` must be a scikit-learn Pipeline
assert isinstance(model, Pipeline), "Did you instantiate your model?"

Task 2.2.16: Fit your model to the data, X_train and y_train.

In [None]:
# Fit the whole pipeline (imputer, then regression) on the training data
model.fit(X_train,y_train)

In [None]:
# Check your work: look the regression step up by name and confirm it has been fitted
check_is_fitted(model["linearregression"])

# Evaluate

As always, we'll start by evaluating our model's performance on the training data

Task 2.2.17: Using your model's predict method, create a list of predictions for the observations in your feature matrix X_train. Name this list y_pred_training.

In [None]:
# Predictions for the same observations the model was trained on
y_pred_training = model.predict(X_train)

In [None]:
# Check your work: expect one prediction for each of the 2658 training rows
assert y_pred_training.shape == (2658,)

Task 2.2.18: Calculate the training mean absolute error for your predictions in y_pred_training as compared to the true targets in y_train.

In [None]:
# Training error of the fitted pipeline, to compare against the baseline MAE
mae_training = mean_absolute_error(y_train,y_pred_training)
print("Training MAE:", round(mae_training, 2))

Training MAE: 42962.72
It looks like our model performs a little better than the baseline. This suggests that latitude and longitude aren't as strong predictors of price as size is.

Now let's check our test performance. Remember, once we test our model, there's no more iteration allowed.

# Communicate Results

Let's take a look at the equation our model has come up with for predicting price based on latitude and longitude. We'll need to expand on our formula to account for both features.

Equation: $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2$, where $x_1$ is longitude and $x_2$ is latitude.

In [None]:
# Task 2.2.20: Extract the intercept and coefficients for your model.

In [None]:
# Read the fitted parameters off the regression step, rounded to whole numbers
intercept = model["linearregression"].intercept_.round()
coefficients = model["linearregression"].coef_.round()
(intercept, coefficients)

Task 2.2.21: Complete the code below and run the cell to print the equation that your model has determined for predicting apartment price based on latitude and longitude

In [None]:
# coefficients[0] pairs with "lon" and coefficients[1] with "lat", matching the column order of `features`
print(
    
    f"price = {intercept } + ({coefficients[0]} * longitude) + ({coefficients[1]} * latitude)"
)
# Example output (presumably from the author's run):
# price = 38113587.0 + (196709.0 * longitude) + (765467.0 * latitude)

Task 2.2.22: Complete the code below to create a 3D scatter plot, with "lon" on the x-axis, "lat" on the y-axis, and "price_aprox_usd" on the z-axis.

In [None]:
# Create 3D scatter plot of the observed data
fig = px.scatter_3d(
    df,
    x="lon",
    y="lat",
    z="price_aprox_usd",
    labels={"lon": "longitude", "lat": "latitude", "price_aprox_usd": "price"},
    width=600,
    height=500,
)

# Create x and y coordinates for model representation:
# a 10 x 10 grid spanning the observed longitude/latitude range
x_plane = np.linspace(df["lon"].min(), df["lon"].max(), 10)
y_plane = np.linspace(df["lat"].min(), df["lat"].max(), 10)
xx, yy = np.meshgrid(x_plane, y_plane)

# Use model to predict z coordinates at EVERY grid node.
# Predicting only on the paired (x_plane[i], y_plane[i]) values and tiling the
# result would draw a surface that ignores latitude instead of the fitted plane.
grid = pd.DataFrame({"lon": xx.ravel(), "lat": yy.ravel()})
z_plane = model.predict(grid)
zz = z_plane.reshape(xx.shape)

# Add plane to figure
fig.add_trace(go.Surface(x=xx, y=yy, z=zz))

# Refine formatting (applies only to the scatter trace drawn in "markers" mode)
fig.update_traces(
    marker={"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}},
    selector={"mode": "markers"},
)

# Display figure
fig.show()