In [None]:
def wrangle(filepath):
    """Load a Buenos Aires real-estate CSV and return a cleaned DataFrame.

    Cleaning steps:

    1. Keep only rows whose ``place_with_parent_names`` contains
       "Capital Federal", whose ``property_type`` is "apartment", and whose
       ``price_aprox_usd`` is below 400,000.
    2. Keep only rows whose ``surface_covered_in_m2`` lies between the 0.1
       and 0.9 quantiles (inclusive) of the remaining data.
    3. Split the ``"lat-lon"`` string column on "," into float ``lat`` and
       ``lon`` columns, then drop ``"lat-lon"``.

    Parameters
    ----------
    filepath : str or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The filtered frame with ``lat`` and ``lon`` columns added and
        ``lat-lon`` removed.
    """
    # Read CSV file
    df = pd.read_csv(filepath)

    # Subset data: Apartments in "Capital Federal", less than 400,000.
    # na=False makes rows with a missing place name an explicit False in the mask.
    mask_ba = df["place_with_parent_names"].str.contains("Capital Federal", na=False)
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: Remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    # .copy() gives an independent frame so the column assignment below does
    # not write into a view of the filtered data (SettingWithCopyWarning).
    df = df[mask_area].copy()

    # Split "lat-lon" into numeric columns. No .info() here: it returns None,
    # which would overwrite both new columns with None.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df = df.drop(columns="lat-lon")

    return df

In [None]:
## Use the wrangle function to create a DataFrame frame1 from the CSV file data/buenos-aires-real-estate-1.csv.
frame1 = wrangle("data/buenos-aires-real-estate-1.csv")
# DataFrame.info() prints its own summary and returns None, so it is not wrapped in print().
frame1.info()
frame1.head()

In [None]:
## Concatenate the two DataFrames into one
# NOTE(review): frame2 was not defined in any visible cell, so this cell failed on a fresh kernel.
# The path below assumes the second file follows the naming pattern of the first; confirm it.
frame2 = wrangle("data/buenos-aires-real-estate-2.csv")
df = pd.concat([frame1, frame2], ignore_index=True)  # ignore_index=True discards the existing indexes and renumbers the rows
# DataFrame.info() prints its own summary and returns None, so it is not wrapped in print().
df.info()
df.head()

In [None]:
## Multiple linear regression
# A linear regression model that uses two or more features

In [None]:
## splitting

In [None]:
## Create the feature matrix named X_train. It should contain two features: ["lon", "lat"]
# The order matches the task statement. scikit-learn expects any frame passed to
# predict() to present its columns in the same order as the frame used in fit().
features = ["lon", "lat"]
X_train = df[features]
X_train.head()

In [None]:
##Create the target vector named y_train, which you'll use to train your model. Your target should be "price_aprox_usd". Remember that, in most cases, your target vector should be one-dimensional.

In [None]:
# Target vector: select the single price column by name so y_train is one-dimensional
target = "price_aprox_usd"
y_train = df[target]
y_train

In [None]:
## Calculate the mean of the target vector y_train and assign it to the variable y_mean.
# This mean serves as the constant prediction of the baseline model.
y_mean = y_train.mean()
y_mean

In [None]:
## Baseline predictions: a list holding y_mean once for every observation in y_train
y_pred_baseline = [y_mean for _ in range(len(y_train))]
y_pred_baseline[:5]

In [None]:
## Baseline mean absolute error: the constant predictions in y_pred_baseline versus the true targets in y_train
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

# !s formats each value with str(), exactly as a multi-argument print() would
print(f"Mean apt price {round(y_mean, 2)!s}")
print(f"Baseline MAE: {round(mae_baseline, 2)!s}")

In [None]:
## Missing values can be filled in using information from the whole column, a process called imputation.
## Instantiate an imputer
imputer = SimpleImputer()

In [None]:
## Fit the transformer imputer to the feature matrix X_train.
imputer.fit(X_train)

In [None]:
# Check your work: confirm the imputer has been fitted before it is used to transform data
# NOTE(review): presumably sklearn.utils.validation.check_is_fitted, which raises if the estimator is unfitted; the import is not visible here
check_is_fitted(imputer)

In [None]:
##Here's where transformers diverge from predictors. Instead of using a method like predict, we use the transform method. This is the step where the transformer fills in the missing values with the means it's calculated. 

In [None]:
## Use the imputer to transform the feature matrix X_train. Assign the transformed data to the variable XT_train
XT_train = imputer.transform(X_train)
# Wrap the transformed data in a DataFrame with the original column labels so .info() can summarise it
pd.DataFrame(XT_train, columns=X_train.columns).info()

In [None]:
## Pipeline named model: a SimpleImputer transformer feeding a LinearRegression predictor
model = make_pipeline(SimpleImputer(), LinearRegression())

In [None]:
##With our pipeline assembled, we use the fit method, which will train the transformer, transform the data, then pass the transformed data to the predictor for training, all in one step


In [None]:
## Fit the pipeline to the training data: feature matrix X_train and target vector y_train
model.fit(X_train,y_train)

In [None]:
## Use the model's predict method to generate predictions for the observations in the feature matrix X_train
y_pred_training = model.predict(X_train)

In [None]:
## Training mean absolute error: the predictions in y_pred_training versus the true targets in y_train
mae_training = mean_absolute_error(y_train, y_pred_training)
# !s formats the value with str(), exactly as a multi-argument print() would
print(f"Training MAE: {round(mae_training, 2)!s}")

In [None]:
## Import the test data buenos-aires-test-features.csv into a DataFrame and generate a Series of predictions using the model
# Select the same feature columns, in the same order, as the training matrix (df[features])
X_test = pd.read_csv("data/buenos-aires-test-features.csv")[features]
y_pred_test = pd.Series(model.predict(X_test))
y_pred_test.head()

In [None]:
## Represent the fitted linear model as a plane in a 3D scatter plot
# Create 3D scatter plot of the observations
fig = px.scatter_3d(
    df,
    x="lon",
    y="lat",
    z="price_aprox_usd",
    labels={"lon": "longitude", "lat": "latitude", "price_aprox_usd": "price"},
    width=600,
    height=500,
)

# Create a 10 x 10 grid of x (longitude) and y (latitude) coordinates
# spanning the observed range
x_plane = np.linspace(df["lon"].min(), df["lon"].max(), 10)
y_plane = np.linspace(df["lat"].min(), df["lat"].max(), 10)
xx, yy = np.meshgrid(x_plane, y_plane)

# Use model to predict z at every grid node. Predicting only on the paired
# (x_plane[i], y_plane[i]) points and tiling the result would make the
# surface ignore yy entirely. Indexing with `features` presents the columns
# in the same order that was used to fit the model.
plane_grid = pd.DataFrame({"lon": xx.ravel(), "lat": yy.ravel()})[features]
z_plane = model.predict(plane_grid)
zz = z_plane.reshape(xx.shape)

# Add plane to figure
fig.add_trace(go.Surface(x=xx, y=yy, z=zz))

# Refine formatting
fig.update_traces(
    marker={"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}},
    selector={"mode": "markers"},
)

# Display figure
fig.show()