In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
from glob import glob

import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
# Interactive dashboard
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

# Silence FutureWarning messages for the rest of the session
warnings.simplefilter("ignore", category=FutureWarning)


In the final lesson for this project, we're going to try to use all the features in our dataset to improve our model. This means that we'll have to do a more careful cleaning of the dataset and consider some of the finer points of linear models.

# Prepare Data

Task 2.4.1: Use glob to create a list that contains the filenames for all the Buenos Aires real estate CSV files in the data directory. Assign this list to the variable name files

In [None]:
# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# the file list -- and therefore the row order of the concatenated data --
# reproducible from run to run.
files = sorted(glob('data/buenos-aires-real-estate-*.csv'))
files
#output
['data/buenos-aires-real-estate-1.csv',
 'data/buenos-aires-real-estate-2.csv',
 'data/buenos-aires-real-estate-3.csv',
 'data/buenos-aires-real-estate-4.csv',
 'data/buenos-aires-real-estate-5.csv']

Task 2.4.2: Use your wrangle function in a list comprehension to create a list named frames. The list should contain the cleaned DataFrames for the filenames you collected in files.

In [None]:
# Clean every CSV with `wrangle`, keeping one DataFrame per file in `files` order
frames = list(map(wrangle, files))

In [None]:
# Check your work: five cleaned frames, each one a pandas DataFrame
assert len(frames) == 5, f"`frames` should contain 5 items, not {len(frames)}"
frames_are_dataframes = all(isinstance(frame, pd.DataFrame) for frame in frames)
assert frames_are_dataframes, "The items in `frames` should all be DataFrames."

Last step: Combine the DataFrames in frames into a single df.

Task 2.4.3: Use pd.concat to concatenate the items in frames into a single DataFrame df. Make sure you set the ignore_index argument to True.

In [None]:
# Stack the cleaned frames into a single DataFrame, discarding the per-file indexes
df = pd.concat(frames, ignore_index=True)
# `info()` prints its summary itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
df.head()

In [None]:
# Check your work: expected row count and an upper bound on the column count
expected_rows, max_cols = 6582, 17
assert len(df) == expected_rows, f"`df` has the wrong number of rows: {len(df)}"
assert df.shape[1] <= max_cols, f"`df` has too many columns: {df.shape[1]}"

# Explore

In [None]:
# Fraction of missing values in each column. It is printed explicitly because
# only the last expression of a cell is displayed automatically.
null_fraction = df.isnull().sum() / len(df)
print(null_fraction)

# Remove columns with a lot of null values. Reassigning (instead of
# inplace=True) and errors="ignore" keep this cell safe to re-run after the
# columns are already gone.
df = df.drop(columns=["floor", "expenses"], errors="ignore")

The next thing we need to look out for are categorical columns with low or high cardinality. If there's only one category in a column, it won't provide any unique information to our model. At the other extreme, columns where nearly every row has its own category won't help our model in identifying useful trends in the data.

Task 2.4.5: Calculate the number of unique values for each non-numeric feature in df.

In [None]:
# Number of distinct values in every non-numeric column. `nunique` has to be
# called: the bare attribute only displays the bound method, not the counts.
df.select_dtypes("object").nunique()
#output
operation           1
property_type       1
currency            2
properati_url    6582
neighborhood       57
dtype: int64
# Per the counts above: `operation` and `property_type` hold a single value
# (low cardinality) and `properati_url` is unique for every row (high
# cardinality). `currency` (2 values) is removed along with them.
# Reassigning with errors="ignore" keeps this cell safe to re-run.
df = df.drop(
    columns=["operation", "property_type", "currency", "properati_url"],
    errors="ignore",
)
Add this step to the wrangle function above.

It's also important for us to drop any columns that would constitute leakage, that is, features that were created using our target or that would give our model information that it won't have access to when it's deployed.

In [None]:
# Price-derived columns would leak the target into the features. They are the
# ones marked "#drop" in the listing below. errors="ignore" makes this a no-op
# when they have already been removed (e.g. inside `wrangle`).
leakage_cols = [
    "price",
    "price_aprox_local_currency",
    "price_per_m2",
    "price_usd_per_m2",
]
df = df.drop(columns=leakage_cols, errors="ignore")
sorted(df.columns)
#output
['lat',
 'lon',
 'neighborhood',
 #drop'price',
 #drop'price_aprox_local_currency',
 'price_aprox_usd',
 #drop'price_per_m2',
 #drop 'price_usd_per_m2',
 'rooms',
 'surface_covered_in_m2',
 'surface_total_in_m2']

In [None]:
# Check your work: rows are untouched and at most seven columns remain
expected_rows, max_cols = 6582, 7
assert len(df) == expected_rows, f"`df` has the wrong number of rows: {len(df)}"
assert df.shape[1] <= max_cols, f"`df` has too many columns: {df.shape[1]}"

Finally, the last issue we need to keep an eye out for is multicollinearity, that is, features in our feature matrix that are highly correlated with each other. A good way to detect this is to use a heatmap. Let's make one!

In [None]:
# Correlation between the numeric features only (the target is excluded)
numeric_features = df.select_dtypes("number").drop(columns="price_aprox_usd")
corr = numeric_features.corr()
sns.heatmap(corr)


Task 2.4.9: Modify your wrangle function to remove columns so that there are no strongly correlated features in your feature matrix

# Split Data

Task 2.4.10: Create your feature matrix X_train and target vector y_train. Your target is "price_aprox_usd". Your features should be all the columns that remain in the DataFrame you cleaned above.

In [None]:
# Name the target and the feature columns first, then slice `df` with them
target = "price_aprox_usd"
features = ["surface_covered_in_m2", "lat", "lon", "neighborhood"]

X_train = df[features]
y_train = df[target]

In [None]:
# Check your work: four feature columns and one target value per row
expected_X_shape = (6582, 4)
expected_y_shape = (6582,)
assert X_train.shape == expected_X_shape, f"`X_train` is the wrong size: {X_train.shape}."
assert y_train.shape == expected_y_shape, f"`y_train` is the wrong size: {y_train.shape}."

# Build Model

Task 2.4.11: Calculate the baseline mean absolute error for your model.

In [None]:
# Baseline: predict the mean training price for every apartment
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
print("Mean apt price:", round(y_mean, 2))

baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print("Baseline MAE:", baseline_mae)

# Iterate

Task 2.4.12: Create a pipeline named model that contains a OneHotEncoder, SimpleImputer, and Ridge predictor.

In [None]:
# Pipeline steps, in order: encode categoricals, fill missing values, fit Ridge
encoder = OneHotEncoder()
imputer = SimpleImputer()
regressor = Ridge()

model = make_pipeline(encoder, imputer, regressor)
model.fit(X_train, y_train)

In [None]:
# Check your work: the last pipeline step must have been fitted
final_step = model[-1]
check_is_fitted(final_step)

# Evaluate

Task 2.4.13: Calculate the training mean absolute error for your predictions as compared to the true targets in y_train.

In [None]:
# Score the fitted model on the same data it was trained on
y_pred_training = model.predict(X_train)
training_mae = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", training_mae)

Task 2.4.15: Create a function make_prediction that takes four arguments (area, lat, lon, and neighborhood) and returns your model's prediction for an apartment price.

In [None]:
def make_prediction(area, lat, lon, neighborhood):
    """Return the model's price prediction for one apartment as a message.

    Parameters
    ----------
    area
        Value placed in the "surface_covered_in_m2" column.
    lat, lon
        Values placed in the "lat" and "lon" columns.
    neighborhood
        Value placed in the "neighborhood" column.

    Returns
    -------
    str
        "Predicted apartment price: $<value>", where the value is the
        prediction of the notebook-level `model`, rounded to two decimals.
    """
    record = {
        "surface_covered_in_m2": area,
        "lat": lat,
        "lon": lon,
        "neighborhood": neighborhood,
    }
    # One-row frame whose column names match the training features
    X_new = pd.DataFrame(record, index=[0])
    predictions = model.predict(X_new).round(2)
    prediction = predictions[0]
    return f"Predicted apartment price: ${prediction}"

In [None]:
# Example: a 110 m2 apartment in Villa Crespo
make_prediction(area=110, lat=-34.60, lon=-58.46, neighborhood="Villa Crespo")

Task 2.4.16: Add your make_prediction to the interact widget below, run the cell, and then adjust the widget to see how predicted apartment price changes.

In [None]:
# Slider ranges and starting values come from the training features
area_col = X_train["surface_covered_in_m2"]
lat_col = X_train["lat"]
lon_col = X_train["lon"]
neighborhood_options = sorted(X_train["neighborhood"].unique())

interact(
    make_prediction,
    area=IntSlider(min=area_col.min(), max=area_col.max(), value=area_col.mean()),
    lat=FloatSlider(min=lat_col.min(), max=lat_col.max(), step=0.01, value=lat_col.mean()),
    lon=FloatSlider(min=lon_col.min(), max=lon_col.max(), step=0.01, value=lon_col.mean()),
    neighborhood=Dropdown(options=neighborhood_options),
);