<font size="+3"><strong>Hello Guys!</strong></font><br><font size="+2.5"><strong>Welcome to My Jupyter Lab</strong></font><br>
## Go through my PREDICTION MODEL for "Housing in Buenos Aires"

# Step 1: Import the Essential Packages

In [16]:
import warnings
from glob import glob
import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  # noqa F401
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)

# Step 2: Preparing Data
## We wrap the cleaning steps in a function,
because the same steps must be applied to every dataset we load.
## Data Wrangling with certain Requirements.
## Requirements:
1. Property should be in Capital Federal.
2. Property should be an Apartment.
3. Property Price should be less than "400000".
4. Remove the Outliers
5. Split "lat-lon" column into lat & lon.
6. Drop NaN values.
7. Remove Duplicate columns.

In [17]:
def wrangle(filepath):
    """Read one real-estate CSV file and return a cleaned DataFrame.

    Cleaning steps, in order:

    1. Keep rows whose ``place_with_parent_names`` contains
       "Capital Federal", whose ``property_type`` is "apartment" and whose
       ``price_aprox_usd`` is below 400,000.
    2. Keep rows whose ``surface_covered_in_m2`` lies between the 10th and
       90th percentile (inclusive) of the rows kept in step 1.
    3. Split ``lat-lon`` on "," into float columns ``lat`` and ``lon``.
    4. Take the fourth "|"-separated field of ``place_with_parent_names``
       as ``neighborhood``.
    5. Drop the source columns of steps 3-4 and the other columns that are
       not used downstream.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with the remaining original columns followed by
        ``lat``, ``lon`` and ``neighborhood``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    """
    df = pd.read_csv(filepath)

    # Subset data: apartments in "Capital Federal", less than 400,000.
    # na=False makes a missing place name count as "not in Capital Federal"
    # instead of leaving NaN in the boolean mask.
    mask_ba = df["place_with_parent_names"].str.contains(
        "Capital Federal", na=False
    )
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 400_000
    df = df[mask_ba & mask_apt & mask_price]

    # Subset data: remove outliers for "surface_covered_in_m2".
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    # Take an explicit copy: the column assignments below must write to an
    # independent frame, not to a slice of the frame read from disk.
    df = df[mask_area].copy()

    # Split "lat-lon" column into numeric latitude / longitude.
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    # Get place name (fourth "|"-separated field).
    df["neighborhood"] = df["place_with_parent_names"].str.split("|", expand=True)[3]

    # Drop the columns consumed above plus everything not used downstream.
    df = df.drop(
        columns=[
            "lat-lon",
            "place_with_parent_names",
            "floor",
            "expenses",
            "operation",
            "property_type",
            "currency",
            "properati_url",
            "price",
            "price_aprox_local_currency",
            "price_per_m2",
            "price_usd_per_m2",
            "surface_total_in_m2",
            "rooms",
        ]
    )

    return df

# Step 3: Collecting Data

In [None]:
# glob() returns matches in arbitrary, OS-dependent order; sorting keeps the
# file list (and therefore the row order of the combined data) reproducible.
files = sorted(glob("Desktop/Data science/buenos-aires-real-estate-*.csv"))
files

# Step 4: Uploading Data

In [19]:
# Clean every CSV with wrangle() and collect the resulting DataFrames.
# A comprehension avoids rebinding the notebook-level `df` on each iteration.
frames = [wrangle(file) for file in files]

# Step 5: Concatenating Datasets

In [None]:
# Stack the per-file DataFrames into one training set.
df = pd.concat(frames)
# DataFrame.info() writes its summary itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None".
df.info()
df.head()

# Step 6: Plot a Heat map for visualisation.

In [None]:
# Correlation between the numeric features only; the target column is
# excluded before the correlation matrix is computed.
numeric_features = df.select_dtypes("number").drop(columns="price_aprox_usd")
corr = numeric_features.corr()
sns.heatmap(corr)

# Step 7: Training data.

In [22]:
# Column the model predicts, and the columns it predicts from.
target = "price_aprox_usd"
feature = ["surface_covered_in_m2", "lat", "lon", "neighborhood"]

# Feature matrix and target vector, both taken from the combined DataFrame.
X_train = df[feature]
y_train = df[target]

# Step 8: Calculating the baseline mean absolute error for our model.

In [None]:
# Baseline: predict the mean training price for every observation.
y_mean = y_train.mean()
n_observations = len(y_train)
y_pred_baseline = [y_mean] * n_observations
mae = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt price:", round(y_mean, 2))
print("Baseline MAE:", mae)

# Step 9: Creating a pipeline that contains a OneHotEncoder, SimpleImputer, and Ridge predictor.

In [None]:
# Pipeline stages, applied in this order: one-hot encode, impute, Ridge.
encoder = OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer()
regressor = Ridge()

model = make_pipeline(encoder, imputer, regressor)
model.fit(X_train, y_train)

# Step 10: Calculating the training mean absolute error.

In [None]:
# Score the fitted model on the same data it was trained on.
y_pred_training = model.predict(X_train)
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", mae_training)

# Step 11: Create a function Make Prediction.

In [26]:
def make_prediction(area, lat, lon, neighborhood):
    """Return a formatted price prediction for a single apartment.

    The four arguments are placed in a one-row DataFrame whose columns are
    "surface_covered_in_m2", "lat", "lon" and "neighborhood", which is then
    passed to the notebook-level ``model``. The first predicted value,
    rounded to two decimals, is embedded in the returned string.
    """
    observation = pd.DataFrame(
        {
            "surface_covered_in_m2": area,
            "lat": lat,
            "lon": lon,
            "neighborhood": neighborhood,
        },
        index=[0],
    )
    predicted = model.predict(observation)
    prediction = predicted.round(2)[0]
    return f"Predicted apartment price: ${prediction}"

# Step 12: Example prediction.

In [None]:
# Example: a 110 m2 apartment in Villa Crespo.
make_prediction(area=110, lat=-34.60, lon=-58.46, neighborhood="Villa Crespo")

# Step 13: Creating an interactive widget for better user interaction.

In [28]:
# Training columns that supply each control's range and starting value.
area_values = X_train["surface_covered_in_m2"]
lat_values = X_train["lat"]
lon_values = X_train["lon"]

# Sliders span the observed min/max and start at the training mean.
area_slider = IntSlider(
    min=area_values.min(),
    max=area_values.max(),
    value=area_values.mean(),
)
lat_slider = FloatSlider(
    min=lat_values.min(),
    max=lat_values.max(),
    step=0.01,
    value=lat_values.mean(),
)
lon_slider = FloatSlider(
    min=lon_values.min(),
    max=lon_values.max(),
    step=0.01,
    value=lon_values.mean(),
)
# Dropdown lists every neighborhood seen in training, alphabetically.
neighborhood_dropdown = Dropdown(options=sorted(X_train["neighborhood"].unique()))

interact(
    make_prediction,
    area=area_slider,
    lat=lat_slider,
    lon=lon_slider,
    neighborhood=neighborhood_dropdown,
);

interactive(children=(IntSlider(value=53, description='area', min=30), FloatSlider(value=-34.599066860101324, …