In [None]:
import warnings
from pathlib import Path

import catboost
import lightgbm as lgb
import mapie
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import xgboost
from IPython.display import clear_output, display
from pymongo import MongoClient
from pymongoarrow.api import find_pandas_all
from sklearn import (compose, dummy, ensemble, impute, linear_model, metrics,
                     model_selection, pipeline, preprocessing, svm, tree)
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

import creds

# Get data from MongoDB

In [None]:
# Filter on building condition.
# NOTE(review): this filter is defined but never handed to the find call
# below, which receives None instead - confirm whether it should be applied.
query = {"building_condition": "Good"}

# Connect to the MongoDB deployment identified by the URI stored in `creds`.
cluster = MongoClient(creds.Creds.URI)

# Read the `BE_houses` collection of the `test` database into a DataFrame.
# The query argument is None (presumably meaning "no filter" - verify against
# the pymongoarrow documentation).
df = find_pandas_all(cluster["test"]["BE_houses"], None)
df

# Show choropleth data

In [None]:
# GeoJSON polygons of the Belgian provinces, used as the map geometry.
BE_PROVINCES_GEOJSON_URL = "https://raw.githubusercontent.com/mathiasleroy/Belgium-Geographic-Data/master/dist/polygons/be-provinces-unk-WGS84.geo.json"

# Fail fast (instead of hanging or parsing an error page) when the download
# does not succeed.
provinces_response = requests.get(BE_PROVINCES_GEOJSON_URL, timeout=30)
provinces_response.raise_for_status()
BE_provinces = provinces_response.json()

# Per-province listing count and median price. The aggregation reads the
# numeric `list_price` column created by `assign`, so the `pd.to_numeric`
# conversion is actually applied to the values being summarised.
# NOTE: `list_price_mean` holds the *median*; the column name is kept because
# the plotting code below (and possibly other cells) refer to it.
aggregate = (
    df.assign(list_price=lambda frame: pd.to_numeric(frame["price"]))
    .groupby("province")
    .agg(
        list_price_count=("list_price", "count"),
        list_price_mean=("list_price", "median"),
    )
    .reset_index()
)

# One polygon per province, coloured by median price. `featureidkey` tells
# plotly which GeoJSON property is matched against the `province` column
# (assumes the province names in the data equal `properties.name` - TODO confirm).
fig = px.choropleth(
    aggregate,
    geojson=BE_provinces,
    locations="province",
    color="list_price_mean",
    featureidkey="properties.name",
    projection="mercator",
    color_continuous_scale="Magenta",
    labels={
        "list_price_mean": "Median Price",
        "list_price_count": "Number of Observations",
    },
    hover_data={"list_price_mean": ":.3s", "province": True, "list_price_count": True},
)

# All map-level settings in a single call. Previously coastlines were switched
# on in one call and off again in the next; the effective (final) state is kept.
fig.update_geos(
    showcountries=True,
    showcoastlines=False,
    showland=True,
    showframe=False,
    fitbounds="locations",
    projection_type="mercator",
)

# Title and fixed figure size.
fig.update_layout(
    title_text="Median House Prices by Province",
    autosize=False,
    width=800,
    height=600,
)


fig.show()

In [None]:
# Rows without a price cannot serve as training targets, so discard them.
df_no_null = df.dropna(subset="price")

# The model is trained on the base-10 logarithm of the price.
y = np.log10(df_no_null["price"])

# Features: drop every object-dtype column plus the target and the `_id` column.
X = df_no_null.drop(
    columns=df.select_dtypes("object").columns.tolist() + ["price", "_id"]
)

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
holdout_split = model_selection.train_test_split(
    X,
    y,
    random_state=45,
    test_size=0.2,
)
X_train, X_valid, y_train, y_valid = holdout_split

In [None]:
# Wrap both splits in CatBoost pools. No categorical features are declared.
catboost_train = catboost.Pool(data=X_train, label=y_train)
catboost_valid = catboost.Pool(data=X_valid, label=y_valid)

# RMSE-driven regressor; training stops once the validation pool has not
# improved for 100 rounds and the best iteration is kept.
model = catboost.CatBoostRegressor(loss_function="RMSE")
model.fit(
    catboost_train,
    eval_set=[catboost_valid],
    use_best_model=True,
    early_stopping_rounds=100,
    verbose=False,
)

# Conformal wrapper around the regressor using the "plus" method.
mapie_model = mapie.regression.MapieRegressor(model, method="plus")

# Fit the MAPIE wrapper on the training split.
mapie_model.fit(X_train, y_train)

# Point predictions together with prediction intervals for alpha=0.1.
y_pred, y_pis = mapie_model.predict(X_valid, alpha=0.1)

In [None]:
# Gather ground truth, interval bounds and point predictions (all on the log10
# scale), then raise 10 to that power to get back to the price scale.
log_scale_results = pd.DataFrame(
    {
        "y_valid": y_valid,
        "lower": y_pis[:, 0].flatten(),
        "upper": y_pis[:, 1].flatten(),
        "y_pred": y_pred,
    }
)
conformal_df = 10**log_scale_results

# Order observations by true value so the curve is monotone in the plot.
df_sorted = conformal_df.sort_values(by="y_valid")

# Shared x positions: simply the rank of each observation after sorting.
positions = range(df_sorted.shape[0])

# Truth and prediction as two scatter series.
for column, colour, legend_label in (
    ("y_valid", "green", "ground truth"),
    ("y_pred", "red", "predicted"),
):
    plt.scatter(positions, df_sorted[column], color=colour, label=legend_label)

# Shaded band between the lower and upper interval bounds.
plt.fill_between(
    positions,
    df_sorted["lower"],
    df_sorted["upper"],
    color="gray",
    alpha=0.2,
    label="Prediction Intervals",
)

plt.legend()

# Comparing different models

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that keeps only a fixed set of DataFrame columns.

    Built on sklearn's BaseEstimator and TransformerMixin so it can be placed
    inside a Pipeline. Fitting is a no-op; transforming returns a copy of the
    requested columns.

    Attributes:
        feature_names_in_ (list): Column labels to keep.
        n_features_in_ (int): How many labels were supplied at construction.

    Methods:
        fit(X, y=None): No-op, returns the transformer itself.
        transform(X, y=None): Returns a copy of the selected columns of X.
    """

    def __init__(self, feature_names_in_):
        """
        Store the column labels and their count.

        Args:
            feature_names_in_ (list): Column labels to keep.
        """
        self.feature_names_in_ = feature_names_in_
        # The count is taken once, here, from the labels passed in.
        self.n_features_in_ = len(feature_names_in_)

    def fit(self, X, y=None):
        """
        Do nothing: column selection has no state to learn.

        Args:
            X (DataFrame): Input data (unused).
            y (array-like, optional): Target values (unused). Defaults to None.

        Returns:
            self: This transformer, unchanged.
        """
        return self

    def transform(self, X, y=None):
        """
        Pick the configured columns out of the input frame.

        Args:
            X (DataFrame): Input data containing the configured columns.
            y (array-like, optional): Target values (unused). Defaults to None.

        Returns:
            DataFrame: A deep copy of X restricted to the selected columns.
        """
        selected = X.loc[:, self.feature_names_in_]
        return selected.copy(deep=True)

In [None]:
# NOTE(review): `prepare_data_for_modeling` is neither imported nor defined in
# the cells visible here - make sure it is available before this cell runs on a
# fresh kernel.
processed_df = prepare_data_for_modeling(df)

# Target and feature matrix for the model comparison below.
y = processed_df["list_price"]
X = processed_df.drop(columns=["list_price"])

# 80/20 hold-out split. The seed makes the split (and every score derived from
# it) reproducible across runs; 45 matches the seed of the earlier split in
# this notebook.
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=45
)

In [None]:
# Split the feature names into numeric and categorical groups by dtype.
# Only the column dtypes matter, so a few rows are enough to inspect.
dtype_probe = X_train.head()

numerical_columns = dtype_probe.select_dtypes(include="number").columns.to_list()
categorical_columns = dtype_probe.select_dtypes(include="category").columns.to_list()

In [None]:
# Numeric branch: pick the numeric columns, fill gaps with the median, then
# rescale with MinMaxScaler (the step is called "std_scaler" for historical
# reasons; the name is kept so existing parameter paths stay valid).
numerical_pipeline = pipeline.Pipeline(
    [
        ("num_selector", FeatureSelector(feature_names_in_=numerical_columns)),
        ("imputer", impute.SimpleImputer(strategy="median")),
        ("std_scaler", preprocessing.MinMaxScaler()),
    ]
)

# Categorical branch: pick the categorical columns, fill gaps with the most
# frequent value, then one-hot encode into a sparse matrix. Categories not
# seen during fitting are ignored at transform time.
one_hot_encoder = preprocessing.OneHotEncoder(
    sparse_output=True, handle_unknown="ignore"
)
categorical_pipeline = pipeline.Pipeline(
    [
        ("cat_selector", FeatureSelector(feature_names_in_=categorical_columns)),
        ("imputer", impute.SimpleImputer(strategy="most_frequent")),
        ("onehot", one_hot_encoder),
    ]
)

# Combine both branches into a single feature matrix; the branches may be
# processed in parallel on all available cores (n_jobs=-1).
data_preprocessing_pipeline = pipeline.FeatureUnion(
    [
        ("numerical_pipeline", numerical_pipeline),
        ("categorical_pipeline", categorical_pipeline),
    ],
    n_jobs=-1,
)

In [None]:
# Candidate regressors, shown here for reference. The baseline is a
# DummyRegressor: the target is a continuous price, so a classifier baseline
# would not be a meaningful point of comparison.
[
    linear_model.LinearRegression(),
    linear_model.SGDRegressor(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.RANSACRegressor(),
    linear_model.Lasso(),
    svm.SVR(),
    ensemble.GradientBoostingRegressor(),
    tree.DecisionTreeRegressor(),
    ensemble.RandomForestRegressor(),
    ensemble.ExtraTreesRegressor(),
    ensemble.AdaBoostRegressor(),
    catboost.CatBoostRegressor(silent=True),
    lgb.LGBMRegressor(verbose=-1),
    xgboost.XGBRegressor(verbosity=0),
    dummy.DummyRegressor(),
]

In [None]:
with warnings.catch_warnings():
    # FutureWarnings raised while fitting are silenced for the whole comparison.
    warnings.simplefilter(action="ignore", category=FutureWarning)

    # Candidate regressors. The baseline is a DummyRegressor (not a
    # classifier) because the target is a continuous price.
    MLA = [
        linear_model.LinearRegression(),
        linear_model.SGDRegressor(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.RANSACRegressor(),
        linear_model.Lasso(),
        svm.SVR(),
        ensemble.GradientBoostingRegressor(),
        tree.DecisionTreeRegressor(),
        ensemble.RandomForestRegressor(),
        ensemble.ExtraTreesRegressor(),
        ensemble.AdaBoostRegressor(),
        catboost.CatBoostRegressor(silent=True),
        lgb.LGBMRegressor(verbose=-1),
        xgboost.XGBRegressor(verbosity=0),
        dummy.DummyRegressor(),
    ]

    # note: this is an alternative to train_test_split
    cv_split = model_selection.ShuffleSplit(
        n_splits=10, test_size=0.3, train_size=0.6, random_state=0
    )  # run model 10x with 60/30 split intentionally leaving out 10%

    # Columns of the comparison table.
    MLA_columns = [
        "MLA Name",
        "MLA Parameters",
        "MLA Train RMSE Mean",
        "MLA Test RMSE Mean",
        "MLA Train R2 Mean",
        "MLA Test R2 Mean",
        "MLA Time",
    ]
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    # One record per evaluated algorithm; the table is rebuilt from these
    # records after every model so numeric columns keep a numeric dtype.
    MLA_records = []
    for alg in tqdm(MLA):
        # Preprocessing and estimator are evaluated together so that imputation,
        # scaling and encoding are fitted on the training part of each split only.
        model_pipeline = pipeline.Pipeline(
            steps=[
                ("data_preprocessing_pipeline", data_preprocessing_pipeline),
                ("model", alg),
            ]
        )

        cv_results = model_selection.cross_validate(
            model_pipeline,
            X_train,
            y_train,
            cv=cv_split,
            scoring={
                "r2": "r2",
                "neg_root_mean_squared_error": "neg_root_mean_squared_error",
            },
            return_train_score=True,
        )

        # The scorer returns *negated* RMSE (higher is better); flip the sign so
        # the "RMSE" columns hold the actual, non-negative error.
        MLA_records.append(
            {
                "MLA Name": alg.__class__.__name__,
                "MLA Parameters": str(alg.get_params()),
                "MLA Train RMSE Mean": -cv_results[
                    "train_neg_root_mean_squared_error"
                ].mean(),
                "MLA Test RMSE Mean": -cv_results[
                    "test_neg_root_mean_squared_error"
                ].mean(),
                "MLA Train R2 Mean": cv_results["train_r2"].mean(),
                "MLA Test R2 Mean": cv_results["test_r2"].mean(),
                "MLA Time": cv_results["fit_time"].mean(),
            }
        )
        MLA_compare = pd.DataFrame(MLA_records, columns=MLA_columns)

        # Live-updating leaderboard: lowest test RMSE (best model) first.
        clear_output(wait=True)
        display(MLA_compare.sort_values(by=["MLA Test RMSE Mean"], ascending=True))