In [25]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from src.evaluation.evaluation import calculate_metrics
# Show floats with two decimal places whenever pandas renders a frame or series.
pd.set_option("display.float_format", "{:.2f}".format)
import numpy as np

In [2]:
# Read the train and test tables from output_data/ (paths are relative to the
# notebook's working directory). Train is read first, then test.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (r'output_data/df_train_util.csv', r'output_data/df_test_util.csv')
)

In [41]:
# Compare the `precio` column with the `precio_mean_barrio` column on the test set.
# NOTE(review): presumably these are passed as (actual, predicted) and
# `precio_mean_barrio` acts as a baseline prediction; confirm against
# calculate_metrics in src/evaluation/evaluation.py.
metrics_df, df_test_with_metrics = calculate_metrics(df_test['precio'], df_test['precio_mean_barrio'], df_test)
# Use a forward slash, like the input paths in this notebook: a backslash is only
# a path separator on Windows, so elsewhere the file would be created in the
# working directory under the literal name "output_data\evaluation.csv".
metrics_df.transpose().to_csv(r'output_data/evaluation.csv')

In [3]:
from typing import Any, List, Optional
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def get_pipeline(
    base_model: Any,
    impute: bool,
    scale: bool,
    encode: bool,
    impute_model_cat: Any = None,
    impute_model_num: Any = None,
    scale_model: Any = None,
    encode_model: Any = None,
    cat_features: Optional[List[str]] = None,
    num_features: Optional[List[str]] = None,
) -> Pipeline:
    """
    Build a sklearn pipeline for preprocessing and modeling.

    Numerical and categorical features each get their own sub-pipeline; both are
    combined in a ColumnTransformer named "preprocessor", followed by the base
    model as the final "model" step.

    :param base_model: Base model to be added at the end of the pipeline.
    :param impute: Indicates whether missing values imputation should be performed.
    :param scale: Indicates whether numerical feature scaling should be performed.
    :param encode: Indicates whether categorical feature encoding should be performed.
    :param impute_model_cat: Imputation model for categorical features.
      When None, a new SimpleImputer with strategy "most_frequent" is created.
    :param impute_model_num: Imputation model for numerical features.
      When None, a new SimpleImputer with strategy "median" is created.
    :param scale_model: Scaling model for numerical features.
      When None, a new StandardScaler is created.
    :param encode_model: Encoding model for categorical features.
      When None, a new dense OneHotEncoder with "ignore" handling for unknown
      values is created.
    :param cat_features: List of names of categorical features.
      Used to apply specific transformations to these features.
      None is treated as "no categorical features".
    :param num_features: List of names of numerical features.
      Used to apply specific transformations to these features.
      None is treated as "no numerical features".
    :return: Sklearn pipeline that includes preprocessing steps and the base model.

    """
    # ColumnTransformer needs a real column selection; normalise the None
    # defaults to empty lists so a call that omits one feature group still
    # produces a usable pipeline.
    cat_features = [] if cat_features is None else cat_features
    num_features = [] if num_features is None else num_features

    # Build default transformers per call. Estimator instances used as default
    # arguments would be shared by every pipeline created with defaults, so a
    # parameter change made through one pipeline would silently affect the rest.
    if impute_model_cat is None:
        impute_model_cat = SimpleImputer(strategy="most_frequent")
    if impute_model_num is None:
        impute_model_num = SimpleImputer(strategy="median")
    if scale_model is None:
        scale_model = StandardScaler()
    if encode_model is None:
        encode_model = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    # Collect the steps for each feature group. Order matters inside a group:
    # imputation runs before scaling / encoding.
    cat_transformers = []
    num_transformers = []
    if impute and cat_features:
        cat_transformers.append(("imputer_cat", impute_model_cat))
    if impute and num_features:
        num_transformers.append(("imputer_num", impute_model_num))
    if scale and num_features:
        num_transformers.append(("scaler", scale_model))
    if encode and cat_features:
        cat_transformers.append(("encoder", encode_model))

    # A sklearn Pipeline cannot be built with zero steps, so fall back to a
    # no-op step when nothing was requested for a group.
    if not cat_transformers:
        cat_transformers.append(("passthrough", "passthrough"))
    if not num_transformers:
        num_transformers.append(("passthrough", "passthrough"))

    numeric_transformer = Pipeline(
        steps=num_transformers,
    )
    categorical_transformer = Pipeline(
        steps=cat_transformers,
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_features),
            ("cat", categorical_transformer, cat_features),
        ],
        # Keep the original column names instead of prefixing them with "num__" / "cat__".
        verbose_feature_names_out=False,
    )

    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", base_model)])

    return pipeline