# Ames housing price prediction

## Initialization

In [None]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

import mlflow

## Configuration

In [None]:
# Location of the Ames housing CSV that is loaded into the notebook.
DATA_SET_URL = "https://raw.githubusercontent.com/janwillemkl/mlops-hands-on/main/data/ames_housing.csv"

# Input columns grouped by feature type; each group gets its own
# preprocessing pipeline further down.
FEATURES = {
    "nominal": [
        "ms_zoning",
        "lot_shape",
        "land_contour",
    ],
    "ordinal": [
        "land_slope",
        "overall_qual",
        "overall_cond",
    ],
    "numerical": [
        "lot_frontage",
        "lot_area",
        "mas_vnr_area",
    ],
}

# Column the model is trained to predict.
TARGET = "sale_price"

# Seed shared by the train/test split and the estimator.
RANDOM_STATE = 42

# MLflow tracking server address and the experiment that runs are filed under.
MLFLOW_TRACKING_SERVER = "http://localhost:5000"
MLFLOW_EXPERIMENT = "ames-housing"

## Data set (ingestion & preprocessing)

In [None]:
# Load the raw data set from the configured URL and preview its first rows.
raw_data = pd.read_csv(filepath_or_buffer=DATA_SET_URL)
raw_data.head(n=5)

In [None]:
# Columns to keep, in the order nominal -> ordinal -> numerical -> target.
columns = [
    *FEATURES["nominal"],
    *FEATURES["ordinal"],
    *FEATURES["numerical"],
    TARGET,
]

# Restrict the raw data to the selected columns and preview the result.
features = raw_data[columns]
features.head()

## Model training

In [None]:
# Hold out part of the rows for evaluation; the fixed seed keeps the split
# reproducible between notebook runs.
train_data, test_data = train_test_split(features, random_state=RANDOM_STATE)

# Separate the model inputs from the target column for each partition.
train_input, train_output = train_data.drop(columns=[TARGET]), train_data[TARGET]
test_input, test_output = test_data.drop(columns=[TARGET]), test_data[TARGET]

In [None]:
# Ordinal pipeline: fill missing values with the most frequent one, then map
# every category to an integer code.
ordinal_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            # Categories that were not present when the encoder was fitted are
            # encoded as -1 instead of raising at transform time, so scoring or
            # predicting on unseen levels does not crash. This mirrors the
            # nominal encoder's handle_unknown="ignore" below.
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

# Nominal pipeline: fill missing values with the most frequent one, then
# one-hot encode; unknown categories are ignored at transform time.
nominal_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical pipeline: fill missing values with the column mean, then
# standardize.
numerical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("encoder", StandardScaler()),
    ]
)

# Preprocessing: route each configured feature group to its own pipeline.
preprocessing_pipeline = ColumnTransformer(
    [
        ("ordinal_preprocessor", ordinal_pipeline, FEATURES["ordinal"]),
        ("nominal_preprocessor", nominal_pipeline, FEATURES["nominal"]),
        ("numerical_preprocessor", numerical_pipeline, FEATURES["numerical"]),
    ]
)

# Estimator: preprocessing followed by a gradient boosting regressor seeded
# with RANDOM_STATE.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessing_pipeline),
        ("estimator", GradientBoostingRegressor(n_estimators=250, random_state=RANDOM_STATE)),
    ]
)

In [None]:
# MLflow configuration: select the tracking server and the experiment that
# runs from this notebook are recorded under.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_SERVER)
mlflow.set_experiment(experiment_name=MLFLOW_EXPERIMENT)

# Enable scikit-learn autologging before the pipeline is fitted.
mlflow.sklearn.autolog()

In [None]:
# Fit and evaluate the pipeline inside a single MLflow run; scikit-learn
# autologging is switched on in the MLflow configuration cell.
with mlflow.start_run():
    pipeline.fit(train_input, train_output)
    # NOTE(review): the returned score is discarded here; presumably the call
    # is made so that MLflow autologging records it for the run — confirm.
    pipeline.score(test_input, test_output)

## Example prediction

In [None]:
# One hand-written record with a value for every configured feature column.
example_record = {
    "ms_zoning": "RL",
    "lot_shape": "IR1",
    "land_contour": "Lvl",
    "land_slope": "Gtl",
    "overall_qual": 6,
    "overall_cond": 5,
    "lot_frontage": 141.0,
    "lot_area": 31770,
    "mas_vnr_area": 112.0,
}
example = pd.DataFrame([example_record])

# The frame holds a single row, so show the first (only) prediction.
predictions = pipeline.predict(example)
predictions[0]