In [1]:
import sys
from pathlib import Path

import hydra
from hydra import compose, initialize
from omegaconf import DictConfig
import polars as pl
from prefect import flow, get_run_logger
from sklearn.linear_model import LinearRegression
import mlflow
import mlflow.sklearn
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnx import save_model as save_model_onnx

In [2]:
# Compose the Hydra configuration named "main" from the "../src/config"
# directory; the composed config is kept in `cfg` for the cells below.
with initialize(
    config_path="../src/config",
    job_name="run_flow",
    version_base="1.3.2",
):
    cfg = compose(config_name="main")

In [3]:
# Display the `run` section of the composed configuration.
cfg['run']

{'keep_columns': ['Income', 'Recency', 'NumWebVisitsMonth', 'Complain', 'age', 'total_purchases', 'enrollment_years', 'family_size'], 'remove_outliers_threshold': {'age': 90, 'income': 600000}}

In [4]:
# Look up the parquet locations of the training features and target in the
# config, then read both files into polars DataFrames.
data_paths = cfg.paths.data
X_train_path, y_train_path = data_paths.X_train, data_paths.y_train

X_train = pl.read_parquet(X_train_path)
y_train = pl.read_parquet(y_train_path)

In [5]:
# Dimensions of the training feature frame as (rows, columns).
X_train.shape

(17032, 21)

In [6]:
# Dimensions of the training target frame as (rows, columns); the row count
# should match the feature frame.
y_train.shape

(17032, 1)

In [7]:
# Print a per-column preview (name, dtype, first values) of the training features.
X_train.glimpse()

Rows: 17032
Columns: 21
$ age                      <i64> 23, 23, 17, 16, 30, 25, 14, 5, 24, 9
$ attic                    <i64> 7652, 6386, 2909, 9349, 1227, 6431, 810, 7505, 1841, 2336
$ attic_z_score_abs        <f64> 0.834380303335952, 0.42565303801806165, 0.6968941195730165, 1.3822556344927135, 1.2399267106193763, 0.44018125834926625, 1.3745548856885395, 0.7869214502540169, 1.0416972154336066, 0.8818867917903556
$ basement                 <i64> 111, 7438, 3726, 438, 2272, 1555, 274, 1244, 815, 8889
$ basement_z_score_abs     <f64> 1.6975248822235078, 0.7070416790250115, 0.5111582766668821, 1.5902103925922446, 0.9883303498285235, 1.2236345977356053, 1.6440317268415632, 1.3256982254888863, 1.4664869595922867, 1.1832292155845312
$ cityCode                 <i64> 12300, 55157, 68416, 69322, 10826, 47070, 85121, 56506, 75957, 19270
$ cityPartRange            <i64> 5, 4, 5, 5, 6, 5, 8, 6, 10, 4
$ floors                   <i64> 42, 87, 60, 49, 79, 39, 49, 18, 37, 60
$ floors_z_score_abs     

In [9]:
# Train a baseline linear regression and track it with MLflow.
# Runs are recorded in a SQLite database addressed by a relative URI, so the
# current working directory decides where `mlruns/mlflow.db` is resolved.
mlflow.set_tracking_uri("sqlite:///mlruns/mlflow.db")
mlflow.set_experiment("Kaggle Paris Housing")

# Autologging is global configuration, so enable it before opening the run.
# With its default `log_models=True`, autolog already stores the fitted
# estimator as a run artifact; an explicit `mlflow.sklearn.log_model(...)`
# call would only log the model a second time.
mlflow.sklearn.autolog()

# The context manager ends the run on exit (also when `fit` raises), so no
# trailing `mlflow.end_run()` is needed.
with mlflow.start_run(run_name="LR_model"):
    model = LinearRegression()
    # The polars frames are converted to pandas before being handed to scikit-learn.
    model.fit(X_train.to_pandas(), y_train.to_pandas())

