In [1]:
import sys
from pathlib import Path

import hydra
from hydra import compose, initialize
from omegaconf import DictConfig
import polars as pl
from prefect import flow, get_run_logger
from sklearn.linear_model import LinearRegression
import mlflow
import mlflow.sklearn
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnx import save_model as save_model_onnx

In [16]:
pl.Config.set_tbl_formatting('NOTHING')
pl.Config.set_float_precision(3)
pl.Config.set_fmt_str_lengths(20)
pl.Config.set_thousands_separator(True)
pl.Config.set_tbl_hide_column_data_types(False)
pl.Config.save_to_file("/Users/zacklarsen/Documents/Projects/kaggle/kaggle-paris-housing-prices/src/config/pl_config.json")

In [17]:
pl.Config.load_from_file("/Users/zacklarsen/Documents/Projects/kaggle/kaggle-paris-housing-prices/src/config/pl_config.json")

polars.config.Config

In [2]:
with initialize(version_base="1.3.2",
                config_path="../src/config",
                job_name="run_flow"):
    cfg = compose(config_name="main")

In [3]:
cfg['run']

{'keep_columns': ['Income', 'Recency', 'NumWebVisitsMonth', 'Complain', 'age', 'total_purchases', 'enrollment_years', 'family_size'], 'remove_outliers_threshold': {'age': 90, 'income': 600000}}

In [4]:
X_train_path = cfg.paths.data.X_train
y_train_path = cfg.paths.data.y_train
X_train = pl.read_parquet(X_train_path)
y_train = pl.read_parquet(y_train_path)

In [5]:
X_train.shape

(17032, 21)

In [6]:
y_train.shape

(17032, 1)

In [18]:
X_train

age,attic,attic_z_score_abs,basement,basement_z_score_abs,cityCode,cityPartRange,floors,floors_z_score_abs,garage,garage_z_score_abs,hasGuestRoom,hasPool,hasStorageRoom,hasStormProtector,hasYard,isNewBuilt,numPrevOwners,numberOfRooms,squareMeters,squareMeters_z_score_abs
i64,i64,f64,i64,f64,i64,i64,i64,f64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64
23,7652,0.834,111,1.698,12300,5,42,0.111,940,1.490,9,0,1,1,1,1,10,6,29570,0.343
23,6386,0.426,7438,0.707,55157,4,87,0.831,576,0.166,0,0,1,1,0,0,10,21,99227,1.063
17,2909,0.697,3726,0.511,68416,5,60,0.266,838,1.119,5,0,0,0,0,1,9,86,51744,0.104
16,9349,1.382,438,1.590,69322,5,49,0.036,516,0.053,6,0,0,0,0,1,7,78,84387,0.763
30,1227,1.240,2272,0.988,10826,6,79,0.663,558,0.100,8,0,1,0,1,1,5,25,7788,0.783
25,6431,0.440,1555,1.224,47070,5,39,0.174,573,0.155,1,0,1,0,0,1,9,22,43215,0.068
14,810,1.375,274,1.644,85121,8,49,0.036,278,0.919,4,1,1,1,1,1,2,24,76231,0.599
5,7505,0.787,1244,1.326,56506,6,18,0.613,551,0.075,10,0,1,0,0,1,6,18,9329,0.752
24,1841,1.042,815,1.466,75957,10,37,0.216,433,0.355,6,0,0,1,0,1,3,84,26089,0.414
9,2336,0.882,8889,1.183,19270,4,60,0.266,533,0.009,7,1,0,0,0,0,2,76,61761,0.307


In [7]:
X_train.glimpse()

Rows: 17032
Columns: 21
$ age                      <i64> 23, 23, 17, 16, 30, 25, 14, 5, 24, 9
$ attic                    <i64> 7652, 6386, 2909, 9349, 1227, 6431, 810, 7505, 1841, 2336
$ attic_z_score_abs        <f64> 0.834380303335952, 0.42565303801806165, 0.6968941195730165, 1.3822556344927135, 1.2399267106193763, 0.44018125834926625, 1.3745548856885395, 0.7869214502540169, 1.0416972154336066, 0.8818867917903556
$ basement                 <i64> 111, 7438, 3726, 438, 2272, 1555, 274, 1244, 815, 8889
$ basement_z_score_abs     <f64> 1.6975248822235078, 0.7070416790250115, 0.5111582766668821, 1.5902103925922446, 0.9883303498285235, 1.2236345977356053, 1.6440317268415632, 1.3256982254888863, 1.4664869595922867, 1.1832292155845312
$ cityCode                 <i64> 12300, 55157, 68416, 69322, 10826, 47070, 85121, 56506, 75957, 19270
$ cityPartRange            <i64> 5, 4, 5, 5, 6, 5, 8, 6, 10, 4
$ floors                   <i64> 42, 87, 60, 49, 79, 39, 49, 18, 37, 60
$ floors_z_score_abs     

In [19]:
# mlflow.set_tracking_uri("sqlite:///mlruns/mlflow.db")
mlflow.set_tracking_uri('file:///Users/zacklarsen/Documents/Projects/kaggle/kaggle-paris-housing-prices/mlruns')
mlflow.set_experiment("Kaggle Paris Housing")
with mlflow.start_run(run_name="LR_model"):
    mlflow.sklearn.autolog()
    model = LinearRegression()
    model.fit(X_train.to_pandas(), y_train.to_pandas())
    # mlflow.sklearn.log_model(model, "LR_model")  # TODO: Figure out if this is necessary; does the autolog log the model as well?
mlflow.end_run()

2023/11/26 09:08:31 INFO mlflow.tracking.fluent: Experiment with name 'Kaggle Paris Housing' does not exist. Creating a new experiment.
