In [1]:
import sys
from pathlib import Path

import hydra
from hydra import compose, initialize
from omegaconf import DictConfig
import polars as pl
import altair as alt
from prefect import flow, get_run_logger
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import mlflow
import mlflow.sklearn

In [2]:
# Polars display options used for every table rendered in this notebook.
pl.Config.set_tbl_formatting('NOTHING')
pl.Config.set_float_precision(3)
pl.Config.set_fmt_str_lengths(20)
pl.Config.set_thousands_separator(True)
pl.Config.set_tbl_hide_column_data_types(False)

# Persist the options as JSON alongside the Hydra configs. The path is relative
# (the same "../src/config" directory the Hydra initialize() call points at)
# rather than a user-specific absolute path, so the notebook is not tied to
# one machine.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder, as the relative Hydra config_path already implies — confirm.
pl.Config.save_to_file(Path("../src/config/pl_config.json"))

In [3]:
# Reload the saved Polars display options. A relative path (the same
# "../src/config" directory the Hydra initialize() call points at) replaces the
# user-specific absolute path so the notebook is not tied to one machine.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder — confirm.
pl.Config.load_from_file(Path("../src/config/pl_config.json"))

polars.config.Config

In [4]:
# Compose the "main" config from the ../src/config directory inside a Hydra
# initialize() context; the result is kept in `cfg` for the rest of the notebook.
with initialize(
    version_base="1.3.2",
    config_path="../src/config",
    job_name="run_flow",
):
    cfg = compose(config_name="main")

In [5]:
# Display the composed config to inspect its full contents.
cfg

{'run': {'model': {'type': 'RandomForestRegressor', 'parameters': {'max_depth': 2, 'n_estimators': 100, 'random_state': 0}}}, 'target_var': 'price', 'train_test_split': {'random_state': 42, 'train_ratio': 0.75, 'validation_ratio': 0.125, 'test_ratio': 0.125}, 'base_path': '/Users/zacklarsen/Documents/Projects/kaggle', 'competition_path': '${base_path}/kaggle-paris-housing-prices', 'data_path': '${base_path}/data/paris_housing_prices', 'raw_data_path': '${data_path}/raw', 'processed_data_path': '${data_path}/processed', 'model_path': '${competition_path}/models', 'paths': {'mlflow': {'mlruns': '${competition_path}/mlruns'}, 'data': {'train_raw': '${raw_data_path}/train.csv', 'test_raw': '${raw_data_path}/test.csv', 'train_typed': '${processed_data_path}/train_typed.parquet', 'test_typed': '${processed_data_path}/test_typed.parquet', 'train_clean': '${processed_data_path}/train_clean.parquet', 'X_train': '${processed_data_path}/X_train.parquet', 'X_train_transformed': '${processed_data_p

In [6]:
# Name of the target variable as set in the config.
cfg.target_var

'price'

In [7]:
# Split settings (random state and train/validation/test ratios) from the config.
cfg.train_test_split

{'random_state': 42, 'train_ratio': 0.75, 'validation_ratio': 0.125, 'test_ratio': 0.125}

In [8]:
# The `run` section of the config, which holds the model selection.
cfg.run

{'model': {'type': 'RandomForestRegressor', 'parameters': {'max_depth': 2, 'n_estimators': 100, 'random_state': 0}}}

In [9]:
# Configured model type, read with bracket (dict-style) indexing at every level.
cfg['run']['model']['type']

'RandomForestRegressor'

In [10]:
# Configured model type, read by mixing bracket indexing with attribute access.
cfg['run'].model.type

'RandomForestRegressor'

In [11]:
# Configured model type, read with attribute access at every level.
cfg.run.model.type

'RandomForestRegressor'

In [12]:
# Turn the configured model-type string into the object of that name by
# evaluating it as a Python expression in the notebook namespace.
# NOTE(review): eval() executes whatever text the config contains; this is only
# acceptable while the config file is fully trusted.
eval(cfg.run.model.type)

sklearn.ensemble._forest.RandomForestRegressor

In [13]:
# Resolve the model name from the config to one of the estimator classes
# imported at the top of the notebook. An explicit lookup table is used instead
# of eval(), so config text is never executed as code and an unsupported name
# fails with a clear error message.
SUPPORTED_MODELS = {
    estimator.__name__: estimator
    for estimator in (LinearRegression, RandomForestRegressor, SVR)
}
requested_model = cfg.run.model.type
if requested_model not in SUPPORTED_MODELS:
    raise ValueError(
        f"Unsupported model type {requested_model!r}; "
        f"expected one of {sorted(SUPPORTED_MODELS)}"
    )
model_type = SUPPORTED_MODELS[requested_model]

# Keyword arguments for the model constructor; falls back to {} when the
# config's model section has no 'parameters' entry.
parameters = cfg.run.model.get('parameters', {})

In [14]:
# Show the class that the configured model type resolved to.
model_type

sklearn.ensemble._forest.RandomForestRegressor

In [15]:
# Show the constructor parameters read from the config.
parameters

{'max_depth': 2, 'n_estimators': 100, 'random_state': 0}

In [16]:
# Instantiate the configured model, passing the config parameters as keyword arguments.
model = model_type(**parameters)

In [17]:
# Display the instantiated model.
model

In [None]:
# Look up the parquet locations of the training features and target in the
# config, then read both files into Polars DataFrames.
data_paths = cfg.paths.data
X_train_path, y_train_path = data_paths.X_train, data_paths.y_train
X_train, y_train = pl.read_parquet(X_train_path), pl.read_parquet(y_train_path)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the training target frame.
y_train.shape

In [None]:
# Display the training target frame.
y_train

In [None]:
# Lift Altair's row limit so the full target frame can be charted.
alt.data_transformers.disable_max_rows()
# alt.data_transformers.enable("vegafusion")

# Histogram of the target: binned 'price' on the x-axis, record count on the y-axis.
price_histogram = (
    alt.Chart(y_train.to_pandas())
    .mark_bar()
    .encode(x=alt.X('price', bin=True), y=alt.Y('count()'))
)
price_histogram

In [None]:
# Display the training features frame.
X_train

In [None]:
# Print a compact per-column preview of the training features.
X_train.glimpse()

In [None]:
# mlflow.set_tracking_uri('file:///Users/zacklarsen/Documents/Projects/kaggle/kaggle-paris-housing-prices/mlruns')
# mlflow.set_experiment("Kaggle Paris Housing")
# with mlflow.start_run(run_name="LR_model"):
#     mlflow.sklearn.autolog()
#     model = LinearRegression()
#     model.fit(X_train.to_pandas(), y_train.to_pandas())
#     mlflow.sklearn.log_model(model, "LR_model")
# mlflow.end_run()