---

In [1]:

import sys
sys.path.append("../../../")
from utils.mlflow_flow import set_tracking, quick_log_and_register
from utils.utils_yose import load_data, make_features

from ensemble import EnsembleModel
import numpy as np

import joblib

import dagshub
import mlflow
import os

from mlflow.tracking import MlflowClient

from warnings import filterwarnings
from dotenv import load_dotenv

# Point MLflow at the DagsHub-hosted tracking server for this repository.
dagshub.init(repo_owner='Yosesotomayor', repo_name='retoCasas_v2', mlflow=True)

# Load settings from a local .env file into the process environment, then
# read the values the rest of the notebook needs. Each is None when unset.
load_dotenv()
ENDPOINT_URL = os.getenv("MLFLOW_TRACKING_URI")
ALIAS = os.getenv("MODEL_ALIAS")
MODEL_NAME = os.getenv("MODEL_NAME")

# os.environ only accepts str values: assigning the None returned by
# os.getenv for a missing variable raises TypeError. Forward the token only
# when it is actually defined.
dagshub_token = os.getenv("DAGSHUB_TOKEN")
if dagshub_token is not None:
    os.environ['MLFLOW_TRACKING_TOKEN'] = dagshub_token

# setdefault keeps any value already exported by the user or the .env file.
os.environ.setdefault("MLFLOW_HTTP_REQUEST_TIMEOUT", "60")
os.environ.setdefault("MLFLOW_HTTP_REQUEST_MAX_RETRIES", "0")

# NOTE(review): this silences every warning for the whole session, including
# pandas/sklearn deprecation and copy warnings. Consider narrowing the filter.
filterwarnings("ignore")
set_tracking(ENDPOINT_URL)

# Location of the housing data, relative to this notebook.
sub_dir = "../../../data/housing_data/"
df_train, df_test = load_data(sub_dir=sub_dir)

# Seed reused by the estimator and the permutation-importance calls below.
rstate = 42

# Target: log1p of the sale price, cast to float.
y = np.log1p(df_train["SalePrice"]).astype(float)

# Feature matrices: drop the target / identifier columns, then run the shared
# feature-engineering helper on train and test alike.
X = make_features(df_train.drop(columns=["SalePrice", "Id"]))
X_test = make_features(df_test.drop(columns=["Id"]))

---

In [3]:
# Build the ensemble with the shared seed and fit it on the full engineered
# feature matrix against the log1p target.
model = EnsembleModel(rstate=rstate)
# Left as the cell's last expression, so the value returned by fit() is
# displayed below the training log.
model.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4472
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 191
[LightGBM] [Info] Start training from score 12.022444
CV RMSE mean: 0.0768
CV RMSE std: 0.0000
CV R2 mean: 0.9549
CV MSE mean: 0.0059
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4573
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 196
[LightGBM] [Info] Start training from score 12.024057


<ensemble.EnsembleModel at 0x148bc6ab0>

---

In [4]:
from sklearn.inspection import permutation_importance
import pandas as pd

# Permutation importance of each fitted sub-model, evaluated on the same data
# the models were trained on (X, y).
perm = permutation_importance(model.lgbm, X, y, n_repeats=10, random_state=rstate)
sorted_idx = perm.importances_mean.argsort()
perm_2 = permutation_importance(model.elasticnet, X, y, n_repeats=10, random_state=rstate)
sorted_idx_2 = perm_2.importances_mean.argsort()

# Reverse each ascending ranking so both column pairs are laid out from most
# to least important. Sorting the assembled frame on the LightGBM column
# instead would only keep the elastic-net columns ordered if that sort were an
# exact row reversal, which is not guaranteed when importances tie.
lgbm_rank = sorted_idx[::-1]
elnet_rank = sorted_idx_2[::-1]

# Row k holds the k-th ranked feature of EACH model: the two column pairs are
# independent rankings placed side by side, not per-feature comparisons.
df = pd.DataFrame({
    "feature": X.columns[lgbm_rank],
    "importance": perm.importances_mean[lgbm_rank],
    "feature_elnet": X.columns[elnet_rank],
    "importance_elnet": perm_2.importances_mean[elnet_rank],
})
df.head(15)

Unnamed: 0,feature,importance,feature_elnet,importance_elnet
0,OverallQual_GrLivArea,0.27781,TotalSF,0.225963
1,TotalSF,0.065699,OverallQual,0.053798
2,OverallQual,0.022171,GrLivArea,0.031761
3,GarageCars,0.013117,OverallCond,0.024832
4,OverallCond,0.01288,Neighborhood,0.0241
5,LotArea,0.008121,LotArea,0.018567
6,TotalBath,0.006644,GarageCars,0.014773
7,KitchenQual,0.005369,TotalBsmtSF,0.014485
8,TotalBsmtSF,0.005355,MSZoning,0.011113
9,BsmtFinSF1,0.004667,RoofMatl,0.008311


In [None]:
# The 14 features ranked highest by LightGBM permutation importance.
top_features = df["feature"].head(14).tolist()

X_top = X[top_features]

# Fit a separate estimator for this experiment. Calling model.fit() here would
# refit the full-feature `model` in place, so the permutation-importance cell
# (which pairs model.lgbm with the full X) could no longer be re-run, and
# every "new" model name would just be another reference to the same object.
model_feature_selection = EnsembleModel(rstate=rstate)
model_feature_selection.fit(X_top, y);  # trailing ';' hides the returned object

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1723
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 36
[LightGBM] [Info] Start training from score 12.022444
CV RMSE mean: 0.0902
CV RMSE std: 0.0000
CV R2 mean: 0.9379
CV MSE mean: 0.0081
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1730
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 38
[LightGBM] [Info] Start training from score 12.024057


In [7]:
# Score the test set with exactly the columns the model was fitted on
# (X_top = X[top_features]), mirroring how the final model is scored.
predictions = model_feature_selection.predict(X_test[top_features])

# NOTE(review): the target was trained as log1p(SalePrice). Confirm that
# EnsembleModel.predict already maps predictions back to price units before
# submitting this file.
df_sub = pd.DataFrame({"Id": df_test["Id"], "SalePrice": predictions})

submission_path = "../../../data/housing_data_submissions/elnet_lgbm/submission_elnet_lgbm-kaggle-top.csv"
# Create the target folder on a fresh checkout instead of failing in to_csv.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df_sub.to_csv(submission_path, index=False)

In [14]:
# Start from the 14 top-ranked LightGBM features and remove the rows labelled
# 8 and 9. drop() returns a new frame, so nothing derived from `df` is mutated
# in place and re-running the cell always yields the same result.
# NOTE(review): the labels 8 and 9 depend on the current ranking. Dropping by
# feature name would survive a change in the importance order.
df_top = df.head(14)[['feature', 'importance']].drop(index=[8, 9])
df_top

Unnamed: 0,feature,importance
0,OverallQual_GrLivArea,0.27781
1,TotalSF,0.065699
2,OverallQual,0.022171
3,GarageCars,0.013117
4,OverallCond,0.01288
5,LotArea,0.008121
6,TotalBath,0.006644
7,KitchenQual,0.005369
10,GarageArea,0.004576
11,HouseAge,0.004099


In [25]:
# Final feature subset: the names kept in df_top.
selected_features = df_top["feature"].tolist()
X_final = X[selected_features]
X_test_final = X_test[selected_features]

# Train a dedicated estimator rather than refitting `model` in place, so the
# full-feature model and the earlier experiment stay intact on re-runs.
model_final = EnsembleModel(rstate=rstate)
model_final.fit(X_final, y)

# NOTE(review): the target was trained as log1p(SalePrice). Confirm that
# EnsembleModel.predict already maps predictions back to price units.
predictions = model_final.predict(X_test_final)
df_sub = pd.DataFrame({"Id": df_test["Id"], "SalePrice": predictions})

submission_path = "../../../data/housing_data_submissions/elnet_lgbm/submission_elnet_lgbm-kaggle-final.csv"
# Make sure both output folders exist before writing into them.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df_sub.to_csv(submission_path, index=False)

os.makedirs("./model", exist_ok=True)
# Persist the two fitted sub-models separately.
joblib.dump(model_final.elasticnet, "./model/elasticnet.pkl")
joblib.dump(model_final.lgbm, "./model/lgbm.pkl")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1177
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 16
[LightGBM] [Info] Start training from score 12.022444
CV RMSE mean: 0.0950
CV RMSE std: 0.0000
CV R2 mean: 0.9310
CV MSE mean: 0.0090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1182
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 17
[LightGBM] [Info] Start training from score 12.024057


['./model/lgbm.pkl']

In [30]:
import json

# Write the fitted ensemble's `weights` attribute next to the pickled
# sub-models.
with open('./model/weights.json', 'w') as f:
    json.dump(model_final.weights, f)

# Only a cell's last expression is rendered, so the inspection of the weights
# sits at the end rather than in the middle where its value was discarded.
model_final.weights


---

In [24]:
# DISABLED CELL: everything below is commented out and does not run.
# When enabled it would dump the two sub-models again and call
# quick_log_and_register for the final model under the name "MODELO_APP".
#
# dagshub.init(repo_owner = "Yosesotomayor", repo_name = "retoCasas_v2", mlflow=True)

# model.rmse_std = 0.0
# metrics = model.get_metrics()

# joblib.dump(model_final.elasticnet, "./model/elasticnet.pkl")
# joblib.dump(model_final.lgbm, "./model/lgbm.pkl")

# MODEL_NAME = "MODELO_APP"

# quick_log_and_register(
#     experiment="Housing_Competition",
#     run_name="MODELO_APP",
#     model=model_final,
#     X=X_final, y=y,
#     model_name=MODEL_NAME,
#     set_challenger=True,
#     X_test=X_test_final,
#     params=model_final.get_params(),
#     metrics=model_final.get_metrics(),
#     tags={"model_final": "elnet_lgbm",
#           'rmse': model_final.rmse
#           },
#     artifacts = {
#         "elnet": "./model/elasticnet.pkl",
#         "lgbm" : "./model/lgbm.pkl"
#     }
# )


---

In [None]:
"""
print("[URIs]")
print("Tracking URI :", mlflow.get_tracking_uri())
print("Registry URI :", mlflow.get_registry_uri())
print("MLFLOW_TRACKING_TOKEN:", bool(os.getenv("MLFLOW_TRACKING_TOKEN")))
print("DAGSHUB_TOKEN        :", bool(os.getenv("DAGSHUB_TOKEN")))
print("MLFLOW_TRACKING_USERNAME:", os.getenv("MLFLOW_TRACKING_USERNAME"))
print()

client = MlflowClient()

mv = client.get_model_version_by_alias(MODEL_NAME, ALIAS)
run_id = mv.run_id
print(f"[Modelo] {MODEL_NAME}@{ALIAS} -> version={mv.version}")
print("source           :", mv.source)
print("storage_location :", getattr(mv, "storage_location", None))
"""

'\nprint("[URIs]")\nprint("Tracking URI :", mlflow.get_tracking_uri())\nprint("Registry URI :", mlflow.get_registry_uri())\nprint("MLFLOW_TRACKING_TOKEN:", bool(os.getenv("MLFLOW_TRACKING_TOKEN")))\nprint("DAGSHUB_TOKEN        :", bool(os.getenv("DAGSHUB_TOKEN")))\nprint("MLFLOW_TRACKING_USERNAME:", os.getenv("MLFLOW_TRACKING_USERNAME"))\nprint()\n\nclient = MlflowClient()\n\nmv = client.get_model_version_by_alias(MODEL_NAME, ALIAS)\nrun_id = mv.run_id\nprint(f"[Modelo] {MODEL_NAME}@{ALIAS} -> version={mv.version}")\nprint("source           :", mv.source)\nprint("storage_location :", getattr(mv, "storage_location", None))\n'

---