---

In [1]:

import sys
sys.path.append("../../../")
from utils.mlflow_flow import set_tracking, quick_log_and_register
from utils.utils_yose import load_data, make_features

from ensemble import EnsembleModel
import numpy as np

import joblib

import dagshub
import mlflow
import os

from mlflow.tracking import MlflowClient

from warnings import filterwarnings
from dotenv import load_dotenv

# --- Experiment tracking -----------------------------------------------------
# Point MLflow at the DagsHub-hosted tracking server for this repository.
dagshub.init(repo_owner='Yosesotomayor', repo_name='retoCasas_v2', mlflow=True)

# Pull tracking configuration from the local .env file / process environment.
load_dotenv()
ENDPOINT_URL = os.getenv("MLFLOW_TRACKING_URI")
ALIAS = os.getenv("MODEL_ALIAS")
MODEL_NAME = os.getenv("MODEL_NAME")

# Mirror the DagsHub token into the variable MLflow reads. os.environ only
# accepts strings, so assigning a missing (None) token would raise a TypeError;
# when the token is absent the variable is simply left untouched.
dagshub_token = os.getenv("DAGSHUB_TOKEN")
if dagshub_token:
    os.environ['MLFLOW_TRACKING_TOKEN'] = dagshub_token
# setdefault keeps any value already exported by the user.
os.environ.setdefault("MLFLOW_HTTP_REQUEST_TIMEOUT", "60")
os.environ.setdefault("MLFLOW_HTTP_REQUEST_MAX_RETRIES", "0")

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and pandas copy warnings.
filterwarnings("ignore")
set_tracking(ENDPOINT_URL)

# --- Data ----------------------------------------------------------------------
sub_dir = "../../../data/housing_data/"
df_train, df_test = load_data(sub_dir = sub_dir)

# The target is modelled on the log1p scale.
y = np.log1p(df_train["SalePrice"]).astype(float)
rstate = 42

# Training features: drop the target and the row identifier, then engineer features.
X = df_train.drop(["SalePrice", "Id"], axis=1)
X = make_features(X)

# Test features go through the same feature engineering (no target column to drop).
X_test = df_test.drop(["Id"], axis=1)
X_test = make_features(X_test)

---

In [2]:
# Train the ensemble on every engineered feature; y is log1p(SalePrice).
model = EnsembleModel(rstate=rstate)
# fit() prints cross-validation metrics. The LightGBM log recorded below shows
# a 1314-row fit followed by a 1460-row fit, which looks like a validation
# pass and then a refit on all rows — TODO confirm against EnsembleModel.fit.
# NOTE(review): the recorded "CV RMSE std: 0.0000" suggests a single split
# rather than k-fold — verify.
# As the cell's last expression, the value returned by fit() is displayed.
model.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4472
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 191
[LightGBM] [Info] Start training from score 12.022444
CV RMSE mean: 0.0768
CV RMSE std: 0.0000
CV R2 mean: 0.9549
CV MSE mean: 0.0059
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4573
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 196
[LightGBM] [Info] Start training from score 12.024057


<ensemble.EnsembleModel at 0x1639b7f20>

---

In [3]:
from sklearn.inspection import permutation_importance
import pandas as pd

# Permutation importance of each ensemble member.
# NOTE(review): importances are measured on the same X, y the model was fitted
# on, so they describe the fit rather than held-out performance.
perm = permutation_importance(model.lgbm, X, y, n_repeats=10, random_state=rstate)
sorted_idx = perm.importances_mean.argsort()
perm_2 = permutation_importance(model.elasticnet, X, y, n_repeats=10, random_state=rstate)
sorted_idx_2 = perm_2.importances_mean.argsort()

# Rank each model independently, most important first. Previously both
# rankings were built in ascending order and the whole frame was re-sorted on
# the LightGBM column; the ElasticNet columns ended up descending only as a
# side effect of that row reordering, which breaks down when LightGBM
# importances are tied (sort_values is not stable by default).
rank_lgbm = sorted_idx[::-1]
rank_elnet = sorted_idx_2[::-1]

# Row i holds the i-th most important feature of each model; the two column
# pairs are separate rankings, not a per-feature comparison.
df = pd.DataFrame({
    "feature": X.columns[rank_lgbm],
    "importance": perm.importances_mean[rank_lgbm],
    "feature_elnet": X.columns[rank_elnet],
    "importance_elnet": perm_2.importances_mean[rank_elnet]
})
df.head(15)

Unnamed: 0,feature,importance,feature_elnet,importance_elnet
0,OverallQual_GrLivArea,0.27781,TotalSF,0.225963
1,TotalSF,0.065699,OverallQual,0.053798
2,OverallQual,0.022171,GrLivArea,0.031761
3,GarageCars,0.013117,OverallCond,0.024832
4,OverallCond,0.01288,Neighborhood,0.0241
5,LotArea,0.008121,LotArea,0.018567
6,TotalBath,0.006644,GarageCars,0.014773
7,KitchenQual,0.005369,TotalBsmtSF,0.014485
8,TotalBsmtSF,0.005355,MSZoning,0.011113
9,BsmtFinSF1,0.004667,RoofMatl,0.008311


In [4]:
# Number of top-ranked LightGBM features kept for the reduced model.
N_TOP_FEATURES = 14
top_features = df["feature"].head(N_TOP_FEATURES).tolist()

X_top = X[top_features]
# Fit a fresh ensemble instead of refitting `model` in place: `model` (trained
# on every feature) is what the permutation-importance cell relies on, and
# overwriting it makes that cell give different results when re-run.
model_feature_selection = EnsembleModel(rstate=rstate).fit(X_top, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1687
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 18
[LightGBM] [Info] Start training from score 12.022444
CV RMSE mean: 0.0887
CV RMSE std: 0.0000
CV R2 mean: 0.9399
CV MSE mean: 0.0079
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1692
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 19
[LightGBM] [Info] Start training from score 12.024057


In [5]:
# Score the test set with exactly the columns the reduced model was fitted on.
predictions = model_feature_selection.predict(X_test[top_features])
# NOTE(review): the target was log1p-transformed before fitting; confirm that
# EnsembleModel.predict already maps back to prices, otherwise np.expm1 is
# needed before writing the submission.
df_sub = pd.DataFrame({"Id": df_test["Id"], "SalePrice": predictions})
df_sub.to_csv("../../../data/housing_data_submissions/elnet_lgbm/submission_elnet_lgbm-kaggle-top.csv", index=False)

In [6]:
# Take the 14 highest-ranked LightGBM features, then remove the rows labelled
# 8 and 9 (TotalBsmtSF and BsmtFinSF1 in the ranking recorded above).
# NOTE(review): the reason for dropping these two is not recorded here —
# presumably overlap with TotalSF; confirm.
# drop() returns a new frame, so nothing derived from `df` is mutated in place.
df_top = (
    df.head(14)[['feature', 'importance']]
    .drop(index=[9, 8])
)
df_top

Unnamed: 0,feature,importance
0,OverallQual_GrLivArea,0.27781
1,TotalSF,0.065699
2,OverallQual,0.022171
3,GarageCars,0.013117
4,OverallCond,0.01288
5,LotArea,0.008121
6,TotalBath,0.006644
7,KitchenQual,0.005369
10,GarageArea,0.004576
11,HouseAge,0.004099


In [15]:
# Structural summary (dtypes and non-null counts) of the selected feature columns.
selected_columns = X.loc[:, top_features]
selected_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   OverallQual_GrLivArea  1460 non-null   int64  
 1   TotalSF                1460 non-null   int64  
 2   OverallQual            1460 non-null   int64  
 3   GarageCars             1460 non-null   int64  
 4   OverallCond            1460 non-null   int64  
 5   LotArea                1460 non-null   int64  
 6   TotalBath              1460 non-null   float64
 7   KitchenQual            1460 non-null   int64  
 8   TotalBsmtSF            1460 non-null   int64  
 9   BsmtFinSF1             1460 non-null   int64  
 10  GarageArea             1460 non-null   int64  
 11  HouseAge               1460 non-null   int64  
 12  SaleCondition          1460 non-null   object 
 13  MSZoning               1460 non-null   object 
dtypes: float64(1), int64(11), object(2)
memory usage: 159.8+

In [24]:
# Report how many distinct values each selected feature takes, with a
# separator line before the first entry and after every entry.
separator = "===="
unique_counts = [(col, X[col].nunique()) for col in top_features]

print(separator)
for col, n_unique in unique_counts:
    print(col, ":", n_unique)
    print(separator)

====
OverallQual_GrLivArea : 1133
====
TotalSF : 963
====
OverallQual : 10
====
GarageCars : 5
====
OverallCond : 9
====
LotArea : 1073
====
TotalBath : 10
====
KitchenQual : 4
====
TotalBsmtSF : 721
====
BsmtFinSF1 : 637
====
GarageArea : 441
====
HouseAge : 122
====
SaleCondition : 6
====
MSZoning : 5
====


In [27]:
# Show three example values per selected feature.
# Seeding the draw makes the cell reproducible on re-run; because the same
# seed is used for every column, the same three rows are shown throughout.
for col in top_features:
    # sample values
    print("=====")
    print(col, ":")
    print(X[col].sample(3, random_state=rstate))
print("=====")

=====
OverallQual_GrLivArea :
1113     5040
85      19336
281      7572
Name: OverallQual_GrLivArea, dtype: int64
=====
TotalSF :
2       2706
733     2008
1264    2742
Name: TotalSF, dtype: int64
=====
OverallQual :
99      4
1186    3
796     6
Name: OverallQual, dtype: int64
=====
GarageCars :
927    2
137    3
576    1
Name: GarageCars, dtype: int64
=====
OverallCond :
334     5
1434    5
1209    5
Name: OverallCond, dtype: int64
=====
LotArea :
519    10918
264     5232
612    11885
Name: LotArea, dtype: int64
=====
TotalBath :
388     2.0
614     2.0
1098    1.0
Name: TotalBath, dtype: float64
=====
KitchenQual :
391    4
170    3
901    3
Name: KitchenQual, dtype: int64
=====
TotalBsmtSF :
313     2136
1225     588
1270    1332
Name: TotalBsmtSF, dtype: int64
=====
BsmtFinSF1 :
978     552
1045      0
579       0
Name: BsmtFinSF1, dtype: int64
=====
GarageArea :
1194    299
1171    433
611     564
Name: GarageArea, dtype: int64
=====
HouseAge :
656    49
228    43
343     3
Name

In [18]:
# Summary statistics of the selected features, one row per feature.
summary_stats = X.loc[:, top_features].describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
OverallQual_GrLivArea,1460.0,9673.956164,5186.744876,334.0,5790.0,8820.0,12180.0,56420.0
TotalSF,1460.0,2567.04863,821.714421,334.0,2009.5,2474.0,3004.0,11752.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
GarageCars,1460.0,1.767123,0.747315,0.0,1.0,2.0,2.0,4.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
TotalBath,1460.0,2.210616,0.785399,1.0,2.0,2.0,2.5,6.0
KitchenQual,1460.0,3.511644,0.66376,2.0,3.0,3.0,4.0,5.0
TotalBsmtSF,1460.0,1057.429452,438.705324,0.0,795.75,991.5,1298.25,6110.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [7]:
# NOTE(review): this whole cell is disabled. When enabled it would refit on the
# hand-pruned `df_top` feature list, write a "final" Kaggle submission and
# pickle both sub-models. It calls fit() on the existing `model` object rather
# than creating a new one, so enabling it overwrites that model's state.
# selected_features = df_top["feature"].tolist()
# X_final = X[selected_features]
# X_test_final = X_test[selected_features]
# model_final = model.fit(X_final, y)

# predictions = model_final.predict(X_test_final)
# df_sub = pd.DataFrame({"Id": df_test["Id"], "SalePrice": predictions})
# df_sub.to_csv("../../../data/housing_data_submissions/elnet_lgbm/submission_elnet_lgbm-kaggle-final.csv", index=False)

# joblib.dump(model_final.elasticnet, "./model/elasticnet.pkl")
# joblib.dump(model_final.lgbm, "./model/lgbm.pkl")


---

In [8]:
# NOTE(review): this whole cell is disabled. When enabled it would log and
# register the final model through quick_log_and_register. It reads
# `model_final`, `X_final` and `X_test_final`, which are only assigned in the
# commented-out cell above, and it repeats that cell's two joblib.dump calls.
# It also hard-codes `model.rmse_std = 0.0` and reassigns MODEL_NAME, which is
# otherwise read from the environment.
# dagshub.init(repo_owner = "Yosesotomayor", repo_name = "retoCasas_v2", mlflow=True)

# model.rmse_std = 0.0
# metrics = model.get_metrics()

# joblib.dump(model_final.elasticnet, "./model/elasticnet.pkl")
# joblib.dump(model_final.lgbm, "./model/lgbm.pkl")

# MODEL_NAME = "MODELO_APP"

# quick_log_and_register(
#     experiment="Housing_Competition",
#     run_name="MODELO_APP",
#     model=model_final,
#     X=X_final, y=y,
#     model_name=MODEL_NAME,
#     set_challenger=True,
#     X_test=X_test_final,
#     params=model_final.get_params(),
#     metrics=model_final.get_metrics(),
#     tags={"model_final": "elnet_lgbm",
#           'rmse': model_final.rmse
#           },
#     artifacts = {
#         "elnet": "./model/elasticnet.pkl",
#         "lgbm" : "./model/lgbm.pkl"
#     }
# )


---

In [9]:
"""
print("[URIs]")
print("Tracking URI :", mlflow.get_tracking_uri())
print("Registry URI :", mlflow.get_registry_uri())
print("MLFLOW_TRACKING_TOKEN:", bool(os.getenv("MLFLOW_TRACKING_TOKEN")))
print("DAGSHUB_TOKEN        :", bool(os.getenv("DAGSHUB_TOKEN")))
print("MLFLOW_TRACKING_USERNAME:", os.getenv("MLFLOW_TRACKING_USERNAME"))
print()

client = MlflowClient()

mv = client.get_model_version_by_alias(MODEL_NAME, ALIAS)
run_id = mv.run_id
print(f"[Modelo] {MODEL_NAME}@{ALIAS} -> version={mv.version}")
print("source           :", mv.source)
print("storage_location :", getattr(mv, "storage_location", None))
"""

'\nprint("[URIs]")\nprint("Tracking URI :", mlflow.get_tracking_uri())\nprint("Registry URI :", mlflow.get_registry_uri())\nprint("MLFLOW_TRACKING_TOKEN:", bool(os.getenv("MLFLOW_TRACKING_TOKEN")))\nprint("DAGSHUB_TOKEN        :", bool(os.getenv("DAGSHUB_TOKEN")))\nprint("MLFLOW_TRACKING_USERNAME:", os.getenv("MLFLOW_TRACKING_USERNAME"))\nprint()\n\nclient = MlflowClient()\n\nmv = client.get_model_version_by_alias(MODEL_NAME, ALIAS)\nrun_id = mv.run_id\nprint(f"[Modelo] {MODEL_NAME}@{ALIAS} -> version={mv.version}")\nprint("source           :", mv.source)\nprint("storage_location :", getattr(mv, "storage_location", None))\n'

---