In [14]:
# Install opendatasets (used below to download the Kaggle competition data) into the kernel's environment.
%pip install opendatasets

Note: you may need to restart the kernel to use updated packages.


In [41]:
import os
import mlflow
import tempfile
import opendatasets as od
import pandas as pd
import numpy as np
import pickle as pkl
from random import randint
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA, NMF
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import max_error, explained_variance_score, mean_absolute_error

In [16]:
# Point MLflow at the local tracking server and select the experiment that
# every run in this notebook is recorded under.
tracking_uri = "http://127.0.0.1:5000"
experiment_name = "HousePrice"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1664893595200, experiment_id='1', last_update_time=1664893595200, lifecycle_stage='active', name='HousePrice', tags={}>

## Cargamos los datos

In [17]:
# Fetch the competition files; opendatasets skips the download when the
# target folder already exists (pass force=True to re-download).
dataset_url = "https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data"
od.download(dataset_url)

Skipping, found downloaded files in ".\house-prices-advanced-regression-techniques" (use force=True to force download)


In [18]:
# Load both competition splits from the folder created by the download step.
data_dir = "house-prices-advanced-regression-techniques"
train, test = (pd.read_csv(f"{data_dir}/{split}.csv") for split in ("train", "test"))

## Exploración de los datos

In [19]:
# Summarize the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [20]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## División train/test

In [21]:
# Draw a fresh seed for this session; it is logged with every MLflow run so
# the hold-out split can be reproduced later.
seed = randint(0, 10000000)

# Every column except the target is treated as a feature.
is_feature = train.columns != "SalePrice"
X_train, X_test, y_train, y_test = train_test_split(
    train.loc[:, is_feature],
    train["SalePrice"],
    random_state=seed,
)

# Features of the competition test file (the mask is a no-op if the target column is absent).
X_finaltest = test.loc[:, test.columns != "SalePrice"]

## Creamos la etapa de preprocesamiento

In [22]:
# Column groups are derived from the dtypes of the competition test frame.
categorical_columns = test.select_dtypes(exclude=np.number).columns.tolist()
numerical_columns = test.select_dtypes(include=np.number).columns.tolist()

# Numeric branch: fill missing values with 0, standardize, then project with PCA.
# Step names ("imputer", "ss", "pca") are referenced by the parameter grids.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("ss", StandardScaler()),
        ("pca", PCA()),
    ]
)

# Categorical branch: one-hot encode (categories unseen during fit are ignored
# at transform time), then factorize the non-negative indicator matrix with NMF.
cat_pipeline = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ("nmf", NMF()),
    ]
)

# Route each column group through its branch; the transformer names
# ("categorical", "numerical") are also part of the grid parameter paths.
column_transformer = ColumnTransformer(
    transformers=[
        ("categorical", cat_pipeline, categorical_columns),
        ("numerical", num_pipeline, numerical_columns),
    ]
)

## Instanciamos los modelos

### DecisionTree

In [23]:
# Decision-tree regressor on top of the shared preprocessing stage.
modelo_tree = Pipeline(
    steps=[
        ("prepr", column_transformer),
        ("modelo", DecisionTreeRegressor()),
    ]
)

# Search space: only the dimensionality-reduction sizes are active
# (None keeps every component).
tree_param_grid = {
    "prepr__numerical__pca__n_components": [15, None],
    "prepr__categorical__nmf__n_components": [15, None],
    # Estimator hyperparameters, currently disabled:
    # "modelo__ccp_alpha": [0.0, 0.1, 0.3, 0.5],
    # "modelo__criterion": ["squared_error", "friedman_mse"],
    # "modelo__max_depth": [3, 4, 5, None],
    # "modelo__max_features": ["sqrt", "log2", None],
}


### Support Vector Machine

In [24]:
# Support-vector regressor on top of the shared preprocessing stage.
modelo_svm = Pipeline(
    steps=[
        ("prepr", column_transformer),
        ("modelo", SVR()),
    ]
)

# Search space: dimensionality-reduction sizes plus the SVR hyperparameters.
svm_param_grid = {
    "prepr__numerical__pca__n_components": [15, None],
    "prepr__categorical__nmf__n_components": [15, None],
    "modelo__C": [1.0, 0.5, 0.8, 0.2],
    "modelo__coef0": [0.0, 0.5],
    "modelo__epsilon": [0.1],
    "modelo__gamma": ["scale", "auto"],
    "modelo__kernel": ["poly", "rbf", "sigmoid"],
}

### Random forest

In [25]:
# Random-forest regressor on top of the shared preprocessing stage.
modelo_forest = Pipeline(
    steps=[
        ("prepr", column_transformer),
        ("modelo", RandomForestRegressor()),
    ]
)

# Search space: only the dimensionality-reduction sizes are active
# (None keeps every component).
forest_param_grid = {
    "prepr__numerical__pca__n_components": [15, None],
    "prepr__categorical__nmf__n_components": [15, None],
    # Estimator hyperparameters, currently disabled:
    # "modelo__ccp_alpha": [0.0],
    # "modelo__criterion": ["squared_error", "absolute_error", "poisson"],
    # "modelo__max_features": ["sqrt", "log2", 1.0],
    # "modelo__max_samples": [10, 20, 30, 40, None],
    # NOTE(review): 1 looks invalid for an integer min_samples_split - confirm before enabling.
    # "modelo__min_samples_split": [1, 2, 3],
    # "modelo__n_estimators": [50, 100, 150, 200],
    # "modelo__n_jobs": [-1],
}

## Instanciamos las búsquedas de hiperparámetros

In [26]:
# One randomized search per model, scored by negative mean absolute error.
# Each search variable is paired with the pipeline and grid of the SAME model:
# previously the forest and SVM names were swapped, so `svm_optimization`
# actually tuned the random forest and vice versa.
# `random_state=seed` makes the sampled candidates reproducible from the seed
# that is logged with every MLflow run.
# When a grid has fewer than `n_iter` combinations, scikit-learn evaluates all
# of them (and emits a warning).
tree_optimization = RandomizedSearchCV(
    modelo_tree,
    tree_param_grid,
    n_jobs=-1,
    scoring="neg_mean_absolute_error",
    n_iter=100,
    random_state=seed,
)
forest_optimization = RandomizedSearchCV(
    modelo_forest,
    forest_param_grid,
    n_jobs=-1,
    scoring="neg_mean_absolute_error",
    n_iter=100,
    random_state=seed,
)
svm_optimization = RandomizedSearchCV(
    modelo_svm,
    svm_param_grid,
    n_jobs=-1,
    scoring="neg_mean_absolute_error",
    n_iter=100,
    random_state=seed,
)

## Realizamos los experimentos

In [49]:
def experimento(optimization):
    """Fit a hyperparameter search and record the whole run in MLflow.

    Reads the notebook-level ``seed``, ``X_train``, ``y_train``, ``X_test``
    and ``y_test``.

    Parameters
    ----------
    optimization : RandomizedSearchCV
        Unfitted search whose estimator is a pipeline with a final step named
        ``"modelo"``. It is fitted in place on the training split.

    Logged to MLflow
    ----------------
    params : ``model`` (class name of the ``"modelo"`` step), ``seed`` and
        ``best_params`` (the winning value of every searched parameter).
    metrics : ``best_cv_score`` plus explained variance, max error and mean
        absolute error, each with a ``.train`` and ``.test`` suffix.
    artifact : the fitted search object, stored under ``<model>.pkl``.
    """
    with mlflow.start_run():
        # Take the name from the search that was passed in, not from a global
        # search object; otherwise every run is labelled as the decision tree.
        model_name = type(optimization.estimator.named_steps["modelo"]).__name__
        mlflow.log_param("model", model_name)
        mlflow.log_param("seed", seed)

        optimization.fit(X_train, y_train)

        # Keep only the parameters that were part of the search space.
        fitted_params = optimization.best_estimator_.get_params()
        best_params = {
            name: fitted_params.get(name)
            for name in optimization.param_distributions
        }
        mlflow.log_param("best_params", best_params)

        y_pred_train = optimization.best_estimator_.predict(X_train)
        y_pred_test = optimization.best_estimator_.predict(X_test)

        mlflow.log_metric("best_cv_score", optimization.best_score_)

        # Regression metrics, each logged once for the train and test splits.
        regression_metrics = {
            "explained_variance": explained_variance_score,
            "max_error": max_error,
            "mean_absolute_error": mean_absolute_error,
        }
        for metric_name, metric_fn in regression_metrics.items():
            mlflow.log_metric(metric_name + ".train", metric_fn(y_train, y_pred_train))
            mlflow.log_metric(metric_name + ".test", metric_fn(y_test, y_pred_test))

        mlflow.sklearn.log_model(optimization, model_name + ".pkl")

In [48]:
# Fit the search stored in tree_optimization and log the run to MLflow.
experimento(tree_optimization)



In [55]:
# Fit the search stored in svm_optimization and log the run to MLflow.
experimento(svm_optimization)

                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore')),
                                                                  ('nmf',
                                                                   NMF())]),
                                                  ['MSZoning', 'Street...`


KeyboardInterrupt: 

In [57]:
# Fit the search stored in forest_optimization and log the run to MLflow.
experimento(forest_optimization)

                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore')),
                                                                  ('nmf',
                                                                   NMF())]),
                                                  ['MSZoning', 'Street...`


In [None]:
# TODO: switch from grid search to halving grid search (the searches above currently use RandomizedSearchCV).