In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [10]:
import my_preprocessors as mypp #nuestra libraria

In [11]:
# Load the training data from the local CSV file and preview the first rows.
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [12]:
# Cast MSSubClass to object dtype: it is listed in CATEGORICAL_VARS and is
# encoded as a categorical variable by the pipeline, not used as a number.
data['MSSubClass'] = data['MSSubClass'].astype('O')

In [13]:
# Hold out 10% of the rows for testing. 'Id' and the target 'SalePrice' are
# excluded from the feature matrix; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    random_state=2022,
    test_size=0.1,
)

X_train.shape, X_test.shape

((1314, 79), (146, 79))

In [14]:
# Log-transform the target.
# NOTE: this rebinds y_train / y_test, so re-running this cell without first
# re-running the split cell applies the logarithm twice.
y_train, y_test = np.log(y_train), np.log(y_test)

## Configuración del Machine Learning Pipeline

In [42]:
# ---------------------------------------------------------------------------
# Imputation groups
# ---------------------------------------------------------------------------

# Categorical variables with NA, imputed with the 'frequent' method.
CATEGORICAL_VARS_WITH_NA_FREQUENT = [
    "BsmtQual",
    "BsmtExposure",
    "BsmtFinType1",
    "GarageFinish",
]

# Categorical variables with NA, imputed with the 'missing' method.
CATEGORICAL_VARS_WITH_NA_MISSING = ["FireplaceQu"]

# Numerical variables with NA (missing indicator + mean imputation).
NUMERICAL_VARS_WITH_NA = ["LotFrontage"]

# ---------------------------------------------------------------------------
# Temporal variables
# ---------------------------------------------------------------------------

# Temporal variables handed to the custom temporal transformer together with
# the reference variable below.
# NOTE(review): presumably converted to elapsed time relative to REF_VAR;
# confirm in my_preprocessors.
TEMPORAL_VARS = ["YearRemodAdd"]

REF_VAR = "YrSold"

# Variables dropped once the temporal step has used them.
DROP_FEATURES = [REF_VAR]

# ---------------------------------------------------------------------------
# Numerical transformations
# ---------------------------------------------------------------------------

# Variables that get a logarithmic transformation.
NUMERICALS_LOG_VARS = [
    "LotFrontage",
    "1stFlrSF",
    "GrLivArea",
]

# Strongly skewed variables that are binarized.
BINARIZE_VARS = ["ScreenPorch"]

# ---------------------------------------------------------------------------
# Ordinal categorical variables (encoded through explicit mappings)
# ---------------------------------------------------------------------------

QUAL_VARS = [
    "ExterQual",
    "BsmtQual",
    "HeatingQC",
    "KitchenQual",
    "FireplaceQu",
]

EXPOSURE_VARS = ["BsmtExposure"]

FINISH_VARS = ["BsmtFinType1"]

GARAGE_VARS = ["GarageFinish"]

# Not referenced by the pipeline cell visible in this notebook.
FENCE_VARS = ["Fence"]

# ---------------------------------------------------------------------------
# Nominal categorical variables (encoded without a predefined order)
# ---------------------------------------------------------------------------

CATEGORICAL_VARS = [
    "MSSubClass",
    "MSZoning",
    "LotShape",
    "LandContour",
    "LotConfig",
    "Neighborhood",
    "RoofStyle",
    "Exterior1st",
    "Foundation",
    "CentralAir",
    "Functional",
    "PavedDrive",
    "SaleCondition",
]

# ---------------------------------------------------------------------------
# Category -> integer mappings for the ordinal variables
# ---------------------------------------------------------------------------

QUAL_MAPPINGS = {
    "Po": 1,
    "Fa": 2,
    "TA": 3,
    "Gd": 4,
    "Ex": 5,
    "Missing": 0,
    "NA": 0,
}

EXPOSURE_MAPPINGS = {
    "No": 1,
    "Mn": 2,
    "Av": 3,
    "Gd": 4,
}

FINISH_MAPPINGS = {
    "Missing": 0,
    "NA": 0,
    "Unf": 1,
    "LwQ": 2,
    "Rec": 3,
    "BLQ": 4,
    "ALQ": 5,
    "GLQ": 6,
}

GARAGE_MAPPINGS = {
    "Missing": 0,
    "NA": 0,
    "Unf": 1,
    "RFn": 2,
    "Fin": 3,
}

# ---------------------------------------------------------------------------
# Features selected according to the Lasso analysis
# ---------------------------------------------------------------------------

FEATURES = [
    "MSSubClass", "MSZoning", "LotFrontage", "LotShape",
    "LandContour", "LotConfig", "Neighborhood", "OverallQual",
    "OverallCond", "YearRemodAdd", "RoofStyle", "Exterior1st",
    "ExterQual", "Foundation", "BsmtQual", "BsmtExposure",
    "BsmtFinType1", "HeatingQC", "CentralAir", "1stFlrSF",
    "2ndFlrSF", "GrLivArea", "BsmtFullBath", "HalfBath",
    "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces",
    "FireplaceQu", "GarageFinish", "GarageCars", "GarageArea",
    "PavedDrive", "WoodDeckSF", "ScreenPorch", "SaleCondition",
    # Kept here because the temporal step needs it; dropped afterwards.
    "YrSold",
]

In [46]:
# Keep only the selected feature columns for training.
X_train = X_train[FEATURES]

## Machine Learning Pipeline

In [47]:
# End-to-end pipeline: imputation -> temporal handling -> numerical transforms
# -> categorical encodings -> scaling -> Lasso regression.
# Step names are lookup keys on the fitted pipeline, so they are kept exactly
# as they were (including the spelling of 'eslapsed_time').
housePrice_pipeline = Pipeline(steps=[

    # ------------------------------ Imputation ------------------------------

    # 1. Categorical variables: fill NA with the 'missing' method.
    (
        'missing_imputation',
        CategoricalImputer(
            imputation_method='missing',
            variables=CATEGORICAL_VARS_WITH_NA_MISSING,
        ),
    ),

    # 2. Categorical variables: fill NA with the 'frequent' method.
    (
        'frequent_imputation',
        CategoricalImputer(
            imputation_method='frequent',
            variables=CATEGORICAL_VARS_WITH_NA_FREQUENT,
        ),
    ),

    # 3. Add a missing indicator for the numerical variables that will be
    #    imputed in the next step.
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA),
    ),

    # 4. Mean imputation of the numerical variables with NA.
    (
        'mean_imputation',
        MeanMedianImputer(
            imputation_method='mean',
            variables=NUMERICAL_VARS_WITH_NA,
        ),
    ),

    # --------------------------- Temporal variables ---------------------------

    # 5. Custom transformer applied to the temporal variables using REF_VAR.
    #    NOTE(review): behaviour lives in my_preprocessors; presumably it
    #    computes elapsed time against the reference variable. Confirm there.
    (
        'eslapsed_time',
        mypp.TremporalVariableTransformer(
            variables=TEMPORAL_VARS,
            reference_variable=REF_VAR,
        ),
    ),

    # 6. Drop the variables listed in DROP_FEATURES.
    (
        'drop_features',
        DropFeatures(features_to_drop=DROP_FEATURES),
    ),

    # ----------------------- Numerical transformations -----------------------

    # 7. Logarithmic transformation.
    (
        'log',
        LogTransformer(variables=NUMERICALS_LOG_VARS),
    ),

    # 8. Binarize strongly skewed variables (threshold 0).
    (
        'binarizer',
        SklearnTransformerWrapper(
            transformer=Binarizer(threshold=0),
            variables=BINARIZE_VARS,
        ),
    ),

    # ------------------ Ordinal categorical variable encoding ------------------

    (
        'mapper_quality',
        mypp.Mapper(variables=QUAL_VARS, mappings=QUAL_MAPPINGS),
    ),
    (
        'mapper_exposure',
        mypp.Mapper(variables=EXPOSURE_VARS, mappings=EXPOSURE_MAPPINGS),
    ),
    (
        'mapper_garage',
        mypp.Mapper(variables=GARAGE_VARS, mappings=GARAGE_MAPPINGS),
    ),
    (
        'mapper_finish',
        mypp.Mapper(variables=FINISH_VARS, mappings=FINISH_MAPPINGS),
    ),

    # ------------------ Nominal categorical variable encoding ------------------

    (
        'rare_label_encoder',
        RareLabelEncoder(
            tol=0.01,
            n_categories=1,
            variables=CATEGORICAL_VARS,
        ),
    ),
    (
        'categorical_encoder',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=CATEGORICAL_VARS,
        ),
    ),

    # --------------------------------- Scaler ---------------------------------

    ('scaler', MinMaxScaler()),

    # ------------------------------ Model training ------------------------------

    ('Lasso', Lasso(alpha=0.01, random_state=2022)),
])

In [48]:
# Fit every preprocessing step and the final Lasso model on the training data
# (y_train is the log-transformed sale price).
housePrice_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation',
                 CategoricalImputer(variables=['FireplaceQu'])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['BsmtQual', 'BsmtExposure',
                                               'BsmtFinType1',
                                               'GarageFinish'])),
                ('missing_indicator',
                 AddMissingIndicator(variables=['LotFrontage'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method=...
                                             'Foundation', 'CentralAir',
                                             'Functional', 'PavedDrive',
                                             'SaleCondition'])),
                ('categorical_encoder',
                 OrdinalEncoder(variables=['MSSubClass', 'MSZoning', 'LotShape',
                                           'LandContour',

In [53]:
# Keep only the selected feature columns for prediction.
X_test = X_test[FEATURES]

In [54]:
# Predictions are on the log scale, because the pipeline was fitted on the
# log-transformed target.
preds = housePrice_pipeline.predict(X_test)

In [55]:
from sklearn.metrics import mean_squared_error 

In [58]:
# RMSE on the original price scale: the target was log-transformed before
# training, so y_test and preds are exponentiated back before scoring.
# The root is taken explicitly with np.sqrt rather than via `squared=False`,
# because that argument is deprecated in scikit-learn 1.4 and removed in 1.6;
# this form yields the same value on every version.
np.sqrt(mean_squared_error(np.exp(y_test), np.exp(preds)))

47955.84316867136

In [60]:
# Inspect the test feature frame after column selection.
X_test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotShape,LandContour,LotConfig,Neighborhood,OverallQual,OverallCond,YearRemodAdd,...,Fireplaces,FireplaceQu,GarageFinish,GarageCars,GarageArea,PavedDrive,WoodDeckSF,ScreenPorch,SaleCondition,YrSold
306,60,RL,116.0,Reg,Lvl,Inside,SawyerW,7,5,1991,...,1,TA,RFn,3,746,Y,127,0,Normal,2007
626,20,RL,,IR1,Lvl,Inside,NAmes,5,5,1978,...,1,TA,RFn,1,286,Y,0,0,Normal,2007
546,50,RL,70.0,IR1,Bnk,Inside,BrkSide,6,7,1950,...,1,Gd,Unf,2,440,Y,0,0,Normal,2007
275,50,RL,55.0,Reg,Lvl,Inside,BrkSide,7,7,2007,...,0,,Unf,2,672,Y,74,144,Normal,2009
267,75,RL,60.0,Reg,Bnk,Inside,SWISU,5,8,1997,...,1,Gd,Unf,1,240,Y,262,0,Normal,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,180,RM,21.0,Reg,Lvl,Inside,MeadowV,4,6,1972,...,0,,,0,0,Y,96,0,Normal,2010
1144,190,RL,60.0,Reg,Lvl,Inside,Edwards,4,4,1950,...,0,,Unf,1,280,Y,0,0,Normal,2010
825,20,RL,114.0,Reg,Lvl,Inside,NridgHt,10,5,2008,...,1,Gd,Fin,3,1220,Y,188,0,Partial,2008
1249,20,RL,60.0,Reg,Lvl,Inside,NAmes,5,7,1950,...,0,,Unf,1,276,Y,0,0,Normal,2007


In [61]:
import joblib

In [62]:
# Persist the fitted pipeline to disk.
joblib.dump(housePrice_pipeline, 'housePrice_pipeline.pkl')

['housePrice_pipeline.pkl']

In [63]:
# Inspect the training feature frame after column selection.
X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotShape,LandContour,LotConfig,Neighborhood,OverallQual,OverallCond,YearRemodAdd,...,Fireplaces,FireplaceQu,GarageFinish,GarageCars,GarageArea,PavedDrive,WoodDeckSF,ScreenPorch,SaleCondition,YrSold
699,120,FV,59.0,IR2,Lvl,Inside,Somerst,7,5,2004,...,0,,RFn,2,530,Y,156,0,Normal,2008
609,20,RL,61.0,Reg,Lvl,Inside,Sawyer,4,5,1961,...,0,,Unf,1,261,Y,64,0,Normal,2007
1453,20,RL,90.0,Reg,Lvl,Inside,Mitchel,5,5,2006,...,0,,,0,0,Y,36,0,Abnorml,2006
807,70,RL,144.0,Reg,Lvl,Inside,BrkSide,5,6,2004,...,1,TA,RFn,2,528,Y,0,0,Normal,2009
149,50,RM,,Reg,Lvl,Inside,BrkSide,5,4,1950,...,0,,Unf,1,240,Y,200,0,Normal,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,20,FV,75.0,Reg,Lvl,Inside,Somerst,8,5,2008,...,0,,RFn,2,750,Y,144,0,Normal,2010
624,60,RL,80.0,Reg,Lvl,Inside,NWAmes,6,5,1972,...,1,TA,RFn,2,484,Y,148,147,Normal,2006
173,20,RL,80.0,IR1,Lvl,Inside,NAmes,6,5,1961,...,1,TA,Unf,2,504,Y,0,0,Normal,2008
1244,70,RL,,IR1,HLS,Corner,Crawfor,8,7,1950,...,2,Gd,Unf,2,400,Y,0,0,Normal,2006
