In [0]:
import pandas as pd
from sklearn import tree
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [0]:
# Load the training and test CSV files, using the 'Id' column as the row index.
dados_treino = pd.read_csv('treino.csv', index_col='Id')
dados_teste = pd.read_csv('teste.csv', index_col='Id')

In [0]:
# Work on a copy so later edits to X do not touch the loaded frame dados_treino.
X = dados_treino.copy()

In [0]:
# Show the column labels of the working frame.
X.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

Removendo as linhas em que `SalePrice` está ausente (NaN), se houver.

In [0]:
# Keep only the rows whose SalePrice value is present.
X = X.dropna(subset=['SalePrice'])

In [0]:
# Target vector: the SalePrice column of the working frame.
y = X['SalePrice']

In [0]:
# Remove the target column from the feature frame.
# Reassigning (rather than inplace=True) with errors='ignore' makes this cell
# safe to re-run: a second execution no longer raises KeyError because
# 'SalePrice' has already been removed.
X = X.drop(columns=['SalePrice'], errors='ignore')

In [0]:
# Split into 80% training / 20% validation; the fixed seed makes the split repeatable.
X_treino, X_valid, y_treino, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Categorical features: object-dtype columns with fewer than 10 distinct
# values in the training split.
cols_cat = [
    nome
    for nome, tipo in X_treino.dtypes.items()
    if tipo == "object" and X_treino[nome].nunique() < 10
]

In [0]:
# Preview the first five selected categorical column names.
cols_cat[:5]

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour']

In [0]:
# Numeric features: every training column stored as int64 or float64.
cols_num = [
    nome
    for nome, tipo in X_treino.dtypes.items()
    if tipo in ('int64', 'float64')
]

In [0]:
# Preview the first five selected numeric column names.
cols_num[:5]

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond']

In [0]:
# Full feature list: categorical columns first, then numeric ones.
cols = [*cols_cat, *cols_num]

In [0]:
# Restrict the training, validation and test frames to the selected columns,
# copying each one so the originals stay independent.
X_treino_sel, X_valid_sel, X_teste_sel = (
    conjunto[cols].copy() for conjunto in (X_treino, X_valid, dados_teste)
)

In [0]:
from sklearn.impute import SimpleImputer

In [0]:
# Numeric preprocessing: replace missing values with the column median.
transformer_num = SimpleImputer(strategy='median')

In [0]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [0]:
# Categorical preprocessing: fill missing entries with a constant placeholder
# (the imputer's default fill value), then one-hot encode. Categories not seen
# during fitting are ignored at transform time instead of raising an error.
transformer_cat = Pipeline(steps=[
    ('imputacao', SimpleImputer(strategy='constant')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

In [0]:
# (name, transformer, columns) triples pairing each preprocessing step with
# the columns it is applied to.
transformers = [
    ('transformer_num', transformer_num, cols_num),
    ('transformer_cat', transformer_cat, cols_cat),
]

In [0]:
from sklearn.compose import ColumnTransformer

In [0]:
# Apply each transformer to its own column group; columns that appear in
# neither group are dropped (ColumnTransformer's default remainder).
preprocessador = ColumnTransformer(transformers=transformers)

In [0]:
# Random forest regressor with 50 trees; random_state is fixed so results are repeatable.
modelo = RandomForestRegressor(n_estimators=50, random_state=42)

In [0]:
# End-to-end estimator: preprocessing followed by the regressor, so fit and
# predict always apply the same transformations.
pipe = Pipeline(steps=[
    ('preprocessador', preprocessador),
    ('modelo', modelo),
])

In [0]:
from sklearn.model_selection import cross_val_score

In [0]:
# 5-fold cross-validation of the whole pipeline on X and y. The scorer
# returns the negated MAE, so negate again to get positive error values.
maes = -cross_val_score(
    pipe,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

In [0]:
# Show the MAE obtained on each cross-validation fold.
maes

array([18467.22787671, 17665.66541096, 18079.93541096, 16019.93712329,
       19475.70486301])

In [0]:
# Fit preprocessing and model on the training split only; the validation
# split is kept aside for the evaluation below.
pipe.fit(X_treino_sel, y_treino)

Pipeline(memory=None,
         steps=[('preprocessador',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('transformer_num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                 

In [0]:
# Predict on the held-out validation split.
preds = pipe.predict(X_valid_sel)

In [0]:
# Mean absolute error of the validation predictions against the true values.
mae = mean_absolute_error(y_true=y_valid, y_pred=preds)

In [0]:
# Show the validation MAE.
mae

18152.781643835613