In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore
import seaborn as sns

In [2]:
# Load the raw training table from the repository's data folder.
data = pd.read_csv(filepath_or_buffer='../data/train/train.csv')

In [3]:
def add_features(dataset):
    """Return a copy of ``dataset`` with five derived columns appended.

    The input frame is left untouched; the derived columns are added to a new
    frame, in this order:

    - ``houseAge``: ``YrSold - YearBuilt``
    - ``houseRemodAge``: ``YrSold - YearRemodAdd``
    - ``totalBaths``: full baths (basement + above ground) plus half of the
      half baths (above ground + basement)
    - ``porchDeckArea``: sum of the deck and porch area columns
    - ``totalCoveredArea``: ``GrLivArea + TotalBsmtSF``

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every source column referenced below.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the derived ones.

    Raises
    ------
    KeyError
        If any of the source columns is missing.
    """
    # Sum of the full-bath columns; half baths are weighted by 0.5 below.
    full_baths = dataset['BsmtFullBath'] + dataset['FullBath']
    half_baths = dataset['HalfBath'] + dataset['BsmtHalfBath']

    porch_deck_area = (
        dataset['WoodDeckSF']
        + dataset['OpenPorchSF']
        + dataset['EnclosedPorch']
        + dataset['3SsnPorch']
        + dataset['ScreenPorch']
    )

    # `assign` builds a new frame, so the caller's DataFrame is not mutated.
    return dataset.assign(
        houseAge=dataset['YrSold'] - dataset['YearBuilt'],
        houseRemodAge=dataset['YrSold'] - dataset['YearRemodAdd'],
        totalBaths=full_baths + 0.5 * half_baths,
        porchDeckArea=porch_deck_area,
        totalCoveredArea=dataset['GrLivArea'] + dataset['TotalBsmtSF'],
    )

In [4]:
def drop_features(dataset):
    """Return ``dataset`` without the columns this notebook does not model on.

    Two groups are removed, one after the other. Every listed column must be
    present, otherwise ``DataFrame.drop`` raises ``KeyError``. The input frame
    itself is not modified.
    """
    # Columns discarded outright (the original note was unsure about the
    # reason; presumably they carry little information — TODO confirm).
    discarded = [
        'Id', 'Alley', 'MasVnrType', 'BsmtCond', 'PoolQC', 'Fence',
        'MiscFeature', 'GarageQual', 'GarageCond', 'BsmtFinType2',
    ]

    # Raw columns read by add_features, plus several other basement, floor
    # and garage columns that add_features does not read.
    raw_inputs = [
        'YrSold', 'YearBuilt', 'YearRemodAdd', 'BsmtFullBath',
        'FullBath', 'HalfBath', 'BsmtHalfBath', 'WoodDeckSF',
        'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
        'BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
        'TotalBsmtSF', 'GarageYrBlt', 'GarageArea',
    ]

    without_discarded = dataset.drop(labels=discarded, axis='columns')
    return without_discarded.drop(labels=raw_inputs, axis='columns')

In [5]:
def fill_null_values(dataset):
    """Fill nulls in a fixed set of columns, in place, and return the frame.

    Text columns in the first group get the string "No"; the numeric columns
    in the second group get 0. Columns outside these groups are left as they
    are. The same DataFrame object that was passed in is returned.
    """
    fill_plan = (
        # Text columns: nulls become the string "No".
        (['FireplaceQu', 'GarageFinish', 'GarageType',
          'BsmtExposure', 'BsmtFinType1', 'BsmtQual', 'Electrical'], "No"),
        # Numeric columns: nulls become 0.
        (['LotFrontage', 'MasVnrArea'], 0),
    )

    for columns, fill_value in fill_plan:
        dataset[columns] = dataset[columns].fillna(fill_value)

    return dataset

In [8]:
def feature_engineering(dataset):
    """Run the full feature-preparation sequence on a copy of ``dataset``.

    Steps, in order: add derived features, drop unused columns, fill known
    nulls, then replace ``SalePrice`` with ``log1p(SalePrice)``. The frame
    passed in is copied first, so the caller's data is not changed.
    """
    prepared = fill_null_values(drop_features(add_features(dataset.copy())))
    # The target is modelled on the log1p scale.
    prepared["SalePrice"] = np.log1p(prepared["SalePrice"])

    return prepared

In [11]:
# Build the modelling frame: derived features added, unused columns dropped,
# known nulls filled, and SalePrice replaced by log1p(SalePrice).
data_train = feature_engineering(data)

In [37]:
# List the columns that are still object-dtype after feature engineering;
# these are the ones assigned to an encoder in the next cell.
data_train.select_dtypes(include=['object']).columns.tolist()

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinType1',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [13]:
# Each remaining text column is assigned to one of two encoders:
# some go to OneHotEncoder, the others to OrdinalEncoder.
# Note: 'GarageQual' and 'MasVnrType' were dropped in this notebook, whereas
# the person whose approach is being followed apparently kept them.
# Columns for OneHotEncoder:
ohe_cols = ['Street','Neighborhood','Condition1','Condition2','RoofMatl','Exterior1st',
               'Exterior2nd','Foundation','Heating','Electrical','GarageType', 'RoofStyle',
               'SaleType','SaleCondition','LotConfig','BldgType','HouseStyle', 'MSZoning']

# Columns for OrdinalEncoder:
ore_cols = ['LotShape','LandContour','Utilities','LandSlope','ExterQual','ExterCond',
               'BsmtQual','BsmtExposure','BsmtFinType1','HeatingQC','CentralAir','KitchenQual',
               'Functional','FireplaceQu','GarageFinish','PavedDrive']


In [54]:
# Numeric feature columns: every numeric column except the target.
number_cols = (
    data_train
    .select_dtypes(include=['number'])
    .columns
    .drop('SalePrice')
)

In [80]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from sklearn.compose import make_column_transformer, ColumnTransformer

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [63]:
# Numeric columns: impute remaining nulls with the column mean, then standardise.
# Note: fill_null_values already zero-fills LotFrontage and MasVnrArea, so the
# imputer only affects other numeric columns that may contain nulls.
num_pipeline = Pipeline(
    [
        ('impute', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [28]:
# TODO: look into how these pipelines work and what these settings imply.
# One-hot columns: impute nulls with the most frequent value, then encode.
# Categories not seen during fit are ignored at transform time, and the
# output is a dense array.
ohe_pipeline = Pipeline(
    [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

In [29]:
# Ordinal columns: impute nulls with the most frequent value, then map each
# category to an integer. Categories not seen during fit are encoded as -1.
# NOTE(review): no explicit `categories` order is given, so the integer order
# is whatever the encoder infers from the data — confirm it matches the
# intended ranking (e.g. for the quality-grade columns).
ore_pipeline = Pipeline(
    [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ore', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
    ]
)

In [64]:
# Route each column group through its own preprocessing pipeline.
# NOTE(review): the 34 object columns listed after feature engineering are all
# in ore_cols or ohe_cols, and number_cols holds the numeric features, so
# `remainder='passthrough'` presumably has nothing left to pass through — confirm.
col_trans = ColumnTransformer(transformers=[
    ('num_pipe', num_pipeline, number_cols),
    ('ore_pipe', ore_pipeline, ore_cols),
    ('ohe_pipe', ohe_pipeline, ohe_cols),
    ],
    remainder='passthrough', 
    n_jobs=-1)

In [65]:
# Top-level pipeline; for now it holds only the preprocessing step.
pipeline = Pipeline([('preprocessing', col_trans)])

In [69]:
# Target: log1p(SalePrice); features: every other column.
y = data_train['SalePrice']
X = data_train.drop(columns='SalePrice')

In [70]:
X_prepro = pipeline.fit_transform(X)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_prepro, y, test_size=0.2, random_state=25)

In [75]:
# Baseline: ordinary least squares on the preprocessed features.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [78]:
# Held-out MSE of the linear baseline. The target is log1p(SalePrice), so this
# error is on the log scale, not in currency units.
mean_squared_error(y_test, y_pred_lr)

0.013550181855841515

In [82]:
from sklearn.ensemble import RandomForestRegressor #, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [87]:
# Hyper-parameter values explored for the random forest
# (3 x 3 x 3 = 27 combinations, each evaluated with 5-fold CV).
param_grid_rfr = {
    'max_depth': [12, 15, 18],
    'n_estimators': [250, 500, 750],
    'min_samples_split': [3, 5, 10]
}
rfr = RandomForestRegressor(random_state=13)
rfr_cv = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid_rfr,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
rfr_cv.fit(X_train, y_train)

# Note: an earlier run selected
#   RandomForestRegressor(max_depth=15, min_samples_split=3, n_estimators=500, random_state=13)
# and a later run selected
#   RandomForestRegressor(max_depth=18, min_samples_split=3, n_estimators=750, random_state=13)
# TODO: work out how much the results and the run time differ between the two.

In [96]:
# Cross-validated RMSE of the best grid-search candidate. The scorer is
# 'neg_mean_squared_error', so the sign is flipped before taking the root.
best_rmse = np.sqrt(-rfr_cv.best_score_)
print(f"Best RFR RMSE result: {round(best_rmse, 4)}")

# RMSE of the best estimator on the held-out split, on the log1p(SalePrice)
# scale because that is the scale of y.
best_rfr_model = rfr_cv.best_estimator_
y_pred_rfr = best_rfr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_rfr)
rmse = np.sqrt(mse)
print(f"Test RFR RMSE result: {round(rmse, 4)}")

Best RFR MRSE result: 0.1488
Test RFR MRSE result: 0.1345


In [97]:
# Display the estimator (and its hyper-parameters) selected by the grid search.
best_rfr_model
