In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore
import seaborn as sns

In [2]:
data = pd.read_csv('../data/train/train.csv')

In [3]:
def add_features(dataset):
    dataset['houseAge'] = dataset['YrSold'] - dataset['YearBuilt']
    dataset['houseRemodAge'] = dataset['YrSold'] - dataset['YearRemodAdd']
    dataset['totalBaths'] = dataset['BsmtFullBath'] + dataset['FullBath'] + 0.5* (dataset['HalfBath'] + dataset['BsmtHalfBath'])
    dataset['porchDeckArea'] = dataset['WoodDeckSF'] + dataset['OpenPorchSF'] + dataset['EnclosedPorch'] + dataset['3SsnPorch'] + dataset['ScreenPorch']
    dataset['totalCoveredArea'] = dataset['GrLivArea'] + dataset['TotalBsmtSF']

    return dataset

In [4]:
def drop_features(dataset):
    #drop columns with little que-dar?
    dataset = dataset.drop(columns=['Id','Alley','MasVnrType','BsmtCond','PoolQC','Fence',
                                    'MiscFeature','GarageQual','GarageCond', 'BsmtFinType2'])
    
    #drop columns used in add_features
    dataset = dataset.drop(columns=['YrSold','YearBuilt','YearRemodAdd','BsmtFullBath',
                                   'FullBath','HalfBath','BsmtHalfBath','WoodDeckSF',
                                   'OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch',
                                    'BsmtFinSF1','BsmtFinSF2','1stFlrSF','2ndFlrSF','GrLivArea',
                                   'TotalBsmtSF','GarageYrBlt','GarageArea'])
    #drop columns no-representatives

    return dataset

In [5]:
def fill_null_values(dataset):
    string_columns_with_nulls = ['FireplaceQu','GarageFinish','GarageType',
                                 'BsmtExposure','BsmtFinType1','BsmtQual','Electrical']
    dataset[string_columns_with_nulls] = dataset[string_columns_with_nulls].fillna("No")
    
    numeric_columns_with_nulls = ['LotFrontage','MasVnrArea']
    dataset[numeric_columns_with_nulls] = dataset[numeric_columns_with_nulls].fillna(0)

    return dataset

In [6]:
def feature_engineering(dataset):
    dataset_final = dataset.copy()
    dataset_final = add_features(dataset_final)
    dataset_final = drop_features(dataset_final)
    dataset_final = fill_null_values(dataset_final)
    
    if "SalePrice" in dataset_final.columns:
        dataset_final["SalePrice"] = np.log1p(dataset_final["SalePrice"])
    
    return dataset_final

In [7]:
data_train = feature_engineering(data)

In [8]:
data_train.select_dtypes(include=['object']).columns.tolist()

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinType1',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [9]:
# Algunos serán OneHotEncoder otros OrdinalEncoder
# Yo borré estas y el chico, no  'GarageQual', 'MasVnrType'
# OneHotEncoder:
ohe_cols = ['Street','Neighborhood','Condition1','Condition2','RoofMatl','Exterior1st',
               'Exterior2nd','Foundation','Heating','Electrical','GarageType', 'RoofStyle',
               'SaleType','SaleCondition','LotConfig','BldgType','HouseStyle', 'MSZoning']

# OrdinalEncoder 
ore_cols = ['LotShape','LandContour','Utilities','LandSlope','ExterQual','ExterCond',
               'BsmtQual','BsmtExposure','BsmtFinType1','HeatingQC','CentralAir','KitchenQual',
               'Functional','FireplaceQu','GarageFinish','PavedDrive']


In [10]:
number_cols = data_train.select_dtypes(include=['number']).columns
number_cols = number_cols.drop('SalePrice')

In [11]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from sklearn.compose import make_column_transformer, ColumnTransformer

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [12]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

In [13]:
#Mirar como funcionan estos pipelines y que implican estas configuraciones
ohe_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [14]:
ore_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ore', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [15]:
col_trans = ColumnTransformer(transformers=[
    ('num_pipe', num_pipeline, number_cols),
    ('ore_pipe', ore_pipeline, ore_cols),
    ('ohe_pipe', ohe_pipeline, ohe_cols),
    ],
    remainder='passthrough', 
    n_jobs=-1)

In [16]:
pipeline = Pipeline(steps=[
    ('preprocessing', col_trans)
])

In [17]:
X = data_train.drop('SalePrice', axis=1)
y = data_train['SalePrice']

In [18]:
X_prepro = pipeline.fit_transform(X)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_prepro, y, test_size=0.2, random_state=25)

In [20]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [21]:
mean_squared_error(y_test, y_pred_lr) #0.013550181855841515

3.4662948822258463e+18

In [22]:
from sklearn.ensemble import RandomForestRegressor #, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [23]:
rfr = RandomForestRegressor(random_state=13)
param_grid_rfr = {
    'max_depth': [12, 15, 18],
    'n_estimators': [250, 500, 750],
    'min_samples_split': [3, 5, 10]
}
rfr_cv = GridSearchCV(rfr, param_grid_rfr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
rfr_cv.fit(X_train, y_train)

# Ojo, primero me daba: RandomForestRegressor(max_depth=15, min_samples_split=3, n_estimators=500,random_state=13)
# Y despues: RandomForestRegressor(max_depth=18, min_samples_split=3, n_estimators=750,random_state=13)
# Que diferencia de resultados y rendimiento equivale?

In [24]:
best_rmse = np.sqrt(-1 * rfr_cv.best_score_) 
print(f"Best RFR MRSE result: {round(best_rmse, 4)}")

best_rfr_model = rfr_cv.best_estimator_
y_pred_rfr = best_rfr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_rfr)
rmse = np.sqrt(mse)
print(f"Test RFR MRSE result: {round(rmse, 4)}")

#Best RFR MRSE result: 0.1488
#Test RFR MRSE result: 0.1345

Best RFR MRSE result: 0.1487
Test RFR MRSE result: 0.1347


In [25]:
from xgboost import XGBRegressor

In [26]:
xgb = XGBRegressor(random_state=13)

param_grid_xgb = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [300],
    'max_depth': [3],
    'min_child_weight': [1,2,3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

xgb_cv = GridSearchCV(
    xgb, 
    param_grid_xgb, 
    cv=3, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1)

xgb_cv.fit(X_train, y_train)

In [27]:
best_xgb_rmse = np.sqrt(-1 * xgb_cv.best_score_) 
print(f"Best XGB MRSE result: {round(best_xgb_rmse, 4)}")

best_xgb_model = xgb_cv.best_estimator_
y_pred_xgb = best_xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_rmse = np.sqrt(xgb_mse)
print(f"Test XGB MRSE result: {round(xgb_rmse, 4)}")

Best XGB MRSE result: 0.1349
Test XGB MRSE result: 0.1201


In [28]:
ridge = Ridge()

param_grid_ridge = {
    'alpha': [0.05, 0.1, 1, 3, 5, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']
}
ridge_cv = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

ridge_cv.fit(X_train, y_train)



In [29]:
best_ridge_rmse = np.sqrt(-1 * ridge_cv.best_score_) 
print(f"Best Ridge MRSE result: {round(best_ridge_rmse, 4)}")

best_ridge_model = ridge_cv.best_estimator_
y_pred_ridge = best_ridge_model.predict(X_test)
ridge_rmse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(ridge_rmse)
print(f"Test Ridge MRSE result: {round(ridge_rmse , 4)}")

Best Ridge MRSE result: 0.1494
Test Ridge MRSE result: 0.1219


In [30]:
gbr = GradientBoostingRegressor()

param_grid_gbr = {
    'max_depth': [12, 15, 20],
    'n_estimators': [200, 300, 1000],
    'min_samples_leaf': [10, 25, 50],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_features': [0.01, 0.1, 0.7]
}

gbr_cv = GridSearchCV(gbr, param_grid_gbr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gbr_cv.fit(X_train, y_train)

In [31]:
gbr_cv.best_estimator_

#GradientBoostingRegressor(learning_rate=0.01, max_depth=15, max_features=0.1, min_samples_leaf=25, n_estimators=1000)

In [32]:
best_gbr_rmse = np.sqrt(-1 * gbr_cv.best_score_) 
print(f"Best GBR MRSE result: {round(best_gbr_rmse, 4)}")

best_gbr_model = gbr_cv.best_estimator_
y_pred_gbr = best_gbr_model.predict(X_test)
gbr_rmse = mean_squared_error(y_test, y_pred_gbr)
gbr_rmse = np.sqrt(gbr_rmse)
print(f"Test GBR MRSE result: {round(gbr_rmse, 4)}")

Best GBR MRSE result: 0.1302
Test GBR MRSE result: 0.1195


In [33]:
import lightgbm as lgb

In [34]:
lgbm_regressor = lgb.LGBMRegressor()

param_grid_lgbm = {
    'boosting_type': ['gbdt', 'dart'],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300]
}

lgbm_cv = GridSearchCV(
    lgbm_regressor, 
    param_grid_lgbm, 
    cv=3, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1
)

lgbm_cv.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1570
[LightGBM] [Info] Number of data points in the train set: 778, number of used features: 97
[LightGBM] [Info] Start training from score 12.031757
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 779, number of used features: 98
[LightGBM] [Info] Start training from score 12.004354
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins

In [35]:
best_lgbm_rmse = np.sqrt(-1 * lgbm_cv.best_score_) 
print(f"Best LGBM MRSE result: {round(best_lgbm_rmse, 4)}")

best_lgbm_model = lgbm_cv.best_estimator_
y_pred_lgbm = best_lgbm_model.predict(X_test)
lgbm_rmse = mean_squared_error(y_test, y_pred_lgbm)
lgbm_rmse = np.sqrt(lgbm_rmse)
print(f"Test LGBM MRSE result: {round(lgbm_rmse, 4)}")

Best LGBM MRSE result: 0.1403
Test LGBM MRSE result: 0.1276


In [36]:
from catboost import CatBoostRegressor

In [37]:
catboost = CatBoostRegressor(loss_function='RMSE', verbose=False)

param_grid_cat ={
    'iterations': [100, 500, 1000],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.5]
}

cat_cv = GridSearchCV(catboost, param_grid_cat, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
cat_cv.fit(X_train, y_train)

[LightGBM] [Info] Number of data points in the train set: 779, number of used features: 98
[LightGBM] [Info] Start training from score 12.004354
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1570
[LightGBM] [Info] Number of data points in the train set: 778, number of used features: 97
[LightGBM] [Info] Start training from score 12.031757


  _data = np.array(data, dtype=dtype, copy=copy,


In [38]:
best_cat_cv_rmse = np.sqrt(-1 * cat_cv.best_score_) 
print(f"Best CatBoost MRSE result: {round(best_cat_cv_rmse, 4)}")

best_cat_cv_model = cat_cv.best_estimator_
y_pred_cat_cv = best_cat_cv_model.predict(X_test)
cat_cv_rmse = mean_squared_error(y_test, y_pred_cat_cv)
cat_cv_rmse = np.sqrt(cat_cv_rmse)
print(f"Test CatBoost MRSE result: {round(cat_cv_rmse, 4)}")

Best CatBoost MRSE result: 0.1308
Test CatBoost MRSE result: 0.1138


In [39]:
print(f"cat_cv_rmse: {round(cat_cv_rmse, 4)}")
print(f"lgbm_rmse: {round(lgbm_rmse, 4)}")
print(f"xgb_rmse: {round(xgb_rmse, 4)}")
print(f"ridge_rmse: {round(ridge_rmse, 4)}")
print(f"gbr_rmse: {round(gbr_rmse, 4)}")

cat_cv_rmse: 0.1138
lgbm_rmse: 0.1276
xgb_rmse: 0.1201
ridge_rmse: 0.1219
gbr_rmse: 0.1195


In [40]:
vr = VotingRegressor([('gbr', gbr_cv.best_estimator_),
                      ('xgb', xgb_cv.best_estimator_),
                      ('ridge', ridge_cv.best_estimator_),
                      ],
                    weights=[2,2,1])

vr.fit(X_train, y_train)

In [41]:
y_pred_vr = vr.predict(X_test)
mean_squared_error(y_test, y_pred_vr, squared=False)

# weights=[3,2,1] :: 0.1141, 0.1149
# weights=[2,2,1] :: 0.1156, 0.1139
# weights=[2,1,2] :: 0.1146, 0.1145



0.11449268852017046

In [42]:
estimators = [
    ('gbr', gbr_cv.best_estimator_),
    ('xgb', xgb_cv.best_estimator_),
    ('cat', cat_cv.best_estimator_),
    ('lgb', lgbm_cv.best_estimator_),
    ('rfr', rfr_cv.best_estimator_),
]

In [43]:
stackreg = StackingRegressor(
            estimators = estimators,
            final_estimator = vr
)

In [44]:
stackreg.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 110
[LightGBM] [Info] Start training from score 12.016441
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1640
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 103
[LightGBM] [Info] Start training from score 12.022511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1631
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 102
[LightGBM] [Info] Start tra

In [45]:
y_pred_stack = stackreg.predict(X_test)
mean_squared_error(y_test, y_pred_stack, squared=False)

#0.1190100021681962



0.11993004543181375

In [49]:
dtest = pd.read_csv('../data/test/test.csv')
data_test_f = feature_engineering(dtest)

In [50]:
X_test_f = pipeline.transform(data_test_f)

In [None]:
y_test_f = np.exp(stackreg.predict(X_test_f))

result = dtest[['Id']]
result['SalePrice'] = y_test_f

In [54]:
result.to_csv('../predictions/submission_v1_20240109.csv', index=False)

#score:  0.13011 => 21%