In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import category_encoders as ce
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import ShuffleSplit, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, PowerTransformer

from sklearn.impute import SimpleImputer

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFECV
import sys


pd.options.display.max_rows = 999

In [2]:
train_raw, test_raw = pd.read_csv('~/house-prices/data/train.csv'), pd.read_csv('~/house-prices/data/test.csv')

date_related = ['GarageYrBlt', 'MoSold', 'YrSold']
boolean = ['CentralAir']
target = 'SalePrice'
to_drop = ['Id', 'PoolArea', 'PoolQC']

numerical = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
             'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'OverallCond', 'OverallQual', 'YearBuilt', 'YearRemodAdd'] + date_related

categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
              'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
              'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']  + boolean

assert len(categorical )+ len(numerical) + len(to_drop) + 1 == train_raw.shape[1]
assert set(categorical + numerical + date_related + boolean + [target] + to_drop) == set(train_raw.columns)

train = train_raw.copy()
train.SalePrice = np.log(train.SalePrice)
test = test_raw.copy()


In [3]:
y = train.pop('SalePrice')

train.Functional.fillna(train.Functional.mode()[0], inplace=True)       
train.Utilities.fillna(train.Utilities.mode()[0], inplace=True)          
train.Exterior1st.fillna(train.Exterior1st.mode()[0], inplace=True)        
train.SaleType.fillna(train.SaleType.mode()[0], inplace=True)                
train.KitchenQual.fillna(train.KitchenQual.mode()[0], inplace=True)        
train.Electrical.fillna(train.Electrical.mode()[0], inplace=True)  

test.Functional.fillna(train.Functional.mode()[0], inplace=True)       
test.Utilities.fillna(train.Utilities.mode()[0], inplace=True)          
test.Exterior1st.fillna(train.Exterior1st.mode()[0], inplace=True)        
test.SaleType.fillna(train.SaleType.mode()[0], inplace=True)                
test.KitchenQual.fillna(train.KitchenQual.mode()[0], inplace=True)        
test.Electrical.fillna(train.Electrical.mode()[0], inplace=True)  

train[categorical] = train[categorical].fillna("Missing")
test[categorical] = test[categorical].fillna("Missing")

train.Alley = train.Alley.map({'Missing':0, 'Grvl':1, 'Pave':2})
train.Street = train.Street.map({'Grvl':0, 'Pave':1})
train.ExterQual = train.ExterQual.map({'Po': 0, 'Fa': 1, 'Gd': 2, 'TA': 3, 'Ex': 4})
train.ExterCond = train.ExterCond.map({'Po': 0, 'Fa': 1, 'Gd': 2, 'TA': 3, 'Ex': 4})
train.GarageCond = train.GarageCond.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
train.GarageQual = train.GarageQual.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
train.BsmtQual = train.BsmtQual.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
train.BsmtCond = train.BsmtCond.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
train.BsmtExposure = train.BsmtExposure.map({'Missing': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, 'Ex': 5})
train.HeatingQC= train.HeatingQC.map({'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3})
train.KitchenQual = train.KitchenQual.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
train.PavedDrive = train.PavedDrive.map({'N': 0, 'P': 1, 'Y': 2})


test.Alley = test.Alley.map({'Missing':0, 'Grvl':1, 'Pave':2})
test.Street = test.Street.map({'Grvl':0, 'Pave':1})
test.ExterQual = test.ExterQual.map({'Po': 0, 'Fa': 1, 'Gd': 2, 'TA': 3, 'Ex': 4})
test.ExterCond = test.ExterCond.map({'Po': 0, 'Fa': 1, 'Gd': 2, 'TA': 3, 'Ex': 4})
test.GarageCond = test.GarageCond.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
test.GarageQual = test.GarageQual.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
test.BsmtQual = test.BsmtQual.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
test.BsmtCond = test.BsmtCond.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
test.BsmtExposure = test.BsmtExposure.map({'Missing': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, 'Ex': 5})
test.HeatingQC= test.HeatingQC.map({'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3})
test.KitchenQual = test.KitchenQual.map({'Missing': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
test.PavedDrive = test.PavedDrive.map({'N': 0, 'P': 1, 'Y': 2})


transformed_to_ordinal = ['Alley', 'Street', 'ExterQual', 'ExterCond', 'GarageCond', 'GarageQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                         'HeatingQC', 'KitchenQual', 'PavedDrive']


ordinal_numerical = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
                    'OverallCond', 'OverallQual', 'GarageYrBlt', 'YearBuilt', 'YearRemodAdd'] + transformed_to_ordinal

continuous_numerical = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 
                      'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'GarageArea']


train.MasVnrArea.fillna(0, inplace=True)    
train.BsmtHalfBath.fillna(0, inplace=True)
train.BsmtFullBath.fillna(0, inplace=True)
train.GarageArea.fillna(0, inplace=True)
train.GarageCars.fillna(0, inplace=True)    
train.TotalBsmtSF.fillna(0, inplace=True)   
train.BsmtUnfSF.fillna(0, inplace=True)     
train.BsmtFinSF2.fillna(0, inplace=True)    
train.BsmtFinSF1.fillna(0, inplace=True)

test.MasVnrArea.fillna(0, inplace=True)    
test.BsmtHalfBath.fillna(0, inplace=True)
test.BsmtFullBath.fillna(0, inplace=True)
test.GarageArea.fillna(0, inplace=True)
test.GarageCars.fillna(0, inplace=True)    
test.TotalBsmtSF.fillna(0, inplace=True)   
test.BsmtUnfSF.fillna(0, inplace=True)     
test.BsmtFinSF2.fillna(0, inplace=True)    
test.BsmtFinSF1.fillna(0, inplace=True)

train['LivibleRatio'] = train['1stFlrSF'] / train['LotArea']
train['HasMasVnr'] = train['MasVnrArea'] == 0
train['HasPool'] = train['PoolArea'] > 0
train['BuiltPercentage'] = train['1stFlrSF'] / train['LotArea']
train['DiffBuitSold'] = train['YearBuilt'] - train['YrSold']
train['AvrRoomSF'] = train['TotRmsAbvGrd'] / (train['1stFlrSF'] + train['2ndFlrSF'])
train['2ndfloorPercentage'] = train['2ndFlrSF'] / train['1stFlrSF']
train['HasBasment'] = train['TotalBsmtSF'] > 0
train['DiffRemodBuilt'] = train['YearRemodAdd'] - train['YearBuilt']

test['LivibleRatio'] = test['1stFlrSF'] / test['LotArea']
test['HasMasVnr'] = test['MasVnrArea'] == 0
test['HasPool'] = test['PoolArea'] > 0
test['BuiltPercentage'] = test['1stFlrSF'] / test['LotArea']
test['DiffBuitSold'] = test['YearBuilt'] - test['YrSold']
test['AvrRoomSF'] = test['TotRmsAbvGrd'] / (test['1stFlrSF'] + test['2ndFlrSF'])
test['2ndfloorPercentage'] = test['2ndFlrSF'] / test['1stFlrSF']
test['HasBasment'] = test['TotalBsmtSF'] > 0
test['DiffRemodBuilt'] = test['YearRemodAdd'] - test['YearBuilt']


train, test = train.drop(to_drop, axis=1), test.drop(to_drop, axis=1)
# engineered = ['LivibleRatio', 'HasPool', 'BuiltPercentage', 'DiffBuitSold', 'AvrRoomSF', '2ndfloorPercentage', 'DiffRemodBuilt']

# categorical += ['HasPool', 'DiffRemodBuilt']
# continuous_numerical += ['LivibleRatio', 'BuiltPercentage', '2ndfloorPercentage']
# ordinal_numerical += ['DiffBuitSold', 'AvrRoomSF']

to_encode = list((set(categorical) - set(transformed_to_ordinal)) - set(to_drop))


In [4]:
def rmse(clf, X, y):
    return - np.sqrt(mean_squared_error(clf.predict(X), y))

train, test = train[to_encode + ordinal_numerical + continuous_numerical], test[to_encode + ordinal_numerical + continuous_numerical]

encoder = ce.OneHotEncoder(cols=to_encode).fit(train, y)
train, test = encoder.transform(train), encoder.transform(test)

train.LotFrontage.fillna(train.LotFrontage.median(), inplace=True)
test.LotFrontage.fillna(train.LotFrontage.median(), inplace=True)

train.GarageYrBlt.fillna(train.GarageYrBlt.min(), inplace=True)
test.GarageYrBlt.fillna(train.GarageYrBlt.min(), inplace=True)



# imputer = SimpleImputer(strategy='median')
# imputer.fit(train[continuous_numerical + ordinal_numerical])
# train[continuous_numerical + ordinal_numerical], test[continuous_numerical + ordinal_numerical] = imputer.transform(train[continuous_numerical + ordinal_numerical]), imputer.transform(test[continuous_numerical + ordinal_numerical])

# imputer = SimpleImputer(strategy='median')
# imputer.fit(train[ordinal_numerical])
# train[ordinal_numerical], test[ordinal_numerical] = imputer.transform(train[ordinal_numerical]), imputer.transform(test[ordinal_numerical])


transformer = PowerTransformer(standardize=False).fit(train[continuous_numerical] + 1)
train[continuous_numerical ], test[continuous_numerical] = transformer.transform(train[continuous_numerical] + 1), transformer.transform(test[continuous_numerical] + 1)

scaler = RobustScaler().fit(train[continuous_numerical + ordinal_numerical])
train[continuous_numerical+ ordinal_numerical]  = pd.DataFrame(scaler.transform(train[continuous_numerical + ordinal_numerical]))
test[continuous_numerical + ordinal_numerical] = pd.DataFrame(scaler.transform(test[continuous_numerical + ordinal_numerical]))

  elif pd.api.types.is_categorical(cols):


In [5]:
X_train, y_train, X_test = train.copy(), y.copy(), test.copy()

In [7]:
from scipy.stats import uniform
# ridge = KernelRidge()

# parameters = {'alpha': uniform(0.05, 1.0), 'kernel': ['polynomial'], 
#               'degree': [2, 3, 4, 5, 6], 'coef0':uniform(0.5, 5)}

# random_search = RandomizedSearchCV(estimator = ridge,
#                                    param_distributions = parameters,
#                                    n_iter = 1000,
#                                    cv = 3,
#                                    scoring = rmse,
#                                    n_jobs = -1,
#                                    random_state=0)

# random_search = random_search.fit(X_train, y_train)

# print("Parameters of the best_estimator:")
# print(random_search.best_params_)
# print("Mean cross-validated RMSE of the best_estimator: {}".format(-random_search.best_score_))
# model = random_search.best_estimator_
# print("RMSE of the whole training set: {}".format(rmse(model, X_train, y_train)))

In [18]:
from sklearn.svm import SVR
model = SVR()

parameters = {'C': uniform(0, 1), 'kernel': ['poly'], 
              'degree': [4, 5], 'coef0':uniform(0.3, 2), 'epsilon': uniform(0.0, 0.1)}

random_search = RandomizedSearchCV(estimator = model,
                                   param_distributions = parameters,
                                   n_iter = 1000,
                                   cv = 3,
                                   scoring = rmse,
                                   n_jobs = -1,
                                   random_state=0)

random_search = random_search.fit(X_train, y_train)

print("Parameters of the best_estimator:")
print(random_search.best_params_)
print("Mean cross-validated RMSE of the best_estimator: {}".format(-random_search.best_score_))
model = random_search.best_estimator_
print("RMSE of the whole training set: {}".format(rmse(model, X_train, y_train)))

params1 = {'C': 0.07373975986879407, 'coef0': 0.9636581318368607, 'degree': 4, 'epsilon': 0.01108952633785413, 'kernel': 'poly'}

Parameters of the best_estimator:
{'C': 0.06263600305980976, 'coef0': 0.7838034084029697, 'degree': 4, 'epsilon': 0.02606286668926199, 'kernel': 'poly'}
Mean cross-validated RMSE of the best_estimator: 0.12067995771991703
RMSE of the whole training set: -0.0531158738922949


In [15]:
params1 = {'C': 0.1, 'coef0': 0.9636581318368607, 'degree': 4, 'epsilon': 0.05, 'kernel': 'poly'}
model1 = SVR(**params1)

model1.fit(X_train, y_train)
print(rmse(model1, X_train, y_train))

result = cross_val_score(model1, X_train, y_train, scoring=rmse, cv=3)
print(f"results: {result}\nmean: {result.mean()}")

-0.05139744025600057
results: [-0.11575297 -0.14094977 -0.11211848]
mean: -0.12294040655404603


In [46]:
from sklearn.svm import SVR
model = XGBRegressor()

parameters = lgbm_params = {'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}


random_search = RandomizedSearchCV(estimator = model,
                                   param_distributions = parameters,
                                   n_iter = 1000,
                                   cv = 3,
                                   scoring = rmse,
                                   n_jobs = -1,
                                   random_state=0)

random_search = random_search.fit(X_train, y_train)

print("Parameters of the best_estimator:")
print(random_search.best_params_)
print("Mean cross-validated RMSE of the best_estimator: {}".format(-random_search.best_score_))
model = random_search.best_estimator_
print("RMSE of the whole training set: {}".format(rmse(model, X_train, y_train)))

Parameters of the best_estimator:
{'colsample_bytree': 0.4, 'eta': 0.200156280875612, 'gamma': 0.012430839519815795, 'lambda': 0.8026803083089148, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 1.0}
Mean cross-validated RMSE of the best_estimator: 0.12752620441377335
RMSE of the whole training set: -0.05606817342366912


In [20]:
from lightgbm import LGBMRegressor
model1 = LGBMRegressor()

model1.fit(X_train, y_train)
print(rmse(model1, X_train, y_train))

result = cross_val_score(model1, X_train, y_train, scoring=rmse, cv=3)
print(f"results: {result}\nmean: {result.mean()}")

In [25]:
from lightgbm import LGBMRegressor
from scipy.stats import randint as sp_randint

model = LGBMRegressor()

parameters = {'num_leaves': sp_randint(20, 80), 
#              'min_child_samples': sp_randint(10, 200), 
#              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': uniform(0.05, 0.), 
             'colsample_bytree': uniform(0.05, 0.5),
#              'reg_alpha': uniform(0, 1),
             'reg_lambda': uniform(0, 5),
             'max_depth': [4, 5],
             'learning_rate': uniform(0.001, 0.1),
             'n_estimators': [1000],
             'n_jobs': [-1]}



random_search = RandomizedSearchCV(estimator = model,
                                   param_distributions = parameters,
                                   n_iter = 1000,
                                   cv = 3,
                                   scoring = rmse,
                                   n_jobs = -1,
                                   random_state=0)

random_search = random_search.fit(X_train, y_train)

print("Parameters of the best_estimator:")
print(random_search.best_params_)
print("Mean cross-validated RMSE of the best_estimator: {}".format(-random_search.best_score_))
model = random_search.best_estimator_
print("RMSE of the whole training set: {}".format(rmse(model, X_train, y_train)))

Parameters of the best_estimator:
{'colsample_bytree': 0.2704551297120967, 'learning_rate': 0.0259541747442841, 'max_depth': 4, 'n_estimators': 1000, 'n_jobs': -1, 'num_leaves': 40, 'reg_lambda': 1.7511085076197463, 'subsample': 0.19487610506997383}
Mean cross-validated RMSE of the best_estimator: 0.12295396502843592
RMSE of the whole training set: -0.06766679306759317


In [None]:
{'colsample_bytree': 0.2704551297120967, 'learning_rate': 0.0259541747442841, 'max_depth': 4, 'n_estimators': 1000, 'n_jobs': -1, 'num_leaves': 40, 'reg_lambda': 1.7511085076197463, 'subsample': 0.19487610506997383}
 0.12295396502843592
RMSE of the whole training set: -0.06766679306759317

In [46]:
# Try a stacked model
from sklearn.ensemble import StackingRegressor
params1 = {'colsample_bynode': 0.14798080984524117, 'eta': 0.06661119263308864, 'max_depth': 2, 'n_estimators': 1000, 'n_jobs': -1, 'objective': 'reg:squarederror', 'reg_lambda': 7.474287731445637}
model1 = XGBRegressor(**params1)

params2 = {'alpha': 0.6713925902023894, 'coef0': 1.9930683130133582, 'degree': 4, 'kernel': 'polynomial'}
model2 = KernelRidge(**params2)

model3 = MLPRegressor(hidden_layer_sizes=(128, 256, 256), random_state=1, max_iter=10_000, solver='lbfgs', alpha=4,
                     learning_rate_init=0.1, verbose=False, n_iter_no_change=100, early_stopping=True)

params4 = {'C': 0.06263600305980976, 'coef0': 0.7838034084029697, 'degree': 4, 'epsilon': 0.02606286668926199, 'kernel': 'poly'}
model4 = SVR(**params4)

params5 = {'colsample_bytree': 0.2704551297120967, 'learning_rate': 0.0259541747442841, 'max_depth': 4, 'n_estimators': 1000, 'n_jobs': -1, 'num_leaves': 40, 'reg_lambda': 1.7511085076197463, 'subsample': 0.19487610506997383}
model5 = LGBMRegressor(**params5)

stackings = [('xgb', model1), ('kernel ridge', model2), ('nn', model3), ('svr', model4), ('lgbm', model5)]
model = StackingRegressor(stackings, n_jobs=-1)

In [None]:
result = cross_val_score(model1, X_train, y_train, scoring=rmse, cv=3)
print(f"results: {result}\nmean: {result.mean()}")

result = cross_val_score(model2, X_train, y_train, scoring=rmse, cv=3)
print(f"results: {result}\nmean: {result.mean()}")

result = cross_val_score(model3, X_train, y_train, scoring=rmse, cv=3)
print(f"results: {result}\nmean: {result.mean()}")

result = cross_val_score(model, X_train, y_train, scoring=rmse, cv=3)
print(f"results: {result}\nmean: {result.mean()}")

In [47]:
# # Make predictions on the test set
model.fit(X_train, y_train)
y_pred = np.exp(model.predict(X_test))
output = pd.DataFrame({'Id': test_raw['Id'], 'SalePrice': y_pred})
output.to_csv('../submissions/v12.2- NN  Ridge stacked.csv', index=False)