In [1]:
import pandas as pd

# Load the training split, then print a per-column summary
# (non-null counts and dtypes) to spot columns with missing values.
train_data = pd.read_csv(filepath_or_buffer='datasets/train.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [2]:
import numpy as np

# Correlation of every numeric column with the target.
feat_corr = train_data.corr(numeric_only=True).loc[:, 'SalePrice']

# 'Id' is used as the noise floor: any column whose correlation with
# SalePrice lies within [-|corr(Id)|, +|corr(Id)|] (bounds included, so 'Id'
# itself qualifies) is flagged as a candidate for dropping.
id_corr_val = abs(feat_corr.loc['Id'])
feat_corr_bool = (feat_corr >= -id_corr_val) & (feat_corr <= id_corr_val)

# dropped features, followed by the full correlation table for reference
feat_corr[feat_corr_bool].index.tolist(), feat_corr

(['Id', 'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal'],
 Id              -0.021917
 MSSubClass      -0.084284
 LotFrontage      0.351799
 LotArea          0.263843
 OverallQual      0.790982
 OverallCond     -0.077856
 YearBuilt        0.522897
 YearRemodAdd     0.507101
 MasVnrArea       0.477493
 BsmtFinSF1       0.386420
 BsmtFinSF2      -0.011378
 BsmtUnfSF        0.214479
 TotalBsmtSF      0.613581
 1stFlrSF         0.605852
 2ndFlrSF         0.319334
 LowQualFinSF    -0.025606
 GrLivArea        0.708624
 BsmtFullBath     0.227122
 BsmtHalfBath    -0.016844
 FullBath         0.560664
 HalfBath         0.284108
 BedroomAbvGr     0.168213
 KitchenAbvGr    -0.135907
 TotRmsAbvGrd     0.533723
 Fireplaces       0.466929
 GarageYrBlt      0.486362
 GarageCars       0.640409
 GarageArea       0.623431
 WoodDeckSF       0.324413
 OpenPorchSF      0.315856
 EnclosedPorch   -0.128578
 3SsnPorch        0.044584
 ScreenPorch      0.111447
 PoolArea         0.092404
 MiscVal         -0.021190
 MoSo

In [3]:
# Columns removed before preprocessing. The list is written out literally
# (rather than derived at runtime) so the reader can see exactly what is dropped.
dropped_features = [
    'Id',
    'BsmtFinSF2',
    'BsmtHalfBath',
    'MiscVal',
]

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : list of str or None, default=None
        Column labels to drop. When ``None`` the module-level
        ``dropped_features`` list is used, so ``FeatureDropper()`` behaves
        exactly as before. Passing the list explicitly makes the transformer
        self-contained and exposes the choice through ``get_params`` /
        ``set_params`` (and therefore to ``clone`` and parameter searches).
    """

    def __init__(self, columns=None):
        # Stored unmodified under the same name as the argument, which is what
        # BaseEstimator.get_params relies on.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without the configured columns.

        Raises ``KeyError`` (from ``DataFrame.drop``) if any of the columns is
        absent from ``X``.
        """
        # Resolve the fallback at call time so edits to the global list are
        # still picked up when no explicit columns were given.
        columns = dropped_features if self.columns is None else self.columns
        return X.drop(columns=list(columns))


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns routed to the numeric branch of the ColumnTransformer.
# Order is significant: it fixes the column order of the transformed matrix.
# The columns listed in `dropped_features` are intentionally absent.
numeric_features = [
    'LotFrontage', 'LotArea',
    'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea',
    'MoSold', 'YrSold',
]
# Numeric branch: replace missing values with the column mean, then
# standardise each column.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ],
)

# Columns routed to the one-hot branch of the ColumnTransformer.
# Order is significant: it fixes the order of the encoded column groups.
# 'MSSubClass' is stored as int64 in the CSV but is one-hot encoded here
# rather than scaled as a number.
categorical_features = [
    'MSSubClass', 'MSZoning',
    'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive',
    'PoolQC', 'Fence', 'MiscFeature',
    'SaleType', 'SaleCondition',
]
# Categorical branch: one-hot encode; categories not seen during fit are
# encoded as all zeros instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ],
)

# Route each column list through its own branch and concatenate the results.
# remainder='drop' is the ColumnTransformer default, spelled out here: any
# column that is in neither list is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)


In [111]:
# Separate the target from the predictors; train_data itself is not modified.
y = train_data['SalePrice']
X = train_data.drop(columns=['SalePrice'])

In [112]:
import xgboost as xgb

# End-to-end model: drop the low-signal columns, preprocess the rest, then
# fit a gradient-boosted regressor with its default hyper-parameters.
pipeline = Pipeline(
    steps=[
        ('featuredropper', FeatureDropper()),
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor()),
    ],
)

# Baseline fit on the full training set.
pipeline.fit(X, y)

In [114]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the XGBoost step of the pipeline.
N_ESTIMATORS_OPTIONS = [100, 200, 500, 1000, 1500, 2000]
MAX_DEPTH_OPTIONS = range(2, 10)  # depths 2..9
LEARNING_RATE_OPTIONS = [0.01, 0.02, 0.05, 0.1, 0.2]

# The 'regressor__' prefix addresses the pipeline step named 'regressor'.
# 6 * 8 * 5 = 240 combinations, each evaluated on 3 folds (720 fits), run
# sequentially because n_jobs=1 -- expect this cell to be slow.
param_grid = [
    {
        'regressor__learning_rate': LEARNING_RATE_OPTIONS,
        'regressor__max_depth': MAX_DEPTH_OPTIONS,
        'regressor__n_estimators': N_ESTIMATORS_OPTIONS,
    },
]

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=1,
)
grid.fit(X, y)

# Mean cross-validated score of the best combination (no `scoring` is given,
# so this is the estimator's default score).
grid.best_score_

np.float64(0.8909337719281515)

In [115]:
# Load the hold-out split (no target column) and print the same per-column
# summary as for the training data.
test_data = pd.read_csv(filepath_or_buffer='datasets/test.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [116]:
# Predict with the best pipeline found by the grid search. The raw test frame
# is passed as-is; the pipeline performs the column dropping and preprocessing.
predictions = grid.predict(test_data)

# pd.concat(axis=1) aligns rows by index label, not by position. Building the
# prediction Series on test_data's own index guarantees that every prediction
# lands next to the Id of the row it was computed from, even if test_data is
# ever filtered, sorted or re-indexed before this cell runs. (With a fresh
# default index the pairing would silently break and introduce NaN rows.)
predictions_df = pd.Series(predictions, index=test_data.index, name='SalePrice')
ids_df = test_data['Id']
result_df = pd.concat([ids_df, predictions_df], axis=1)

# Two columns, 'Id' and 'SalePrice'; the DataFrame index is not written.
result_df.to_csv('predictions.csv', index=False)