In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
from sklearn import set_config
# Show estimators/pipelines as HTML diagrams when they are the cell output.
set_config(display='diagram')

In [5]:
# Source files: the training table (it carries the SalePrice target used
# below) and the table that predictions are generated for.
TRAIN_CSV = '../dataset/train_unprocessed.csv'
TEST_CSV = '../dataset/test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [6]:
# Target first, then the predictor frames.  'Id' is removed from the
# predictors here and re-attached when the submission file is built.
Y_train = train['SalePrice']
X_train = train.drop(['Id', 'SalePrice'], axis=1)
X_test = test.drop('Id', axis=1)

In [7]:
# Features excluded from modelling.  The list lives in one place so that the
# train and test frames are reduced in exactly the same way.
DROPPED_COLUMNS = [
    'Street', 'LandContour', 'Utilities', 'LandSlope', 'Condition1',
    'Condition2', 'RoofMatl', 'MasVnrArea', 'BsmtCond', 'BsmtFinType2',
    'BsmtFinSF2', 'Heating', 'Electrical', '2ndFlrSF', 'LowQualFinSF',
    'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageQual', 'GarageCond',
    'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'PoolQC', 'MiscVal', 'SaleType',
    'SaleCondition', 'MiscFeature', 'MSSubClass', 'KitchenAbvGr', 'MoSold',
    'YrSold', 'LotConfig', 'BldgType',
]

X_train = X_train.drop(columns=DROPPED_COLUMNS)

# Later cells address columns by position (iloc / integer ColumnTransformer
# indices), which is only meaningful if X_test has the same column layout as
# X_train.  Previously only X_train was reduced, so positional edits on
# X_test landed on different columns.
# errors='ignore' keeps this a no-op should test.csv already be trimmed
# (NOTE(review): test.csv's layout is not visible in this notebook - confirm).
X_test = X_test.drop(columns=DROPPED_COLUMNS, errors='ignore')

In [8]:
# Preview the first rows to confirm the reduced column layout.
X_train.head(n=5)

Unnamed: 0,MSZoning,LotFrontage,LotArea,Alley,LotShape,Neighborhood,HouseStyle,OverallQual,OverallCond,YearBuilt,...,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,Fence
0,RL,65.0,8450,,Reg,CollgCr,2Story,7,5,2003,...,3,Gd,8,0,,Attchd,RFn,2,548,
1,RL,80.0,9600,,Reg,Veenker,1Story,6,8,1976,...,3,TA,6,1,TA,Attchd,RFn,2,460,
2,RL,68.0,11250,,IR1,CollgCr,2Story,7,5,2001,...,3,Gd,6,1,TA,Attchd,RFn,2,608,
3,RL,60.0,9550,,IR1,Crawfor,2Story,7,5,1915,...,3,Gd,7,1,Gd,Detchd,Unf,3,642,
4,RL,84.0,14260,,IR1,NoRidge,2Story,8,5,2000,...,4,Gd,9,1,TA,Attchd,RFn,3,836,


In [9]:
# Spot-check a handful of target values (unseeded: rows differ between runs).
Y_train.sample(n=5)

266     185000
478     297000
1002    232000
484     132500
406     115000
Name: SalePrice, dtype: int64

In [10]:
def replace_rare_categories(df):
    """Fold infrequent category levels into 'Others' and label missing values.

    Columns are addressed by name rather than by position, so the result does
    not depend on how many other columns the frame carries or on their order.
    (The previous positional version silently edited unrelated columns when
    given a frame with a different layout.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns MSZoning, Alley, HouseStyle, RoofStyle,
        Exterior1st, Exterior2nd, Foundation and GarageType.  Any additional
        columns are returned untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``; the input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    rare_exterior = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock',
                     'CemntBd', 'ImStucc', 'Other', 'PreCast', 'Stone',
                     'Stucco', 'WdShing']

    # column -> levels that are merged into the single level 'Others'
    rare_levels = {
        'MSZoning': ['FV', 'RH', 'C (all)'],
        'Alley': ['Grvl', 'Pave'],
        'HouseStyle': ['SLvl', 'SFoyer', '1.5Unf', '2.5Unf', '2.5Fin'],
        'RoofStyle': ['Flat', 'Gambrel', 'Mansard'],
        'Exterior1st': rare_exterior,
        'Exterior2nd': rare_exterior,
        'Foundation': ['Slab', 'Stone'],
        'GarageType': ['Basement', 'CarPort'],
    }
    # Columns whose NaN / empty-string entries become the explicit level 'None'.
    none_filled = ('Alley', 'GarageType')

    missing = [name for name in rare_levels if name not in df.columns]
    if missing:
        raise KeyError(f"replace_rare_categories: missing column(s) {missing}")

    df = df.copy()

    # Missing-value labelling runs first so that 'None' is never confused
    # with a rare level in the second pass.
    for name in none_filled:
        values = df[name]
        df[name] = np.where(values.isna() | (values == ''), 'None', values)

    for name, levels in rare_levels.items():
        values = df[name]
        df[name] = np.where(values.isin(levels), 'Others', values)

    return df


In [11]:
# Apply the identical category clean-up to both feature frames.
X_train, X_test = (
    replace_rare_categories(X_train),
    replace_rare_categories(X_test),
)

In [12]:
# Positions (in X_train's reduced layout) of the columns whose missing values
# are replaced by the literal level 'None'.
# NOTE(review): 3/35/36/37/40 are Alley/FireplaceQu/GarageType/GarageFinish/
# Fence per the head() output; 14/18/19/20 are presumably MasVnrType/BsmtQual/
# BsmtExposure/BsmtFinType1 - verify against X_train.columns.
none_fill_columns = [3, 14, 18, 19, 20, 35, 36, 37, 40]

zero_imputer = SimpleImputer(strategy='constant', fill_value=0)
none_imputer = SimpleImputer(strategy='constant', fill_value='None')

# Output order is: LotFrontage, the none-filled columns (in list order), then
# every remaining column in its original order.  The integer indices used by
# the encoder step refer to THIS reordered layout, not to X_train's.
imputer_trf = ColumnTransformer(
    transformers=[
        ('fill_zero', zero_imputer, ['LotFrontage']),
        ('fill_none', none_imputer, none_fill_columns),
    ],
    remainder='passthrough',
)

In [13]:
# Encodes the categorical columns of the imputer output.  All integer indices
# below refer to the column order produced by `imputer_trf`.
#
# The imputer writes the literal 'None' for missing values, so that is the
# level the ordinal category lists must contain.  BsmtFinType1 previously
# listed 'NA' (never present) and BsmtExposure listed no missing level at all,
# which sent every "no basement" row to unknown_value=-1.
#
# NOTE(review): index 5 (BsmtFinType1) appears in both the ordinal and the
# one-hot list, so it is encoded twice.  Left as is because the downstream
# steps hard-code the resulting column positions - confirm whether intended.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4; switch the keyword when upgrading.
encoder_trf = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=[
            ['IR3', 'IR2', 'IR1', 'Reg'],  # LotShape
            ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterQual
            ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterCond
            ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # BsmtQual
            ['None', 'No', 'Mn', 'Av', 'Gd'],  # BsmtExposure
            ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'None'],  # BsmtFinType1
            ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # HeatingQC
            ['N', 'Y'],  # CentralAir
            ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # KitchenQual
            ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # FireplaceQu
            ['None', 'Unf', 'RFn', 'Fin']  # GarageFinish
        ], dtype=int, handle_unknown='use_encoded_value', unknown_value=-1), [
            12, 22, 23, 3, 4, 5, 28, 29, 36, 6, 8
        ]),

        ('onehot', OneHotEncoder(drop='first', sparse=False, dtype=int, handle_unknown='ignore'), [
            10, 1, 13, 14, 19, 20, 21, 2, 24, 7, 5, 9
        ])
    ],
    remainder='passthrough'
)

In [14]:
def cap_outliers(X, factor=2):
    """Clamp each column of ``X`` to ``[Q1 - factor*IQR, Q3 + factor*IQR]``.

    The quartiles are computed per column from the array that is passed in,
    so the limits always come from the data of the current call (when used
    inside a fitted pipeline they are recomputed on the prediction data, not
    remembered from training).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data; integer, object-typed numeric and DataFrame input are
        accepted and converted to float.
    factor : float, default 2
        Multiple of the inter-quartile range that defines the fences.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``X``.  The input is never modified.
        A column that contains NaN has NaN fences and is returned unchanged.
    """
    # Always work on a float copy.  Writing float fences back into an integer
    # array (as the in-place version did) silently truncated them.
    values = np.array(X, dtype=float)

    q1, q3 = np.percentile(values, [25, 75], axis=0)
    spread = factor * (q3 - q1)
    lower, upper = q1 - spread, q3 + spread

    # np.where (rather than np.clip) so that NaN fences leave data untouched
    # instead of propagating NaN through the whole column.
    return np.where(values > upper, upper,
                    np.where(values < lower, lower, values))

# Wrap the capping function so it can be used as a pipeline step.
# validate=False hands the array to cap_outliers without input checking.
cap_outliers_trf = FunctionTransformer(func=cap_outliers, validate=False)


In [15]:
# Positions (in the encoder's output layout) of the columns to be capped.
# NOTE(review): presumably LotFrontage, BsmtFinSF1, BsmtUnfSF, 1stFlrSF and
# GarageArea - these depend on how many one-hot columns the encoder emits,
# so verify after any change to the encoder.
capped_columns = [80, 86, 87, 89, 98]

# The capped columns come first in the output; all others follow unchanged.
outlier_trf = ColumnTransformer(
    [('cap_outliers', cap_outliers_trf, capped_columns)],
    remainder='passthrough',
)

In [16]:
# Preprocessing chain: impute -> encode -> cap outliers.  Each stage's integer
# column indices refer to the previous stage's output layout, so the order of
# these steps must not change.
preprocess_steps = [
    ('imputer', imputer_trf),
    ('encoder', encoder_trf),
    ('outlier', outlier_trf),
]
preprocess_trf = Pipeline(preprocess_steps)

In [17]:
# Standardise the first 99 columns.  No `remainder` is given, so any column
# beyond position 98 would be dropped by this step.
standardize_trf = ColumnTransformer(
    transformers=[('StandardScale', StandardScaler(), slice(0, 99))],
)

In [18]:
# Power-transform the first 99 columns (PowerTransformer's default method is
# Yeo-Johnson); anything past position 98 is passed through untouched.
yj_trf = ColumnTransformer(
    transformers=[('yeo_johnson', PowerTransformer(), slice(0, 99))],
    remainder='passthrough',
)


In [25]:
# Reduce to 55 principal components.  random_state is fixed because PCA may
# select a randomized SVD solver for this shape, which would otherwise make
# the components (and everything downstream) differ from run to run.
pca_trf = PCA(n_components=55, random_state=42)

In [20]:
# Final estimator.  A fixed random_state makes the fitted model, the pickle
# and the submission reproducible across Restart & Run All.
model_trf = GradientBoostingRegressor(random_state=42)

In [21]:
# End-to-end model: preprocessing, scaling, power transform, PCA, regressor.
pipeline_steps = [
    ('Preprocessor', preprocess_trf),
    ('Standardizer', standardize_trf),
    ('Yeo-Johnson', yj_trf),
    ('PCA', pca_trf),
    ('Model', model_trf),
]
pipe = Pipeline(steps=pipeline_steps)

In [22]:
# Fit every step on the training data; the returned pipeline is the cell output.
pipe.fit(X=X_train, y=Y_train)



In [23]:
# Predict sale prices for the test frame.  X_test must already have gone
# through replace_rare_categories, which is not part of `pipe`.
Y_pred = pipe.predict(X=X_test)

In [26]:
import pickle

# Persist the fitted pipeline.  The `with` block guarantees the file is
# flushed and closed; the bare open() used before left the handle dangling.
# Caveats for whoever loads this file:
#   * cap_outliers is pickled by reference, so a function with that name must
#     be importable from the same module path at load time;
#   * replace_rare_categories is NOT part of `pipe` and has to be applied to
#     new data before calling pipe.predict.
with open('../pipeline/pipe.pkl', 'wb') as pipeline_file:
    pickle.dump(pipe, pipeline_file)

In [24]:
# Two-column submission table: the test row identifiers plus the predicted
# prices (Y_pred is aligned with `test` by row position).
submission = test[['Id']].copy()
submission['SalePrice'] = Y_pred

submission.to_csv('../submissions/submission.csv', index=False)