In [3]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor as xgbr
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer

import os
# Print the path of every file below the current working directory so the
# available input files are visible in the notebook output.
for root, _subdirs, names in os.walk('.'):
    for name in names:
        print(os.path.join(root, name))


.\house-prices-updated.ipynb
.\test.csv
.\train.csv


In [5]:
# Load the training set; the 'Id' column becomes the row index.
train_path = 'train.csv'
train_data = pd.read_csv(
    train_path,
    index_col='Id',
)

In [None]:
# Load the test set (indexed by 'Id', like the training set) and keep an
# independent copy to use as the test feature frame.
test_path = 'test.csv'
test_data = pd.read_csv(
    test_path,
    index_col='Id',
)
X_test = test_data.copy(deep=True)

In [6]:
# Preview the first rows of the training frame as a quick check of the load.
train_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Show the column labels of the training frame.
train_data.columns

In [None]:
# Separate the prediction target from the features.
#
# The guard keeps this cell idempotent: on a second run 'SalePrice' has already
# been removed, so `y` and `train_data` are simply left as they are instead of
# raising. `train_data` is rebound to a new frame rather than mutated in place,
# so any earlier displayed view of it is not silently changed.
if 'SalePrice' in train_data.columns:
    y = train_data['SalePrice'].copy()
    train_data = train_data.drop(columns=['SalePrice'])

In [None]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Preview the first rows of the test feature frame.
X_test.head()

In [None]:
# Target encoding for the categorical features.
# Option 1: encode all categorical (object-dtype) columns.
import category_encoders as ce

cat_cols = [col for col in X_train.columns
            if X_train[col].dtype == "object"]

# CatBoostEncoder is used here; ce.TargetEncoder(cols=cat_cols) is a drop-in
# alternative.
target_enc = ce.CatBoostEncoder(cols=cat_cols)

target_enc.fit(X_train[cat_cols], y_train)

# Training rows are transformed WITH their targets so the encoder applies its
# ordered (leave-current-row-out) statistics. Transforming them without `y`
# would encode each row using a category mean that already contains that row's
# own SalePrice, leaking the target into the features.
train_encoded = X_train.join(
    target_enc.transform(X_train[cat_cols], y_train).add_suffix('_target'))

# Validation and test rows are unseen data: transform without `y` so they are
# encoded with the statistics learned from the training split only.
valid_encoded = X_val.join(
    target_enc.transform(X_val[cat_cols]).add_suffix('_target'))
test_encoded = X_test.join(
    target_enc.transform(X_test[cat_cols]).add_suffix('_target'))

**Encoding only low-cardinality categorical features and skipping high-cardinality ones**
# Option 2: keep only low-cardinality categoricals plus the numeric columns.

# Number of distinct values for every object-dtype column (column order kept).
object_cardinality = {
    col: X_train[col].nunique()
    for col in X_train.columns
    if X_train[col].dtype == "object"
}

# Fewer than 10 distinct values counts as low cardinality; the rest are high.
low_card_cat_cols = [col for col, n_unique in object_cardinality.items()
                     if n_unique < 10]
high_card_cat_cols = [col for col, n_unique in object_cardinality.items()
                      if n_unique >= 10]

num_cols = [col for col in X_train.columns
            if X_train[col].dtype in ['int64', 'float64']]

final_features = low_card_cat_cols + num_cols

# Restrict every split to the selected features (high-cardinality columns are
# dropped).
X_train, X_val = X_train[final_features], X_val[final_features]
X_test = test_data[final_features]

**Imputer and Encoder**

# Numeric columns: replace missing values with the column median.
imputer = SimpleImputer(strategy='median')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transform = Pipeline(steps=cat_steps)

# Send each column group through its own transformer.
column_routes = [
    ('num', imputer, num_cols),
    ('cat', cat_transform, low_card_cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Imputer for the numeric columns: missing values become the column median.
imputer = SimpleImputer(strategy='median')

# Numeric columns of the encoded training frame (int64 / float64 only).
numeric_dtypes = ['int64', 'float64']
num_cols = [col for col in train_encoded.columns
            if train_encoded[col].dtype in numeric_dtypes]

# Only the numeric columns are listed, so they are the only ones handed to the
# imputer by this transformer.
preprocessor = ColumnTransformer(transformers=[('num', imputer, num_cols)])

In [None]:
from xgboost import XGBRegressor


def _fit_xgb_pipeline(n_estimators):
    """Fit preprocessor + XGBRegressor on the encoded training split.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds for the regressor.

    Returns
    -------
    Pipeline
        The fitted pipeline (steps 'preprocessor' and 'model').
    """
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(random_state=0, n_estimators=n_estimators,
                               learning_rate=0.055)),
    ])
    pipeline.fit(train_encoded, y_train)
    return pipeline


# Find best nr of estimators.
# Start from +inf rather than a fixed MAE threshold so the first candidate is
# always accepted and `best_est` can never be left at a placeholder value.
min_mae = float('inf')
best_est = None
clf = None

# The grid starts at 200: n_estimators=0 would mean a model with no boosting
# rounds, which is not a meaningful candidate.
for i in range(200, 2000, 200):
    candidate = _fit_xgb_pipeline(i)
    # Score once per candidate and reuse the value.
    mae = mean_absolute_error(y_val, candidate.predict(valid_encoded))
    print('MAE:', mae)
    if mae < min_mae:
        min_mae, best_est, clf = mae, i, candidate

# `clf` is the best fitted pipeline, so no refit is needed. All candidates share
# the same `preprocessor` object, which is always fitted on the same training
# frame, so its state is identical for every candidate.
model = clf.named_steps['model']
predictions = clf.predict(valid_encoded)
print('Best MAE:', mean_absolute_error(y_val, predictions))
print('Best est:', best_est)

In [None]:
# Predict sale prices for the encoded test set with the fitted pipeline.
preds_test = clf.predict(test_encoded)

# Save test predictions to file: one row per test Id, written without the
# DataFrame's own positional index.
output = (
    pd.Series(preds_test, index=X_test.index, name='SalePrice')
    .rename_axis('Id')
    .reset_index()
)
output.to_csv('submission_house_prices.csv', index=False)