# House Prices â€” **LightGBM Only** (Patched)

**With Outlier Removal + Log Features + Defensive Target Mean Encoding + Diagnostics**

Run all cells on Kaggle or locally (place `train.csv` / `test.csv` in the working dir).

In [1]:
# %% [imports]
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

RANDOM_STATE = 42
N_SPLITS = 10
np.random.seed(RANDOM_STATE)

# RMSE helper (compatible with older sklearn)
def rmse(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))


In [2]:
# %% [load data]


train_path =  'train.csv'
test_path  = 'test.csv'

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)
print('Raw shapes:', train.shape, test.shape)


Raw shapes: (1460, 81) (1459, 80)


In [3]:
# %% [outliers + target]
# Remove classic competition outliers
outlier_idx = train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index
if len(outlier_idx) > 0:
    print('Dropping outliers:', len(outlier_idx))
    train = train.drop(outlier_idx).reset_index(drop=True)
else:
    print('No classic outliers found to drop.')

# Target in log space
y = np.log1p(train['SalePrice'])
X = train.drop(columns=['SalePrice']).copy()
X_test = test.copy()


Dropping outliers: 2


In [4]:
# %% [combine]
full = pd.concat([X, X_test], axis=0, ignore_index=True)
print('Combined shape:', full.shape)


Combined shape: (2917, 80)


In [5]:
# %% [preprocess & features]
# 1) Domain-aware fills
none_fill = ['Alley','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
             'FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond',
             'PoolQC','Fence','MiscFeature']
for col in none_fill:
    if col in full.columns:
        full[col] = full[col].fillna('None')

zero_fill = ['MasVnrArea','BsmtFullBath','BsmtHalfBath','GarageCars','GarageArea',
             'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']
for col in zero_fill:
    if col in full.columns:
        full[col] = full[col].fillna(0)

if 'GarageYrBlt' in full.columns and 'YearBuilt' in full.columns:
    full['GarageYrBlt'] = full['GarageYrBlt'].fillna(full['YearBuilt'])

# 2) Ordinal mappings for quality-like columns
ordinal_maps = {
    'ExterQual': ['Po','Fa','TA','Gd','Ex'],
    'ExterCond': ['Po','Fa','TA','Gd','Ex'],
    'BsmtQual': ['None','Po','Fa','TA','Gd','Ex'],
    'BsmtCond': ['None','Po','Fa','TA','Gd','Ex'],
    'HeatingQC': ['Po','Fa','TA','Gd','Ex'],
    'KitchenQual': ['Po','Fa','TA','Gd','Ex'],
    'FireplaceQu': ['None','Po','Fa','TA','Gd','Ex'],
    'GarageQual': ['None','Po','Fa','TA','Gd','Ex'],
    'GarageCond': ['None','Po','Fa','TA','Gd','Ex']
}
for col, order in ordinal_maps.items():
    if col in full.columns:
        cat_type = pd.api.types.CategoricalDtype(categories=order, ordered=True)
        full[col] = full[col].astype(cat_type).cat.codes.replace(-1, np.nan)

# 3) Rare label grouping for object categoricals
cat_obj_cols = full.select_dtypes(include=['object']).columns.tolist()
for col in cat_obj_cols:
    freq = full[col].value_counts(normalize=True)
    rare = freq[freq < 0.01].index
    if len(rare) > 0:
        full[col] = full[col].replace(list(rare), 'Rare')

# 4) Feature engineering

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # Composite
    df['TotalBath'] = (df.get('FullBath',0) + 0.5*df.get('HalfBath',0) +
                       df.get('BsmtFullBath',0) + 0.5*df.get('BsmtHalfBath',0))
    df['TotalSF'] = df.get('GrLivArea',0) + df.get('TotalBsmtSF',0)
    df['PorchSF'] = (df.get('OpenPorchSF',0) + df.get('EnclosedPorch',0) +
                     df.get('3SsnPorch',0) + df.get('ScreenPorch',0))
    # Ages
    df['AgeAtSale'] = df.get('YrSold',0) - df.get('YearBuilt',0)
    df['RemodAgeAtSale'] = df.get('YrSold',0) - df.get('YearRemodAdd',0)
    df['GarageAge'] = df.get('YrSold',0) - df.get('GarageYrBlt',0)
    # Interaction
    if 'OverallQual' in df.columns and 'OverallCond' in df.columns:
        df['QualCond'] = df['OverallQual'] * df['OverallCond']
    # Log1p transforms as additional features
    for c in ['LotArea','TotalBsmtSF','GrLivArea','1stFlrSF','GarageArea']:
        if c in df.columns:
            df['log1p_' + c] = np.log1p(df[c].clip(lower=0))
    return df

full = make_features(full)
print('After FE shape:', full.shape)


After FE shape: (2917, 92)


In [6]:
# %% [split & dtypes]
X_train = full.iloc[:len(X), :].copy()
X_test  = full.iloc[len(X):, :].copy()

# LightGBM can consume pandas 'category' for remaining objects
obj_cols_train = X_train.select_dtypes(include=['object']).columns
obj_cols_test  = X_test.select_dtypes(include=['object']).columns
X_train[obj_cols_train] = X_train[obj_cols_train].astype('category')
X_test[obj_cols_test]   = X_test[obj_cols_test].astype('category')

print('Train features:', X_train.shape, 'Test features:', X_test.shape)


Train features: (1458, 92) Test features: (1459, 92)


In [7]:
# %% [defensive target mean encoder]
class TargetMeanEncoder:
    """
    Fold-safe target mean encoder with additive smoothing and defensive casting.
    - Casts columns to strings internally to avoid category/dtype mismatches across folds.
    - Unseen categories fallback to global mean.
    """
    def __init__(self, cols, k: float = 10.0, verbose: bool = True):
        self.cols = list(cols) if cols is not None else []
        self.k = float(k)
        self.verbose = verbose
        self.global_mean_ = None
        self.maps_ = {}

    def _to_object(self, df, col):
        s = df[col]
        return s.astype(str).fillna('___NA___')

    def fit(self, X: pd.DataFrame, y: np.ndarray):
        if len(self.cols) == 0:
            return self
        df = X.copy()
        df['__target__'] = y
        self.global_mean_ = float(df['__target__'].mean())
        self.maps_.clear()
        for col in self.cols:
            if col not in df.columns:
                if self.verbose:
                    print(f"[TME] WARN: Column '{col}' missing during fit; skipping.")
                continue
            s = self._to_object(df, col)
            stats = pd.DataFrame({'key': s, 'target': df['__target__']})
            grp = stats.groupby('key')['target'].agg(['mean','count'])
            grp['tmean'] = (grp['mean'] * grp['count'] + self.k * self.global_mean_) / (grp['count'] + self.k)
            self.maps_[col] = grp['tmean'].to_dict()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        df = X.copy()
        for col in self.cols:
            if col not in df.columns:
                if self.verbose:
                    print(f"[TME] WARN: Column '{col}' missing during transform; skipping.")
                continue
            s = self._to_object(df, col)
            m = self.maps_.get(col, {})
            encoded = s.map(m).fillna(self.global_mean_)
            df[col + '_TME'] = encoded.astype('float64')
        return df


In [8]:
# %% [cv & training]
params = dict(
    n_estimators=9000,
    learning_rate=0.02,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=RANDOM_STATE
)

# High-cardinality candidates
high_card_cols = [c for c in ['Neighborhood','Exterior1st','Exterior2nd','Condition1','Condition2','SaleType'] if c in X_train.columns]
print('TME columns candidate:', high_card_cols)
print('Dtypes for TME cols in X_train:\n', X_train[high_card_cols].dtypes)

# CV setup
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

oof = np.zeros(len(X_train), dtype=np.float64)
ptest_log = np.zeros(len(X_test), dtype=np.float64)

fold = 0
for trn_idx, val_idx in kf.split(X_train):
    fold += 1
    X_tr, X_val = X_train.iloc[trn_idx].copy(), X_train.iloc[val_idx].copy()
    y_tr, y_val = y[trn_idx], y[val_idx]

    # Diagnostics before encoding
    print(f"[Fold {fold}] Checking TME columns...")
    for c in high_card_cols:
        present_tr = c in X_tr.columns
        present_val = c in X_val.columns
        present_tst = c in X_test.columns
        dtype_tr = X_tr[c].dtype if present_tr else None
        dtype_val = X_val[c].dtype if present_val else None
        dtype_tst = X_test[c].dtype if present_tst else None
        print(f"  {c}: train_present={present_tr}, val_present={present_val}, test_present={present_tst}, dtypes=({dtype_tr}, {dtype_val}, {dtype_tst})")

    # Fit & transform TME
    enc = TargetMeanEncoder(cols=high_card_cols, k=10.0, verbose=True)
    enc.fit(X_tr, y_tr)
    X_tr  = enc.transform(X_tr)
    X_val = enc.transform(X_val)
    X_tst = enc.transform(X_test.copy())

    # Per-fold numeric imputation
    num_cols_tr = X_tr.select_dtypes(include=['int64','float64']).columns
    trn_medians = X_tr[num_cols_tr].median()
    for df_ in (X_tr, X_val, X_tst):
        cols_ = df_.select_dtypes(include=['int64','float64']).columns
        df_[cols_] = df_[cols_].fillna(trn_medians)

    # Ensure remaining objects convert to category for LGB
    for df_ in (X_tr, X_val, X_tst):
        obj_cols_ = df_.select_dtypes(include=['object']).columns
        df_[obj_cols_] = df_[obj_cols_].astype('category')

    # Train
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=300)]
    )

    # Predict
    oof[val_idx] = model.predict(X_val)
    ptest_log += model.predict(X_tst) / N_SPLITS

    print('Fold {} RMSE(log): {:.5f}'.format(fold, rmse(y_val, oof[val_idx])))

print('OOF RMSE(log): {:.5f}'.format(rmse(y, oof)))


TME columns candidate: ['Neighborhood', 'Exterior1st', 'Exterior2nd', 'Condition1', 'Condition2', 'SaleType']
Dtypes for TME cols in X_train:
 Neighborhood    category
Exterior1st     category
Exterior2nd     category
Condition1      category
Condition2      category
SaleType        category
dtype: object
[Fold 1] Checking TME columns...
  Neighborhood: train_present=True, val_present=True, test_present=True, dtypes=(category, category, category)
  Exterior1st: train_present=True, val_present=True, test_present=True, dtypes=(category, category, category)
  Exterior2nd: train_present=True, val_present=True, test_present=True, dtypes=(category, category, category)
  Condition1: train_present=True, val_present=True, test_present=True, dtypes=(category, category, category)
  Condition2: train_present=True, val_present=True, test_present=True, dtypes=(category, category, category)
  SaleType: train_present=True, val_present=True, test_present=True, dtypes=(category, category, category)
[Lig

In [9]:
# %% [submission]
preds = np.expm1(ptest_log)

submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': preds
})

# Health checks
assert submission.shape[0] == 1459, 'Unexpected submission length'
assert submission['SalePrice'].isna().sum() == 0, 'NaNs present in SalePrice'
print('Submission stats:\n', submission['SalePrice'].describe())

submission.to_csv('submission.csv', index=False)
print('Saved submission.csv')


Submission stats:
 count      1459.000000
mean     177338.306779
std       77601.577762
min       52980.010609
25%      127259.752588
50%      156176.547385
75%      206257.052697
max      545325.074315
Name: SalePrice, dtype: float64
Saved submission.csv


### Notes
- Defensive TME casts columns to strings internally, avoiding category/dtype mismatches across folds.
- Unseen categories fall back to the global mean.
- Watch the printed diagnostics if a column goes missing or changes dtype.
- If OOF improves, public RMSLE usually follows; consider lowering `learning_rate` to 0.01 with more estimators for extra stability.
