# Task 08 – House Prices Feature Engineering & Random Forest

This notebook implements the full pipeline for Skillytixs Task 08:
- Load data
- Clean & engineer features
- Train a RandomForestRegressor
- Generate a Kaggle-ready submission


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold

# Single seed reused by the KFold splitter and the random forest so that
# cross-validation folds and the fitted model are reproducible across runs.
RANDOM_STATE = 42


In [None]:
# Read the training and test tables from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keep the row identifiers aside; test_id is needed for the submission file.
train_id, test_id = train["Id"], test["Id"]

# Split the training table into predictors (X) and the target (y).
X = train.drop(columns=["SalePrice"])
y = train["SalePrice"]

print(train.shape, test.shape)
train.head()


In [None]:
def combine_train_test(X, test):
    """Stack ``X`` on top of ``test`` row-wise and renumber the rows from 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Training predictors (rows come first in the result).
    test : pandas.DataFrame
        Test rows (appended after the rows of ``X``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of both inputs and a fresh 0..n-1 index.
        Column order is not sorted.
    """
    frames = [X, test]
    return pd.concat(frames, axis=0, sort=False, ignore_index=True)

def fix_some_nas_with_domain_logic(all_data):
    """Fill selected missing values with column-specific replacements.

    The rules applied, each only when the column exists:

    * columns listed in ``none_cols`` get the literal string ``"None"``
      (the code treats a missing value there as its own category rather
      than as unknown);
    * ``GarageYrBlt`` falls back to the row's ``YearBuilt``;
    * ``MasVnrArea`` falls back to ``0``.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame to clean. It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the replacements applied. Columns not mentioned
        above are returned unchanged (including any remaining NaNs).
    """
    # Work on a copy so the caller's frame is left untouched.
    all_data = all_data.copy()

    none_cols = [
        "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
        "GarageType", "GarageFinish", "GarageQual", "GarageCond",
        "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
        "MasVnrType"
    ]
    for col in (c for c in none_cols if c in all_data.columns):
        all_data[col] = all_data[col].fillna("None")

    # Both columns must exist: the fallback value is read from YearBuilt.
    if "GarageYrBlt" in all_data.columns and "YearBuilt" in all_data.columns:
        all_data["GarageYrBlt"] = all_data["GarageYrBlt"].fillna(all_data["YearBuilt"])

    if "MasVnrArea" in all_data.columns:
        all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

    return all_data

def handle_missing_values(all_data):
    """Impute every remaining NaN: medians for numeric columns, a label otherwise.

    Numeric columns (``np.number`` dtypes) are filled with their own column
    median; all other columns are filled with the string ``"Missing"``.
    A numeric column with no observed values has a NaN median and therefore
    stays NaN.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame to impute. It is not modified; the function works on a copy.

    Returns
    -------
    tuple
        ``(imputed_frame, num_cols, cat_cols)`` where ``num_cols`` and
        ``cat_cols`` are the pandas ``Index`` objects of numeric and
        non-numeric column names respectively.
    """
    # Work on a copy so the caller's frame is left untouched.
    all_data = all_data.copy()

    num_cols = all_data.select_dtypes(include=[np.number]).columns
    cat_cols = all_data.select_dtypes(exclude=[np.number]).columns

    # Compute all medians once, before any column is overwritten.
    medians = all_data[num_cols].median()
    for col in num_cols:
        all_data[col] = all_data[col].fillna(medians[col])

    for col in cat_cols:
        all_data[col] = all_data[col].fillna("Missing")

    return all_data, num_cols, cat_cols

def apply_log_transform_to_skewed(all_data, num_cols, skew_thresh=0.75):
    """Apply ``log1p`` to numeric columns whose skewness exceeds a threshold.

    Skewness is computed for every column in ``num_cols`` except ``"Id"``.
    Columns with skewness strictly greater than ``skew_thresh`` are
    transformed; if such a column contains negative values it is first
    shifted so that its minimum becomes 0, keeping ``log1p`` defined.
    The list of transformed columns is printed.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame holding the columns. It is not modified; the function works
        on a copy.
    num_cols : sequence of str
        Names of the numeric columns to inspect.
    skew_thresh : float, default 0.75
        Columns with skewness above this value are log-transformed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns replaced by their log1p values.
    """
    # Work on a copy so the caller's frame is left untouched.
    all_data = all_data.copy()

    skewness = all_data[num_cols].drop(columns=["Id"], errors="ignore").skew()
    skewed_features = skewness[skewness > skew_thresh].index.tolist()
    print("Skewed numeric features:", skewed_features)

    for col in skewed_features:
        values = all_data[col]
        min_val = values.min()
        if min_val < 0:
            # Shift so the smallest value maps to 0 before taking the log.
            values = values - min_val
        all_data[col] = np.log1p(values)

    return all_data

def encode_categoricals(all_data, cat_cols):
    """One-hot encode ``cat_cols``, dropping the first level of each column.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame containing the columns to encode.
    cat_cols : sequence of str
        Names of the columns to replace with indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each column of ``cat_cols`` is replaced by its
        dummy columns (first level omitted). The resulting shape is printed.
    """
    dummy_options = {"columns": cat_cols, "drop_first": True}
    encoded = pd.get_dummies(all_data, **dummy_options)
    print("Shape after one-hot encoding:", encoded.shape)
    return encoded


In [None]:
# Stack training predictors and test rows so every transform (imputation,
# skew correction, one-hot encoding) produces the same columns for both.
all_data = combine_train_test(X, test)
print("Combined shape:", all_data.shape)

all_data = fix_some_nas_with_domain_logic(all_data)

all_data, num_cols, cat_cols = handle_missing_values(all_data)

all_data = apply_log_transform_to_skewed(all_data, num_cols)

all_data_encoded = encode_categoricals(all_data, cat_cols)

# "Id" is a row identifier, not a predictor. It is still present here because
# X was built by dropping only "SalePrice", so remove it from the model inputs.
model_features = all_data_encoded.drop(columns=["Id"], errors="ignore")

# The first n_train rows came from X, the remainder from test.
n_train = X.shape[0]
X_train = model_features.iloc[:n_train, :].copy()
X_test = model_features.iloc[n_train:, :].copy()

# Sanity checks: the split must line up with the target and the test table.
assert X_train.shape[0] == y.shape[0], "training features and target are misaligned"
assert X_test.shape[0] == test.shape[0], "test features lost or gained rows"

# Model the target on the log1p scale; predictions are inverted with expm1.
y_log = np.log1p(y)

X_train.shape, X_test.shape


In [None]:
# max_features=1.0 makes every feature a candidate at each split. This is what
# the former "auto" setting meant for regressors; "auto" was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=1.0,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# 5-fold shuffled cross-validation on the log-scale target. The scorer returns
# negated RMSE (higher is better), so flip the sign before reporting.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scores = cross_val_score(
    rf,
    X_train,
    y_log,
    scoring="neg_root_mean_squared_error",
    cv=kf,
    n_jobs=-1
)
rmse_scores = -scores
print("RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Std RMSE:", rmse_scores.std())

# cross_val_score fits clones, so fit the estimator itself on all training rows.
rf.fit(X_train, y_log)


In [None]:
# Pair each fitted importance with its column name and rank them, largest first.
features = X_train.columns
importances = rf.feature_importances_

feat_imp = (
    pd.Series(data=importances, index=features)
    .sort_values(ascending=False)
)
feat_imp.head(20)


In [None]:
# The model was fitted on log1p(SalePrice), so invert with expm1.
y_test_log_pred = rf.predict(X_test)
y_test_pred = np.expm1(y_test_log_pred)

# Two-column table: the test row identifier and its predicted price.
submission = pd.DataFrame({"Id": test_id}).assign(SalePrice=y_test_pred)

submission.to_csv("submission.csv", index=False)
submission.head()
