# Notebook 04: Feature Engineering

Create new features, encode categorical variables, scale numerical features.
Prepare data for model training.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

ames_clean = pd.read_csv('data/processed/ames_cleaned.csv')
malaysia_clean = pd.read_csv('data/processed/malaysia_cleaned.csv')

print("Cleaned data loaded for feature engineering")
print(f"Ames: {ames_clean.shape} | Malaysia: {malaysia_clean.shape}")

Cleaned data loaded for feature engineering
Ames: (2793, 82) | Malaysia: (1877, 8)


## Ames Feature Engineering

In [2]:
# Create a copy
ames_fe = ames_clean.copy()

# Feature 1: Total Area
ames_fe['Total Area'] = ames_fe['Total Bsmt SF'] + ames_fe['Gr Liv Area']

# Feature 2: Living Area to Lot Area Ratio
ames_fe['LivingArea_LotArea_Ratio'] = ames_fe['Gr Liv Area'] / (ames_fe['Lot Area'] + 1)

# Feature 3: Garage Area per Car
ames_fe['GarageArea_per_Car'] = ames_fe['Garage Area'] / (ames_fe['Garage Cars'] + 1)

# Feature 4: House Age
ames_fe['HouseAge'] = 2024 - ames_fe['Year Built']

# Feature 5: Renovation Indicator
ames_fe['WasRenovated'] = (ames_fe['Year Remod/Add'] > ames_fe['Year Built']).astype(int)

# Feature 6: Has Fireplace
ames_fe['HasFireplace'] = (ames_fe['Fireplaces'] > 0).astype(int)

print(f"Created {6} new features for Ames")
print(f"New shape: {ames_fe.shape}")

Created 6 new features for Ames
New shape: (2793, 88)


In [3]:
ames_fe.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

## Malaysia Feature Engineering

In [4]:
# Create a copy
malaysia_fe = malaysia_clean.copy()
malaysia_fe.columns

Index(['Township', 'Area', 'State', 'Tenure', 'Type', 'Median_Price',
       'Median_PSF', 'Transactions'],
      dtype='object')

The cell under md creates too many "hints" for future models, that affects results dramatically.\
This way, models do not look for relations in data, but will just calculate the key variable by multiplying `MedianPSF` by `Type`.

In [5]:
'''
# Feature 1: Log Price
malaysia_fe['LogPrice'] = np.log1p(malaysia_fe['Median_Price'])

# Feature 2: Log PSF
if 'MedianPSF' in malaysia_fe.columns:
    malaysia_fe['LogPSF'] = np.log1p(malaysia_fe['MedianPSF'])

# Feature 3: Price per unit area approximation
if 'MedianPSF' in malaysia_fe.columns:
    malaysia_fe['PricePerArea'] = malaysia_fe['Median_Price'] / (malaysia_fe['MedianPSF'] + 1)

print(f"Created new features for Malaysia")
print(f"New shape: {malaysia_fe.shape}")
'''

'\n# Feature 1: Log Price\nmalaysia_fe[\'LogPrice\'] = np.log1p(malaysia_fe[\'Median_Price\'])\n\n# Feature 2: Log PSF\nif \'MedianPSF\' in malaysia_fe.columns:\n    malaysia_fe[\'LogPSF\'] = np.log1p(malaysia_fe[\'MedianPSF\'])\n\n# Feature 3: Price per unit area approximation\nif \'MedianPSF\' in malaysia_fe.columns:\n    malaysia_fe[\'PricePerArea\'] = malaysia_fe[\'Median_Price\'] / (malaysia_fe[\'MedianPSF\'] + 1)\n\nprint(f"Created new features for Malaysia")\nprint(f"New shape: {malaysia_fe.shape}")\n'

**Note**: if in future will be required to get from this dataset as highest score as possible, just uncomment section above. The RÂ² Score will be above 95%.

## Encode Categorical Variables - Ames

In [6]:
# Convert categorical to numeric using pd.get_dummies
categorical_cols_ames = ames_fe.select_dtypes(include=['object', 'category']).columns

print(f"Categorical columns in Ames: {len(categorical_cols_ames)}")
print(f"Examples: {list(categorical_cols_ames[:5])}")

# One-hot encoding
ames_encoded = pd.get_dummies(ames_fe, columns=categorical_cols_ames, drop_first=True)

print(f"\nAfter encoding:")
print(f"  Shape: {ames_encoded.shape}")
print(f"  All numeric: {ames_encoded.select_dtypes(include=['object']).shape == 0}")

Categorical columns in Ames: 43
Examples: ['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour']

After encoding:
  Shape: (2793, 269)
  All numeric: False


## Encode Categorical Variables - Malaysia

In [7]:
# Convert categorical to numeric
categorical_cols_malaysia = malaysia_fe.select_dtypes(include=['object', 'category']).columns

if len(categorical_cols_malaysia) > 0:
    print(f"Categorical columns in Malaysia: {list(categorical_cols_malaysia)}")
    
    # One-hot encoding
    malaysia_encoded = pd.get_dummies(malaysia_fe, columns=categorical_cols_malaysia, drop_first=True)
    
    print(f"\nAfter encoding:")
    print(f"  Shape: {malaysia_encoded.shape}")
else:
    malaysia_encoded = malaysia_fe.copy()
    print("No categorical columns to encode")

Categorical columns in Malaysia: ['Township', 'Area', 'State', 'Tenure', 'Type']

After encoding:
  Shape: (1877, 2181)


## Scale Numerical Features

In [8]:
# Prepare Ames for scaling (exclude target)
X_ames = ames_encoded.drop(['SalePrice', 'Id'], axis=1, errors='ignore')
y_ames = ames_encoded['SalePrice'] if 'SalePrice' in ames_encoded.columns else None

# Scale
scaler_ames = StandardScaler()
X_ames_scaled = scaler_ames.fit_transform(X_ames)
X_ames_scaled_df = pd.DataFrame(X_ames_scaled, columns=X_ames.columns)

print(f"Ames features scaled:")
print(f"  Original range: [{X_ames.values.min():.2f}, {X_ames.values.max():.2f}]")
print(f"  Scaled range: [{X_ames_scaled.min():.2f}, {X_ames_scaled.max():.2f}]")

# Prepare Malaysia for scaling
X_malaysia = malaysia_encoded.drop(['Median_Price', 'Is_Outlier'], axis=1, errors='ignore')
y_malaysia = malaysia_encoded['Median_Price'] if 'Median_Price' in malaysia_encoded.columns else None

# Scale
scaler_malaysia = StandardScaler()
X_malaysia_scaled = scaler_malaysia.fit_transform(X_malaysia)
X_malaysia_scaled_df = pd.DataFrame(X_malaysia_scaled, columns=X_malaysia.columns)

print(f"\nMalaysia features scaled:")
print(f"  Original range: [{X_malaysia.values.min():.2f}, {X_malaysia.values.max():.2f}]")
print(f"  Scaled range: [{X_malaysia_scaled.min():.2f}, {X_malaysia_scaled.max():.2f}]")

Ames features scaled:
  Original range: [0.00, 1007100110.00]
  Scaled range: [-15.22, 52.84]

Malaysia features scaled:
  Original range: [0.00, 1200.00]
  Scaled range: [-1.91, 43.31]


## Save Engineered Features

In [None]:
# Save scaled features
X_ames_scaled_df.to_csv('data/processed/ames_X_scaled.csv', index=False)
X_malaysia_scaled_df.to_csv('data/processed/malaysia_X_scaled.csv', index=False)

# Save targets
if y_ames is not None:
    y_ames.to_csv('data/processed/ames_y.csv', index=False, header=['SalePrice'])
if y_malaysia is not None:
    y_malaysia.to_csv('data/processed/malaysia_y.csv', index=False, header=['Median_Price'])

print("Engineered features saved:")
print(f"  - ames_X_scaled.csv ({X_ames_scaled_df.shape})")
print(f"  - ames_y.csv")
print(f"  - malaysia_X_scaled.csv ({X_malaysia_scaled_df.shape})")
print(f"  - malaysia_y.csv")

Engineered features saved:
  - ames_X_scaled.csv ((2793, 268))
  - ames_y.csv
  - malaysia_X_scaled.csv ((1877, 2180))
  - malaysia_y.csv


## Feature Engineering Summary

In [10]:
print("FEATURE ENGINEERING SUMMARY")

print(f"\nAmes Dataset:")
print(f"  Original features: {ames_clean.shape}")
print(f"  After engineering: {ames_fe.shape}")
print(f"  After encoding: {ames_encoded.shape}")
print(f"  New engineered features: TotalArea, LivingArea_LotArea_Ratio,")
print(f"                          GarageArea_per_Car, HouseAge, WasRenovated, HasFireplace")

print(f"\nMalaysia Dataset:")
print(f"  Original features: {malaysia_clean.shape}")
print(f"  After engineering: {malaysia_fe.shape}")
print(f"  After encoding: {malaysia_encoded.shape}")

FEATURE ENGINEERING SUMMARY

Ames Dataset:
  Original features: (2793, 82)
  After engineering: (2793, 88)
  After encoding: (2793, 269)
  New engineered features: TotalArea, LivingArea_LotArea_Ratio,
                          GarageArea_per_Car, HouseAge, WasRenovated, HasFireplace

Malaysia Dataset:
  Original features: (1877, 8)
  After engineering: (1877, 8)
  After encoding: (1877, 2181)
