# 2. Feature Engineering

In this notebook, we will perform feature engineering on the raw housing data. This includes handling categorical variables, dealing with skewed data, removing outliers, and scaling the features. The goal is to prepare the data for model training.

In [1]:
import os
import pandas as pd
import numpy as np

## Data Loading

In [2]:
input_data_dir = os.path.join("..", "data", "raw")
df = pd.read_csv(os.path.join(input_data_dir, "Housing.csv"))
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## Data Preprocessing

### Handling Binary Categorical Features

In [3]:
binary_columns = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea'
]

# Convert binary categorical features to numeric
for col in binary_columns:
    df[col] = df[col].map({'yes': 1, 'no': 0})

print("DataFrame after converting binary columns:")
df.head()

DataFrame after converting binary columns:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


### Handling Multi-Level Categorical Features

In [4]:
# One-hot encode the furnishingstatus column
dummies = pd.get_dummies(df['furnishingstatus'], drop_first=True)

df = pd.concat([df, dummies], axis=1)
df = df.drop("furnishingstatus", axis=1)

print("DataFrame after optimal One-Hot Encoding:")
df.head()

DataFrame after optimal One-Hot Encoding:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,False,False
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,False,False
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,True,False
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,False,False
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,False,False


### Handling Skewness in Numerical Features

In [5]:
# Log transform skewed numerical features to make them more normally distributed
df['price'] = np.log(df['price'])
df['area'] = np.log(df['area'])

### Handling Outliers

In [6]:
# Remove outliers based on the 99th percentile of the area
q99 = df['area'].quantile(0.99)
print(f"Original number of houses: {len(df)}")

df = df[df['area'] < q99]
print(f"Number of houses after removing outliers: {len(df)}")

Original number of houses: 545
Number of houses after removing outliers: 539


## Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

# Scale the numerical features (excluding the target variable 'price')
numeric_vars_X = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

scaler = StandardScaler()

df[numeric_vars_X] = scaler.fit_transform(df[numeric_vars_X])

print("DataFrame after scaling:")
df.head()

DataFrame after scaling:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,16.403275,1.193079,1.410843,1.432564,1.369077,1,0,0,0,1,1.527151,1,False,False
1,16.321036,1.685023,1.410843,5.443742,2.520298,1,0,0,0,1,2.691415,0,False,False
2,16.321036,1.961022,0.050477,1.432564,0.217856,1,0,1,0,0,1.527151,1,True,False
3,16.318175,1.221053,1.410843,1.432564,0.217856,1,0,1,0,1,2.691415,1,False,False
4,16.250001,1.193079,1.410843,-0.573025,0.217856,1,1,1,0,1,1.527151,0,False,False


## Saving the Processed Data

In [8]:
output_data_dir = os.path.join("..", "data", "interim")
df.to_csv(os.path.join(output_data_dir, "feature_engineered.csv"), index=False)