In [32]:
import os
import pandas as pd
import numpy as np

In [33]:
input_data_dir = os.path.join("..", "data", "raw")
df = pd.read_csv(os.path.join(input_data_dir, "Housing.csv"))
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [34]:
binary_columns = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea'
]

for col in binary_columns:
    df[col] = df[col].map({'yes': 1, 'no': 0})

print("DataFrame after converting binary columns:")
df.head()

DataFrame after converting binary columns:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [35]:
# Select relevant features
numerical_columns = [
    'price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking',
]

df['price'] = np.log(df['price'])
df['area'] = np.log(df['area'])

# # Feature Creation : price_per_sqft
df['price_per_sqft'] = df['price'] / df['area']
# df['area_per_bathrooms'] = df['area'] / (df['bathrooms'] + 1e-6)


In [36]:

# 1. Remove outliers
# Find the 99th percentile of the area
q99 = df['area'].quantile(0.99)
print(f"Original number of houses: {len(df)}")

df = df[df['area'] < q99]
print(f"Number of houses after removing outliers: {len(df)}")

Original number of houses: 545
Number of houses after removing outliers: 539


In [37]:
dummies = pd.get_dummies(df['furnishingstatus'], drop_first=True)

df = pd.concat([df, dummies], axis=1)
df = df.drop("furnishingstatus", axis=1)

print("DataFrame after optimal One-Hot Encoding:")
df.head()

DataFrame after optimal One-Hot Encoding:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,price_per_sqft,semi-furnished,unfurnished
0,16.403275,8.911934,4,2,3,1,0,0,0,1,2,1,1.840596,False,False
1,16.321036,9.100526,4,4,4,1,0,0,0,1,3,0,1.793417,False,False
2,16.321036,9.206332,3,2,2,1,0,1,0,0,2,1,1.772805,True,False
3,16.318175,8.922658,4,2,2,1,0,1,0,1,3,1,1.828847,False,False
4,16.250001,8.911934,4,1,2,1,1,1,0,1,2,0,1.823398,False,False


In [38]:
from sklearn.preprocessing import StandardScaler

# Since we have already split y (price), we should not include it in the list for X
numeric_vars_X = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price_per_sqft']

scaler = StandardScaler()

df[numeric_vars_X] = scaler.fit_transform(df[numeric_vars_X])

print("X_train after scaling:")
df

X_train after scaling:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,price_per_sqft,semi-furnished,unfurnished
0,2.976180,1.193079,1.410843,1.432564,1.369077,1,0,0,0,1,1.527151,1,0.415930,False,False
1,2.753754,1.685023,1.410843,5.443742,2.520298,1,0,0,0,1,2.691415,0,-0.285074,False,False
2,2.753754,1.961022,0.050477,1.432564,0.217856,1,0,1,0,0,1.527151,1,-0.591323,True,False
3,2.746016,1.221053,1.410843,1.432564,0.217856,1,0,1,0,1,2.691415,1,0.241352,False,False
4,2.561627,1.193079,1.410843,-0.573025,0.217856,1,1,1,0,1,1.527151,0,0.160388,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,-2.403192,-1.169109,-1.309888,-0.573025,-0.933364,1,0,1,0,0,1.527151,0,-0.181906,False,True
541,-2.482894,-1.751183,0.050477,-0.573025,-0.933364,0,0,0,0,0,-0.801376,0,0.528760,True,False
542,-2.509271,-0.679068,-1.309888,-0.573025,-0.933364,1,0,0,0,0,-0.801376,0,-0.866300,False,True
543,-2.509271,-1.248562,0.050477,-0.573025,-0.933364,0,0,0,0,0,-0.801376,0,-0.152814,False,False


In [39]:
output_data_dir = os.path.join("..", "data", "interim")
df.to_csv(os.path.join(output_data_dir, "feature_engineered.csv"), index=False)