# Data Preprocessing & Feature Engineering

This notebook implements the final data cleaning and feature engineering pipeline
based on insights obtained during exploratory data analysis (EDA).


In [24]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [25]:
# Read the raw train and test splits (train first, then test) and report
# their (rows, columns) shapes as a quick sanity check.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)

print(train_df.shape, test_df.shape)

(16209, 21) (5404, 20)


Create a `log_price` column (`log1p` of `price`) so that the target has a more symmetric distribution than the raw property prices.

In [26]:
# Target on a log scale: log1p(price) == log(1 + price).
train_df["log_price"] = train_df["price"].pipe(np.log1p)

In [27]:
# Columns used as tabular model inputs; this order is also the column order
# of the scaled feature matrices built further down.
TABULAR_FEATURES = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "sqft_living15",
    "lat",
    "long",
]

In [28]:
# Per-feature medians are computed on the training split only and reused to
# fill missing values in both splits, so no test statistics are used.
train_medians = train_df[TABULAR_FEATURES].median()
for frame in (train_df, test_df):
    frame[TABULAR_FEATURES] = frame[TABULAR_FEATURES].fillna(train_medians)

In [29]:
def build_image_paths(df, image_dir):
    """Return one image path per row of ``df``, in row order.

    Each path has the form ``{image_dir}/{id}_{index label}.png``, where
    ``id`` comes from the ``id`` column and the index label is the row's
    label in ``df.index``.

    The column and the index are iterated directly instead of using
    ``df.apply(..., axis=1)``: row-wise ``apply`` builds a Series per row
    (slow), and if the frame held only numeric columns it would upcast an
    integer ``id`` to float, which would put a ``.0`` into the file name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``id`` column.
    image_dir : str
        Directory prefix for the images (no trailing slash).

    Returns
    -------
    list of str
        Paths aligned positionally with the rows of ``df``.
    """
    return [
        f"{image_dir}/{house_id}_{row_label}.png"
        for house_id, row_label in zip(df["id"], df.index)
    ]


train_df["image_path"] = build_image_paths(train_df, "../data/images/train")
test_df["image_path"] = build_image_paths(test_df, "../data/images/test")

In [30]:
# Both feature matrices use the same TABULAR_FEATURES column order;
# the target is the log-transformed price from the training split.
X_train_tab, X_test_tab = (
    frame[TABULAR_FEATURES] for frame in (train_df, test_df)
)
y_train = train_df["log_price"]

In [31]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train_tab)

X_train_scaled = scaler.transform(X_train_tab)
X_test_scaled = scaler.transform(X_test_tab)

In [32]:
# Wrap the scaled arrays back into DataFrames that keep the original row
# index, then attach the non-scaled columns (aligned on that index).
processed_train = pd.DataFrame(
    X_train_scaled, columns=TABULAR_FEATURES, index=train_df.index
).assign(
    log_price=y_train,
    image_path=train_df["image_path"],
)

# The test frame gets no log_price column.
processed_test = pd.DataFrame(
    X_test_scaled, columns=TABULAR_FEATURES, index=test_df.index
).assign(image_path=test_df["image_path"])


In [33]:
# Write the processed frames to CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists first (otherwise a fresh checkout fails here).
Path("../data/processed").mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written out as a column.
processed_train.to_csv("../data/processed/train_tabular.csv", index=False)
processed_test.to_csv("../data/processed/test_tabular.csv", index=False)

In [34]:
# Preview the first ten processed training rows.
processed_train.head(n=10)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_living15,lat,long,log_price,image_path
0,0.677402,0.178963,-0.290276,-0.144952,0.922943,-0.083788,-0.306964,-0.626,-0.557611,-0.473911,-0.900034,0.192759,12.501142,../data/images/train/9117000170_0.png
1,-0.394132,0.505667,-0.521813,-0.311135,0.922943,-0.083788,-0.306964,0.908842,-0.557611,-0.385919,-1.137139,0.192759,12.409018,../data/images/train/6700390210_1.png
2,0.677402,0.505667,-0.389506,-0.160457,0.922943,-0.083788,-0.306964,-0.626,0.29635,-0.165941,-2.098571,-0.706669,12.206078,../data/images/train/7212660540_2.png
3,-1.465666,0.178963,-0.918734,-0.364787,0.922943,-0.083788,-0.306964,-0.626,-0.557611,-1.089851,-0.206791,1.006527,12.772806,../data/images/train/8562780200_3.png
4,-0.394132,-0.147741,-0.874632,-0.038936,-0.918626,-0.083788,-0.306964,-0.626,-0.557611,-0.576568,-1.367738,0.999388,12.354497,../data/images/train/7760400350_4.png
5,0.677402,1.812483,0.580745,-0.251585,0.922943,-0.083788,-0.306964,-0.626,0.29635,0.024708,0.969336,-1.292011,13.490474,../data/images/train/464001025_5.png
6,-1.465666,-1.454558,-1.117194,-0.197496,-0.918626,-0.083788,-0.306964,0.908842,-1.411573,-0.195271,1.34162,-0.720946,12.611524,../data/images/train/3432500486_6.png
7,-0.394132,-0.147741,0.062543,0.5229,-0.918626,-0.083788,-0.306964,0.908842,0.29635,1.285919,1.360415,0.649611,13.687678,../data/images/train/1126059095_7.png
8,-0.394132,-1.454558,-1.106169,-0.22418,-0.918626,-0.083788,-0.306964,-0.626,-0.557611,-0.972529,-1.612072,-0.549626,12.072547,../data/images/train/3876500290_8.png
9,-0.394132,0.178963,-1.185553,-0.361207,0.922943,-0.083788,-0.306964,-0.626,-0.557611,-1.44475,0.994637,-1.092138,12.676079,../data/images/train/1865400075_9.png


In [35]:
# Missing-value count per column of the processed training frame, ascending.
processed_train.isna().sum().sort_values()

bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_living15    0
lat              0
long             0
log_price        0
image_path       0
dtype: int64