In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


# Load Data

In [3]:
# Read the train and test splits from the data/ directory (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv")
)

# Show (rows, columns) of each split as a quick sanity check on the load.
print(train.shape, test.shape)


(16209, 21) (5404, 20)


In [4]:
# Feature engineering
def add_engineered_features(frame):
    """Add the derived feature columns to ``frame`` in place.

    Creates ``bed_bath_ratio``, ``sqft_per_bed`` and ``lat_long_interaction``
    from the ``bedrooms``, ``bathrooms``, ``sqft_living``, ``lat`` and ``long``
    columns of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the five input columns. It is modified in place; the
        input columns themselves are not overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, returned for convenience.

    Raises
    ------
    KeyError
        If one of the five input columns is absent from ``frame``.
    """
    # Coerce the inputs locally before doing arithmetic: this cell runs before
    # the notebook's numeric-coercion step, so a stray non-numeric entry would
    # otherwise raise here. Unparseable entries become NaN, matching the
    # errors='coerce' policy used in that later step. Numeric columns pass
    # through unchanged.
    bedrooms = pd.to_numeric(frame['bedrooms'], errors='coerce')
    bathrooms = pd.to_numeric(frame['bathrooms'], errors='coerce')
    sqft_living = pd.to_numeric(frame['sqft_living'], errors='coerce')
    latitude = pd.to_numeric(frame['lat'], errors='coerce')
    longitude = pd.to_numeric(frame['long'], errors='coerce')

    # The +1 in each denominator avoids division by zero when the count is 0.
    frame['bed_bath_ratio'] = bedrooms / (bathrooms + 1)
    frame['sqft_per_bed'] = sqft_living / (bedrooms + 1)
    frame['lat_long_interaction'] = latitude * longitude
    return frame


# Apply the identical transformation to both splits so their columns stay in sync.
for frame in (train, test):
    add_engineered_features(frame)


In [7]:
# Target column: rows without it are dropped and it is log-transformed later.
TARGET = 'price'

# Columns that are coerced to numeric and standardised further down.
FEATURES = [
    # columns read straight from the CSV files
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'lat',
    'long',
    # columns created in the feature-engineering cell
    'bed_bath_ratio',
    'sqft_per_bed',
    'lat_long_interaction',
]



# Coerce columns to numeric and drop rows where TARGET is missing

In [8]:
# Fail fast, and symmetrically for both splits, when an expected column is
# absent. Silently skipping a missing train column would only postpone the
# failure to the dropna call below or to the scaling step.
missing_train = [col for col in FEATURES + [TARGET] if col not in train.columns]
missing_test = [col for col in FEATURES if col not in test.columns]
if missing_train or missing_test:
    raise KeyError(
        f"Missing expected columns - train: {missing_train}, test: {missing_test}"
    )

# Force every feature (and the target, for train) to a numeric dtype;
# entries that cannot be parsed become NaN because of errors='coerce'.
for col in FEATURES + [TARGET]:
    train[col] = pd.to_numeric(train[col], errors='coerce')

for col in FEATURES:
    test[col] = pd.to_numeric(test[col], errors='coerce')

# Only rows with a missing target are removed; NaNs in feature columns are kept.
# The index is rebuilt so it is contiguous after the drop.
train = train.dropna(subset=[TARGET]).reset_index(drop=True)


# Scale Features

In [9]:
# Learn the per-column scaling statistics from the training split only.
scaler = StandardScaler().fit(train[FEATURES])

# Apply those same statistics to both splits; the test split is never used for fitting.
# NOTE: this overwrites the feature columns in place, so re-running the cell
# would scale values that are already scaled.
train[FEATURES] = scaler.transform(train[FEATURES])
test[FEATURES] = scaler.transform(test[FEATURES])


In [11]:
# Store log(1 + target) in a new 'price_log' column; the original target
# column is left unchanged.
train['price_log'] = np.log1p(train[TARGET])

# Summarise the target through TARGET instead of a hard-coded column name, so
# this summary always describes the same column that was just transformed.
train[[TARGET]].describe()


Unnamed: 0,price
count,16209.0
mean,537470.3
std,360303.6
min,75000.0
25%,320000.0
50%,450000.0
75%,640000.0
max,7700000.0


In [12]:
# Write both processed splits as CSV without the index column (train first, then test).
for frame, destination in (
    (train, "data/train_processed.csv"),
    (test, "data/test_processed.csv"),
):
    frame.to_csv(destination, index=False)

# Completion marker, printed once both files have been written.
print("Preprocessing complete.")


Preprocessing complete.
