In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split

# Feature Engineering
Based on the exploration phase, we have identified several features that could be useful for predicting house prices.

In [16]:
# Load the raw datasets (paths are relative to the notebook's working directory)
kc_house_data = pd.read_csv('../data/raw/kc_house_data.csv')
zipcode_demographics = pd.read_csv('../data/raw/zipcode_demographics.csv')

# Left join: every row of kc_house_data is kept, even if its zipcode has no
# matching row in zipcode_demographics (those demographic columns become NaN).
kc_house_data = pd.merge(kc_house_data, zipcode_demographics, on='zipcode', how='left')
kc_house_data["log_price"] = np.log1p(kc_house_data["price"])

# Split data into training (90%) and testing (10%) sets.
# NOTE: from this point on `kc_house_data` holds the training split only.
kc_house_data, test = train_test_split(kc_house_data, test_size=0.1, random_state=42)

# Later cells add columns to both splits in place; take explicit copies so those
# assignments act on independent frames and cannot raise SettingWithCopyWarning.
kc_house_data = kc_house_data.copy()
test = test.copy()

## Feature Engineering

### New Features Created
- `house_age`: Years since construction (2025 - yr_built)
- `yrs_since_renovation`: Years since last renovation, or house age if never renovated
- `is_renovated`: Binary flag for renovation history
- `price_per_sqft`: Price per square foot of living space
- `living_lot_ratio`: Ratio of living area to total lot size

In [17]:
# Year against which ages are measured (same value for every derived age column).
REFERENCE_YEAR = 2025

for df in (kc_house_data, test):
    # yr_renovated > 0 is treated as "has been renovated"
    renovated = df['yr_renovated'] > 0

    df['house_age'] = REFERENCE_YEAR - df['yr_built']
    # Never-renovated houses fall back to the age of the house itself
    df['yrs_since_renovation'] = np.where(
        renovated,
        REFERENCE_YEAR - df['yr_renovated'],
        df['house_age'],
    )
    df['is_renovated'] = np.where(renovated, 1, 0)
    # NOTE(review): this column is computed from `price`; if price (or a transform
    # of it) is the prediction target, confirm it is excluded from model inputs.
    df['price_per_sqft'] = df['price'] / df['sqft_living']
    df['living_lot_ratio'] = df['sqft_living'] / df['sqft_lot']


### Zipcode Encoding
- Target-encoded zipcodes using mean `log_price` from training data
- Handles new zipcodes in test set with training median

In [18]:
# Target-encode zipcode with the mean log_price per zipcode, learned from the
# training split only so the test split does not influence the encoding.
zipcode_means = kc_house_data.groupby('zipcode')['log_price'].mean()

# Series.map yields NaN for any zipcode that has no entry in `zipcode_means`
# (e.g. a zipcode that only occurs in the test split). Fall back to the median
# training log_price so the encoded column never contains NaN.
zipcode_fallback = kc_house_data['log_price'].median()

kc_house_data['zipcode_encoded'] = kc_house_data['zipcode'].map(zipcode_means).fillna(zipcode_fallback)
test['zipcode_encoded'] = test['zipcode'].map(zipcode_means).fillna(zipcode_fallback)



## Log Transformations

Applied `log1p` to normalize skewed financial features:
- `medn_hshld_incm_amt` → `log_medn_income`
- `hous_val_amt` → `log_hous_val`

*Log transform helps handle skew and improve model performance.*

In [19]:
# New log1p column -> source column it is derived from
log_targets = {
    'log_medn_income': 'medn_hshld_incm_amt',
    'log_hous_val': 'hous_val_amt',
}

# Apply the same log1p transforms to the training and test splits
for df in (kc_house_data, test):
    for new_col, src_col in log_targets.items():
        df[new_col] = np.log1p(df[src_col])


### Size Comparisons
- `diff_sqft_living15`: Living area vs neighborhood average  
- `diff_sqft_lot15`: Lot size vs neighborhood average

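### Log Transformations (recap)
- `medn_hshld_incm_amt` → `log_medn_income`
- `hous_val_amt` → `log_hous_val`

### Zipcode Encoding (recap)
- Target-encoded using mean `log_price` per zipcode
- Created `zipcode_encoded` feature

*The log-transformed and zipcode-encoded features were already created in the sections above and are listed here only as a recap. All transformations are applied to handle skew and capture location/value relationships.*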

In [20]:
# Size of each property relative to its `*15` neighbourhood reference columns:
# a positive value means the house (or lot) is larger than that reference.
for df in [kc_house_data, test]:
    df['diff_sqft_living15'] = df['sqft_living'] - df['sqft_living15']
    df['diff_sqft_lot15'] = df['sqft_lot'] - df['sqft_lot15']

# `log_medn_income`, `log_hous_val` and `zipcode_encoded` are already created for
# both splits by the earlier log-transformation and zipcode-encoding cells, so
# they are intentionally not recomputed here.

## Geographic Clustering

Applied K-Means clustering (`n_clusters=5`) on standardized coordinates:
- Scaled `lat` and `long` features using `StandardScaler`
- Created `geo_cluster` feature for spatial segmentation

*Groups properties by geographic proximity for location-based patterns.*

In [21]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Coordinate columns used for the spatial clustering
geo_cols = ['lat', 'long']

# Standardize the coordinates; the scaler statistics come from the training
# split only and are then reused unchanged for the test split.
scaler = StandardScaler().fit(kc_house_data[geo_cols])
train_coords = scaler.transform(kc_house_data[geo_cols])
test_coords = scaler.transform(test[geo_cols])

# Fit the 5 clusters on the training coordinates; training rows take the fitted
# labels, test rows are assigned to the nearest learned centroid.
kmeans = KMeans(n_clusters=5, random_state=42).fit(train_coords)
kc_house_data['geo_cluster'] = kmeans.labels_
test['geo_cluster'] = kmeans.predict(test_coords)


In [22]:
# Persist both splits, including the engineered columns, as CSV files in the
# current working directory (row index is not written).
for out_name, frame in (
    ('notebook_train_set.csv', kc_house_data),
    ('notebook_test_set.csv', test),
):
    frame.to_csv(out_name, index=False)