# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

# Read the raw dataset from a CSV file located relative to the working directory
df = pd.read_csv(filepath_or_buffer="data.csv")

# Feature engineering
def feature_engineering(df, reference_year=2015):
    """Return a copy of ``df`` with derived modelling features added.

    The input frame is not modified. The following columns are added:

    - ``log_price``: ``log1p`` of ``price``.
    - ``house_age``: ``reference_year - yr_built``.
    - ``renovated_flag``: 1 when ``yr_renovated > 0``, else 0.
    - ``bath_per_bed``, ``lot_ratio``, ``total_rooms``, ``area_per_room``:
      simple ratios/sums of the room and area columns.
    - ``lat`` / ``long``: pseudo-coordinates in ``[0, 100)`` derived from a
      hash of ``street`` / ``statezip``. They are NOT real geographic
      coordinates; they only give identical strings identical values.

    Any ``+inf`` / ``-inf`` values in the result are replaced with ``NaN``.

    Args:
        df: DataFrame containing at least ``price``, ``yr_built``,
            ``yr_renovated``, ``bathrooms``, ``bedrooms``, ``sqft_living``,
            ``sqft_lot``, ``street`` and ``statezip``.
        reference_year: Year that ``house_age`` is measured against.

    Returns:
        A new DataFrame with the columns listed above appended.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Local import keeps this block self-contained.
    import zlib

    def stable_bucket(value):
        # The built-in hash() of a str is salted per interpreter process
        # (PYTHONHASHSEED), which made these features differ between runs.
        # CRC32 gives the same value for the same text in every process.
        return zlib.crc32(str(value).encode("utf-8")) % 1000 / 10

    df = df.copy()
    df['log_price'] = np.log1p(df['price'])
    df['house_age'] = reference_year - df['yr_built']
    df['renovated_flag'] = (df['yr_renovated'] > 0).astype(int)

    # The 1e-3 offsets avoid division by zero when a count or area is 0.
    df['bath_per_bed'] = df['bathrooms'] / (df['bedrooms'] + 1e-3)
    df['lot_ratio'] = df['sqft_living'] / (df['sqft_lot'] + 1e-3)
    df['total_rooms'] = df['bedrooms'] + df['bathrooms']
    df['area_per_room'] = df['sqft_living'] / (df['total_rooms'] + 1e-3)

    df['lat'] = df['street'].apply(stable_bucket)
    df['long'] = df['statezip'].apply(stable_bucket)

    df = df.replace([np.inf, -np.inf], np.nan)
    return df

# Derive features, then label each row with one of 5 KMeans clusters
# fitted on its ('lat', 'long') columns
df_fe = feature_engineering(df)
df_fe = df_fe.assign(
    location_cluster=KMeans(n_clusters=5, random_state=42).fit_predict(df_fe[['lat', 'long']])
)

# Separate the target (log-price) from the feature matrix; raw price and
# free-text/location columns are removed if present
drop_cols = ['price', 'date', 'street', 'city', 'statezip', 'country']
y = df_fe['log_price']
X = df_fe.drop(columns=[*drop_cols, 'log_price'], errors='ignore')

# Columns handled as categories; every other column of X is handled as numeric
categorical_features = ['condition', 'view', 'waterfront', 'floors', 'location_cluster']
numeric_features = [
    name for name in X.columns
    if name not in categorical_features
]

# Numeric columns: fill missing values with the column median
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill missing values with the mode, then ordinal-encode;
# categories unseen during fit are encoded as -1
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# KFold + XGBoost: 5-fold cross-validation. Out-of-fold predictions are
# collected on the original price scale and scored once at the end.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
preds = []
y_valids = []

for train_idx, valid_idx in kf.split(X):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # A new pipeline per fold; fitting it refits `preprocessor` on the
    # training rows of this fold only.
    model = Pipeline([
        ('pre', preprocessor),
        ('xgb', XGBRegressor(
            objective='reg:squarederror',
            n_estimators=150,
            learning_rate=0.1,
            max_depth=4,
            random_state=42
        ))
    ])

    model.fit(X_train, y_train)

    # The target is log1p(price), so expm1 maps predictions and truth
    # back to the price scale before they are compared.
    y_pred = np.expm1(model.predict(X_valid))
    y_true = np.expm1(y_valid)

    preds.append(y_pred)
    y_valids.append(y_true)

# Concatenate results
y_pred_all = np.concatenate(preds)
y_true_all = np.concatenate(y_valids)

# Evaluation
mae = mean_absolute_error(y_true_all, y_pred_all)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_true_all, y_pred_all))
r2 = r2_score(y_true_all, y_pred_all)

print(f"KFold XGBoost → MAE: {mae:,.0f} | RMSE: {rmse:,.0f} | R²: {r2:.3f}")

# KeyboardInterrupt: