In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
import re
import os
import json
import joblib
from tqdm.auto import tqdm

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import optuna
from optuna.samplers import TPESampler

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
optuna.logging.set_verbosity(optuna.logging.WARNING)

print("="*70)
print("  NYC REAL ESTATE PRICE PREDICTION - GRANDMASTER PIPELINE (FIXED)")
print("="*70)

class GrandmasterPreprocessor:
    def __init__(self):
        self.label_encoders = {}
        self.target = None
        self.test_ids = None
        self.train_columns = None
        
    def clean_column_names(self, df):
        new_columns = [re.sub(r'[^A-Za-z0-9_]+', '_', str(col)) for col in df.columns]
        df.columns = new_columns
        return df

    def clean_data(self, df, is_train=True):
        df = df.copy()

        placeholders = ['-', ' -', '- ', ' - ', '  -  ', '', ' ']
        df.replace(placeholders, np.nan, inplace=True)
 
        numeric_cols = ['LAND SQUARE FEET', 'GROSS SQUARE FEET', 
                        'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
                        'YEAR BUILT', 'SALE PRICE', 'ZIP CODE']
        
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        if is_train and 'SALE PRICE' in df.columns:
            df = df.dropna(subset=['SALE PRICE'])
            df = df[df['SALE PRICE'] > 10_000].reset_index(drop=True)
            self.target = df['SALE PRICE'].copy()
            
        # 4. Outlier Clipping
        for col in ['LAND SQUARE FEET', 'GROSS SQUARE FEET']:
            if col in df.columns:
                upper = df[col].quantile(0.995)
                df.loc[df[col] > upper, col] = upper
                
        return df
    
    def engineer_features(self, df):
        df = df.copy()
        
        if 'SALE DATE' in df.columns:
            df['SALE DATE'] = pd.to_datetime(df['SALE DATE'], errors='coerce')
            df['sale_year'] = df['SALE DATE'].dt.year
            df['sale_month'] = df['SALE DATE'].dt.month
            
            if 'YEAR BUILT' in df.columns:
                df['building_age'] = df['sale_year'] - df['YEAR BUILT']
                df.loc[df['building_age'] < 0, 'building_age'] = 0 
            
            df.drop('SALE DATE', axis=1, inplace=True)
            
        for col in ['GROSS SQUARE FEET', 'LAND SQUARE FEET', 'TOTAL UNITS']:
            if col in df.columns:
                df[f'log_{col}'] = np.log1p(df[col].fillna(0))
            
        return df

    def encode_categorical(self, df, is_train=True):
        df = df.copy()
        drop_cols = ['ADDRESS', 'APARTMENT NUMBER', 'EASE-MENT']
        df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True, errors='ignore')
        
        cat_cols = df.select_dtypes(include=['object']).columns
        
        for col in cat_cols:
            if is_train:
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str).fillna("Unknown"))
                self.label_encoders[col] = le
            else:
                if col in self.label_encoders:
                    le = self.label_encoders[col]
                    df[col] = df[col].astype(str).fillna("Unknown").apply(
                        lambda x: le.transform([x])[0] if x in le.classes_ else -1
                    )
        return df
    
    def transform(self, df, is_train=True):
        print(f"Preprocessing {'Training' if is_train else 'Test'} data...")
        
        df = self.clean_data(df, is_train)
        df = self.engineer_features(df)
        df = self.encode_categorical(df, is_train)
        
        num_cols = df.select_dtypes(include=np.number).columns
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())
        
        if 'Property_ID' in df.columns:
            self.test_ids = df['Property_ID'].copy()
            
        cols_to_drop = ['Property_ID', 'SALE PRICE']
        df.drop(columns=[c for c in cols_to_drop if c in df.columns], inplace=True)
        
        df = self.clean_column_names(df)
        
        # ALIGNMENT: Ensure Test has exact same columns as Train
        if is_train:
            self.train_columns = df.columns.tolist()
        else:
            if self.train_columns is not None:
                for col in self.train_columns:
                    if col not in df.columns:
                        df[col] = 0
                df = df[self.train_columns]
        
        print(f"‚úì Final Shape: {df.shape}")
        return df

# LOAD DATA
train_raw = pd.read_csv('/kaggle/input/dsc-nyc-real-estate/training.csv')
test_raw = pd.read_csv('/kaggle/input/dsc-nyc-real-estate/test.csv')

# EXECUTE PIPELINE
processor = GrandmasterPreprocessor()
X_train = processor.transform(train_raw, is_train=True)
y_train = np.log1p(processor.target)
X_test = processor.transform(test_raw, is_train=False)
test_ids = processor.test_ids

gc.collect()

class BayesianOptimizer:
    def __init__(self, X, y, n_trials=50, params_file='best_params.json'):
        self.X = X
        self.y = y
        self.n_trials = n_trials
        self.params_file = params_file
        self.best_params = {}
        
    def load_params(self):
        if os.path.exists(self.params_file):
            print(f"\n‚úì Loaded existing params from {self.params_file}")
            with open(self.params_file, 'r') as f:
                return json.load(f)
        return {}

    def objective_xgb(self, trial):
        params = {
            'n_estimators': 1000,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'random_state': 42,
            'n_jobs': -1,
            'tree_method': 'hist',  # Fast CPU Histogram method
            'early_stopping_rounds': 50
        }
        
        y_bins = pd.qcut(self.y, q=5, labels=False)
        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        scores = []
        
        for train_idx, val_idx in kf.split(self.X, y_bins):
            X_tr, X_val = self.X.iloc[train_idx], self.X.iloc[val_idx]
            y_tr, y_val = self.y.iloc[train_idx], self.y.iloc[val_idx]
            
            model = xgb.XGBRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            preds = model.predict(X_val)
            scores.append(np.sqrt(mean_squared_error(y_val, preds)))
            
        return np.mean(scores)

    def optimize(self):
        saved = self.load_params()
        if 'xgb' in saved:
            self.best_params = saved
            return saved
        
        print(f"\nüîç Optimizing XGBoost ({self.n_trials} trials)...")
        study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
        study.optimize(self.objective_xgb, n_trials=self.n_trials, show_progress_bar=True)
        
        self.best_params['xgb'] = study.best_params
        
        with open(self.params_file, 'w') as f:
            json.dump(self.best_params, f, indent=4)
            
        return self.best_params

optimizer = BayesianOptimizer(X_train, y_train, n_trials=50)
best_params = optimizer.optimize()

class UltimateEnsemble:
    def __init__(self, xgb_params, n_folds=5, model_dir='models'):
        self.xgb_params = xgb_params
        self.n_folds = n_folds
        self.model_dir = model_dir
        self.models = {'xgb': [], 'lgb': [], 'cat': [], 'rf': [], 'enet': []}
        self.meta_model = None
        self.feature_names = None
        
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

    def load_if_exists(self, filename):
        path = os.path.join(self.model_dir, filename)
        if os.path.exists(path):
            return joblib.load(path)
        return None

    def save_model(self, obj, filename):
        joblib.dump(obj, os.path.join(self.model_dir, filename))

    def train_with_cv(self, X, y):
        print("\nüöÄ ENSEMBLE TRAINING START")
        
        self.oof_preds = self.load_if_exists('oof_preds.pkl')
        loaded_models = self.load_if_exists('ensemble_models.pkl')
        self.feature_names = self.load_if_exists('feature_names.pkl')
        
        if self.oof_preds is not None and loaded_models is not None:
            print("‚úì Loaded trained models from checkpoint. Skipping training.")
            self.models = loaded_models
            return self
        
        self.feature_names = X.columns.tolist()
        self.save_model(self.feature_names, 'feature_names.pkl')
        
        y_bins = pd.qcut(y, q=10, labels=False, duplicates='drop')
        kf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        oof = {k: np.zeros(len(X)) for k in self.models.keys()}
        
        for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y_bins), total=self.n_folds, desc="Folds")):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            xgb_m = xgb.XGBRegressor(**self.xgb_params, n_estimators=2000, 
                                     tree_method='hist', n_jobs=-1, # CPU HIST
                                     early_stopping_rounds=50, random_state=42)
            xgb_m.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            oof['xgb'][val_idx] = xgb_m.predict(X_val)
            self.models['xgb'].append(xgb_m)

            lgb_m = lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.03, num_leaves=31,
                                      n_jobs=-1, random_state=42, verbosity=-1)
            lgb_m.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], 
                      callbacks=[lgb.early_stopping(50, verbose=False)])
            oof['lgb'][val_idx] = lgb_m.predict(X_val)
            self.models['lgb'].append(lgb_m)

            cat_m = cb.CatBoostRegressor(iterations=2000, learning_rate=0.03, depth=6,
                                         loss_function='RMSE', verbose=0, random_seed=42)
            cat_m.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50)
            oof['cat'][val_idx] = cat_m.predict(X_val)
            self.models['cat'].append(cat_m)

            rf_m = RandomForestRegressor(n_estimators=200, max_depth=12, max_features='sqrt',
                                         n_jobs=-1, random_state=42)
            rf_m.fit(X_tr, y_tr)
            oof['rf'][val_idx] = rf_m.predict(X_val)
            self.models['rf'].append(rf_m)

            enet_m = Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', RobustScaler()),
                ('model', ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42))
            ])
            enet_m.fit(X_tr, y_tr)
            oof['enet'][val_idx] = enet_m.predict(X_val)
            self.models['enet'].append(enet_m)
            
        self.oof_preds = pd.DataFrame(oof)
        self.save_model(self.models, 'ensemble_models.pkl')
        self.save_model(self.oof_preds, 'oof_preds.pkl')
        print("‚úì Training Complete. Models Saved.")
        return self

    def train_meta_learner(self, y):
        print("\nüèóÔ∏è Training Meta-Learner (Stacking)...")
        self.meta_model = Ridge(alpha=10.0, random_state=42)
        self.meta_model.fit(self.oof_preds, y)
        self.save_model(self.meta_model, 'meta_model.pkl')
        
        cv_score = np.sqrt(mean_squared_error(y, self.meta_model.predict(self.oof_preds)))
        print(f"  Stacked CV RMSE: {cv_score:.5f}")
        return self

    def predict(self, X):
        X = X.copy()
        
        if self.feature_names is None:
             self.feature_names = self.load_if_exists('feature_names.pkl')
             
        if self.feature_names:
            missing = set(self.feature_names) - set(X.columns)
            for c in missing: X[c] = 0
            extra = set(X.columns) - set(self.feature_names)
            if extra: X = X.drop(columns=list(extra))
            X = X[self.feature_names]
            
        print(f"‚úì Generating predictions with aligned shape: {X.shape}")
        
        test_preds = {}
        for name in self.models:
            preds = []
            for model in self.models[name]:
                preds.append(model.predict(X))
            test_preds[name] = np.mean(preds, axis=0)
            
        meta_X = pd.DataFrame(test_preds)
        return self.meta_model.predict(meta_X)

# TRAIN ENSEMBLE
ensemble = UltimateEnsemble(best_params.get('xgb', {}), n_folds=5)
ensemble.train_with_cv(X_train, y_train)
ensemble.train_meta_learner(y_train)

print("\n" + "="*70)
print("GENERATING FINAL SUBMISSION")
print("="*70)

final_log_preds = ensemble.predict(X_test)
final_preds = np.expm1(final_log_preds)
final_preds = np.maximum(final_preds, 0)

submission = pd.DataFrame({
    'Property_ID': test_ids,
    'PREDICTED': final_preds
})

submission.to_csv('submission.csv', index=False)
print(f"‚úì Saved {len(submission)} rows to 'submission.csv'")
print(submission.head())

  if entities is not ():


  NYC REAL ESTATE PRICE PREDICTION - GRANDMASTER PIPELINE (FIXED)
Preprocessing Training data...
‚úì Final Shape: (46745, 25)
Preprocessing Test data...
‚úì Final Shape: (14008, 25)

üîç Optimizing XGBoost (50 trials)...


  0%|          | 0/50 [00:00<?, ?it/s]


üöÄ ENSEMBLE TRAINING START


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

‚úì Training Complete. Models Saved.

üèóÔ∏è Training Meta-Learner (Stacking)...
  Stacked CV RMSE: 0.03367

GENERATING FINAL SUBMISSION
‚úì Generating predictions with aligned shape: (14008, 25)
‚úì Saved 14008 rows to 'submission.csv'
   Property_ID     PREDICTED
0        69521  20478.636131
1        76928  21433.525972
2        82053  22716.080652
3        56262  21703.474075
4        51915  22795.702675


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
import re
import os
import json
import joblib
from tqdm.auto import tqdm

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import optuna
from optuna.samplers import TPESampler

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
optuna.logging.set_verbosity(optuna.logging.WARNING)

print("="*70)
print("  NYC REAL ESTATE - ULTIMATE GRANDMASTER PIPELINE (FIXED)")
print("="*70)


class TargetEncoder(BaseEstimator, TransformerMixin):

    def __init__(self, cols, smoothing=10):
        self.cols = cols
        self.smoothing = smoothing
        self.maps = {}
        self.global_mean = None

    def fit(self, X, y):
        self.global_mean = y.mean()
        for col in self.cols:
            stats = pd.DataFrame({'sum': y.groupby(X[col]).sum(), 
                                  'count': y.groupby(X[col]).count()})
            self.maps[col] = (stats['sum'] + self.smoothing * self.global_mean) / (stats['count'] + self.smoothing)
        return self

    def transform(self, X):
        X_out = X.copy()
        for col in self.cols:
            if col in self.maps:
                X_out[col] = X_out[col].map(self.maps[col]).fillna(self.global_mean)
        return X_out

class GrandmasterPreprocessor:
    def __init__(self):
        self.label_encoders = {}
        self.target_encoder = None
        self.target = None
        self.test_ids = None
        self.train_columns = None
        
    def clean_column_names(self, df):
        new_columns = [re.sub(r'[^A-Za-z0-9_]+', '_', str(col)) for col in df.columns]
        df.columns = new_columns
        return df

    def clean_data(self, df, is_train=True):
        df = df.copy()

        placeholders = ['-', ' -', '- ', ' - ', '  -  ', '', ' ']
        df.replace(placeholders, np.nan, inplace=True)
        

        numeric_cols = ['LAND SQUARE FEET', 'GROSS SQUARE FEET', 
                        'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
                        'YEAR BUILT', 'SALE PRICE']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        if is_train and 'SALE PRICE' in df.columns:
            df = df.dropna(subset=['SALE PRICE'])

            df = df[df['SALE PRICE'] > 10_000].reset_index(drop=True)
            self.target = df['SALE PRICE'].copy()

        for col in ['LAND SQUARE FEET', 'GROSS SQUARE FEET']:
            if col in df.columns:
                upper = df[col].quantile(0.995)
                df.loc[df[col] > upper, col] = upper

        drop_cols = ['Unnamed: 0', 'PriceScore', 'DistressedAsset', 'EASE-MENT']
        df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True, errors='ignore')
                
        return df
    
    def engineer_features(self, df):
        df = df.copy()

        if 'SALE DATE' in df.columns:
            df['SALE DATE'] = pd.to_datetime(df['SALE DATE'], errors='coerce')
            df['sale_year'] = df['SALE DATE'].dt.year
            df['sale_month'] = df['SALE DATE'].dt.month
            
            # Cyclical encoding
            df['month_sin'] = np.sin(2 * np.pi * df['sale_month']/12)
            df['month_cos'] = np.cos(2 * np.pi * df['sale_month']/12)
            
            if 'YEAR BUILT' in df.columns:
                df['building_age'] = df['sale_year'] - df['YEAR BUILT']
                df.loc[df['building_age'] < 0, 'building_age'] = 0
            
            df.drop('SALE DATE', axis=1, inplace=True)

        if 'GROSS SQUARE FEET' in df.columns and 'LAND SQUARE FEET' in df.columns:
            df['total_area'] = df['GROSS SQUARE FEET'].fillna(0) + df['LAND SQUARE FEET'].fillna(0)

        for col in ['GROSS SQUARE FEET', 'LAND SQUARE FEET', 'total_area', 'RESIDENTIAL UNITS']:
            if col in df.columns:
                df[f'log_{col}'] = np.log1p(df[col].fillna(0))
            
        return df

    def encode_categorical(self, df, is_train=True):
        df = df.copy()
        
        # Drop high cardinality
        df.drop(columns=['ADDRESS', 'APARTMENT NUMBER'], inplace=True, errors='ignore')
        
        # Prepare for Label Encoding (Skip Target Encode Cols)
        target_encode_cols = ['NEIGHBORHOOD', 'ZIP CODE', 'BOROUGH']
        cat_cols = df.select_dtypes(include=['object']).columns
        cat_cols = [c for c in cat_cols if c not in target_encode_cols]
        
        for col in cat_cols:
            if is_train:
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str).fillna("Unknown"))
                self.label_encoders[col] = le
            else:
                if col in self.label_encoders:
                    le = self.label_encoders[col]
                    df[col] = df[col].astype(str).fillna("Unknown").apply(
                        lambda x: le.transform([x])[0] if x in le.classes_ else -1
                    )

        for col in target_encode_cols:
            if col in df.columns:
                df[col] = df[col].astype(str)
                
        return df
    
    def transform(self, df, is_train=True):
        print(f"Preprocessing {'Training' if is_train else 'Test'} data...")
        
        df = self.clean_data(df, is_train)
        df = self.engineer_features(df)
        df = self.encode_categorical(df, is_train)

        num_cols = df.select_dtypes(include=np.number).columns
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

        target_cols = ['NEIGHBORHOOD', 'ZIP CODE', 'BOROUGH']
        target_cols = [c for c in target_cols if c in df.columns]
        
        if is_train:
            self.target_encoder = TargetEncoder(cols=target_cols, smoothing=20)

            self.target_encoder.fit(df, np.log1p(self.target)) 
            df = self.target_encoder.transform(df)
        elif self.target_encoder:
            df = self.target_encoder.transform(df)
        if 'Property_ID' in df.columns:
            self.test_ids = df['Property_ID'].copy()
            
        cols_to_drop = ['Property_ID', 'SALE PRICE']
        df.drop(columns=[c for c in cols_to_drop if c in df.columns], inplace=True, errors='ignore')
        
        df = self.clean_column_names(df)
        if is_train:
            self.train_columns = df.columns.tolist()
        else:
            if self.train_columns:
                for col in self.train_columns:
                    if col not in df.columns: df[col] = 0
                df = df[self.train_columns]
        
        print(f"‚úì Final Shape: {df.shape}")
        return df

train_raw = pd.read_csv('/kaggle/input/dsc-nyc-real-estate/training.csv')
test_raw = pd.read_csv('/kaggle/input/dsc-nyc-real-estate/test.csv')

# RUN PIPELINE
processor = GrandmasterPreprocessor()
X_train = processor.transform(train_raw, is_train=True)
y_train = np.log1p(processor.target)
X_test = processor.transform(test_raw, is_train=False)
test_ids = processor.test_ids

gc.collect()

class BayesianOptimizer:
    def __init__(self, X, y, n_trials=30, params_file='best_params.json'):
        self.X = X
        self.y = y
        self.n_trials = n_trials
        self.params_file = params_file
        self.best_params = self.load_params()
        
    def load_params(self):
        if os.path.exists(self.params_file):
            print(f"\n‚úì Loaded parameters from {self.params_file}")
            with open(self.params_file, 'r') as f:
                return json.load(f)
        return {}
    
    def save_params(self):
        with open(self.params_file, 'w') as f:
            json.dump(self.best_params, f, indent=4)

    def objective_xgb(self, trial):
        params = {
            'n_estimators': 1000,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'random_state': 42,
            'n_jobs': -1,
            'tree_method': 'hist', 
            'early_stopping_rounds': 50
        }
        kf = KFold(n_splits=3, shuffle=True, random_state=42)
        scores = []
        for tr_idx, val_idx in kf.split(self.X):
            model = xgb.XGBRegressor(**params)
            model.fit(self.X.iloc[tr_idx], self.y.iloc[tr_idx], 
                      eval_set=[(self.X.iloc[val_idx], self.y.iloc[val_idx])], verbose=False)
            preds = model.predict(self.X.iloc[val_idx])
            scores.append(np.sqrt(mean_squared_error(self.y.iloc[val_idx], preds)))
        return np.mean(scores)

    def optimize(self):
        print("\nüîç STARTING BAYESIAN OPTIMIZATION")
        if 'xgb' not in self.best_params:
            study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
            study.optimize(self.objective_xgb, n_trials=self.n_trials, show_progress_bar=True)
            self.best_params['xgb'] = study.best_params
            self.save_params()
            
        if 'lgb' not in self.best_params:
            # Default strong params for LGBM to save time
            self.best_params['lgb'] = {
                'n_estimators': 2000, 'learning_rate': 0.03, 'num_leaves': 31,
                'colsample_bytree': 0.8, 'subsample': 0.8
            }
            
        if 'cat' not in self.best_params:
            self.best_params['cat'] = {'iterations': 2000, 'learning_rate': 0.03, 'depth': 6}
            
        self.save_params()
        return self.best_params

optimizer = BayesianOptimizer(X_train, y_train, n_trials=50)
best_params = optimizer.optimize()
class UltimateEnsemble:
    def __init__(self, params, n_folds=5, model_dir='models'):
        self.params = params
        self.n_folds = n_folds
        self.model_dir = model_dir
        self.models = {'xgb': [], 'lgb': [], 'cat': [], 'rf': [], 'enet': []}
        self.meta_model = None
        self.feature_names = None
        
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

    def load_if_exists(self, filename):
        path = os.path.join(self.model_dir, filename)
        if os.path.exists(path):
            return joblib.load(path)
        return None

    def save_model(self, obj, filename):
        joblib.dump(obj, os.path.join(self.model_dir, filename))

    def train_with_cv(self, X, y):
        print("\nüöÄ ENSEMBLE TRAINING START")
        
        self.oof_preds = self.load_if_exists('oof_preds.pkl')
        loaded_models = self.load_if_exists('ensemble_models.pkl')
        self.feature_names = self.load_if_exists('feature_names.pkl')
        
        if self.oof_preds is not None and loaded_models is not None:
            print("‚úì Loaded trained models from checkpoint. Skipping training.")
            self.models = loaded_models
            return self
        
        self.feature_names = X.columns.tolist()
        self.save_model(self.feature_names, 'feature_names.pkl')
        
        y_bins = pd.qcut(y, q=10, labels=False, duplicates='drop')
        kf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        oof = {k: np.zeros(len(X)) for k in self.models.keys()}
        
        for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y_bins), total=self.n_folds, desc="Folds")):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
            

            xgb_m = xgb.XGBRegressor(**self.params['xgb'], n_estimators=2000, 
                                     tree_method='hist', n_jobs=-1, early_stopping_rounds=50, random_state=42)
            xgb_m.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            oof['xgb'][val_idx] = xgb_m.predict(X_val)
            self.models['xgb'].append(xgb_m)

            lgb_m = lgb.LGBMRegressor(**self.params['lgb'], n_jobs=-1, random_state=42, verbosity=-1)
            lgb_m.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], 
                      callbacks=[lgb.early_stopping(50, verbose=False)])
            oof['lgb'][val_idx] = lgb_m.predict(X_val)
            self.models['lgb'].append(lgb_m)

            cat_m = cb.CatBoostRegressor(**self.params['cat'], loss_function='RMSE', verbose=0, random_seed=42)
            cat_m.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50)
            oof['cat'][val_idx] = cat_m.predict(X_val)
            self.models['cat'].append(cat_m)

            rf_m = RandomForestRegressor(n_estimators=200, max_depth=12, max_features='sqrt',
                                         n_jobs=-1, random_state=42)
            rf_m.fit(X_tr, y_tr)
            oof['rf'][val_idx] = rf_m.predict(X_val)
            self.models['rf'].append(rf_m)

            enet_m = Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', RobustScaler()),
                ('model', ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42))
            ])
            enet_m.fit(X_tr, y_tr)
            oof['enet'][val_idx] = enet_m.predict(X_val)
            self.models['enet'].append(enet_m)
            
        self.oof_preds = pd.DataFrame(oof)
        self.save_model(self.models, 'ensemble_models.pkl')
        self.save_model(self.oof_preds, 'oof_preds.pkl')
        print("‚úì Training Complete.")
        return self

    def train_meta_learner(self, y):
        print("\nüèóÔ∏è Training Meta-Learner...")
        self.meta_model = Ridge(alpha=10.0, random_state=42)
        self.meta_model.fit(self.oof_preds, y)
        self.save_model(self.meta_model, 'meta_model.pkl')
        
        cv_score = np.sqrt(mean_squared_error(y, self.meta_model.predict(self.oof_preds)))
        print(f"  Stacked CV RMSE: {cv_score:.5f}")
        return self

    def predict(self, X):
        X = X.copy()
        
        if self.feature_names is None:
             self.feature_names = self.load_if_exists('feature_names.pkl')
             
        if self.feature_names:
            missing = set(self.feature_names) - set(X.columns)
            for c in missing: X[c] = 0
            extra = set(X.columns) - set(self.feature_names)
            if extra: X = X.drop(columns=list(extra))
            X = X[self.feature_names]
            
        print(f"‚úì Predicting with shape: {X.shape}")
        
        test_preds = {}
        for name in self.models:
            preds = []
            for model in self.models[name]:
                preds.append(model.predict(X))
            test_preds[name] = np.mean(preds, axis=0)
            
        meta_X = pd.DataFrame(test_preds)
        return self.meta_model.predict(meta_X)

ensemble = UltimateEnsemble(best_params, n_folds=5)
ensemble.train_with_cv(X_train, y_train)
ensemble.train_meta_learner(y_train)

print("\n" + "="*70)
print("GENERATING FINAL SUBMISSION")
print("="*70)

final_log_preds = ensemble.predict(X_test)
final_preds = np.expm1(final_log_preds)
final_preds = np.maximum(final_preds, 0)

submission = pd.DataFrame({
    'Property_ID': test_ids,
    'PREDICTED': final_preds
})

submission.to_csv('submission.csv', index=False)
print(f"‚úì Saved {len(submission)} rows to 'submission.csv'")
print(submission.head())

  NYC REAL ESTATE - ULTIMATE GRANDMASTER PIPELINE (FIXED)
Preprocessing Training data...
‚úì Final Shape: (46745, 26)
Preprocessing Test data...
‚úì Final Shape: (14008, 26)

‚úì Loaded parameters from best_params.json

üîç STARTING BAYESIAN OPTIMIZATION

üöÄ ENSEMBLE TRAINING START
‚úì Loaded trained models from checkpoint. Skipping training.

üèóÔ∏è Training Meta-Learner...
  Stacked CV RMSE: 0.03367

GENERATING FINAL SUBMISSION
‚úì Predicting with shape: (14008, 25)
‚úì Saved 14008 rows to 'submission.csv'
   Property_ID     PREDICTED
0        69521  21376.578016
1        76928  21498.392205
2        82053  22128.706413
3        56262  21404.850234
4        51915  22892.234162
