In [None]:
print("\n2. DATA PREPARATION")
print(40 * "-")

# Drop the 'price_display' column when the frame has one
print("Removing 'price_display' ")
if 'price_display' in df.columns:
    df = df.drop(columns='price_display')

# feature engineering, adding new columns
class ApartmentFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive extra model features from raw apartment-listing columns.

    Each feature group is added only when its source columns are present in
    the input frame, so the transformer tolerates partial schemas.

    Parameters
    ----------
    use_new_features : bool, default=True
        When False, ``transform`` returns an unmodified copy of its input.

    Notes
    -----
    ``fit`` records the imputation medians and the latitude/longitude bin
    edges of the data it is given, so that ``transform`` applies the same
    values to any later frame. If ``transform`` is called on an instance that
    was never fitted, those statistics are computed from the frame being
    transformed instead.
    """

    # Number of equal-width bins used for the latitude/longitude grid.
    _N_GEO_BINS = 5

    def __init__(self, use_new_features=True):
        self.use_new_features = use_new_features

    def fit(self, X, y=None):
        """Learn imputation medians and geographic bin edges from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to learn the statistics from.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.fill_medians_ = {}
        self.geo_bin_edges_ = {}
        if not self.use_new_features:
            # Nothing is derived in this mode, so there is nothing to learn.
            return self

        columns = getattr(X, 'columns', ())
        self.fill_medians_ = {
            col: X[col].median()
            for col in ('square_feet', 'latitude', 'longitude')
            if col in columns
        }
        # Bins are only produced when both coordinates exist (see transform).
        if 'latitude' in columns and 'longitude' in columns:
            for col in ('latitude', 'longitude'):
                filled = X[col].fillna(self.fill_medians_[col])
                _, edges = pd.cut(
                    filled, bins=self._N_GEO_BINS, labels=False, retbins=True
                )
                self.geo_bin_edges_[col] = edges
        return self

    def _median_for(self, frame, column):
        """Return the fitted median for ``column``, else the median of ``frame``."""
        learned = getattr(self, 'fill_medians_', {})
        if column in learned:
            return learned[column]
        return frame[column].median()

    def _geo_bin(self, values, column):
        """Bin ``values`` with the fitted edges, or into equal-width bins if unfitted."""
        edges = getattr(self, 'geo_bin_edges_', {}).get(column)
        if edges is None:
            return pd.cut(values, bins=self._N_GEO_BINS, labels=False)
        # Values outside the fitted range are clipped into the outermost bins
        # rather than becoming NaN.
        clipped = values.clip(lower=edges[0], upper=edges[-1])
        return pd.cut(clipped, bins=edges, labels=False, include_lowest=True)

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Listing data; any subset of the columns ``square_feet``,
            ``amenities``, ``state``, ``bedrooms``, ``bathrooms``,
            ``latitude`` and ``longitude`` may be present.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X``; the input frame is not modified.
        """
        X_transformed = X.copy()
        if not self.use_new_features:
            return X_transformed

        has_sqft = 'square_feet' in X_transformed.columns

        # Feature 1: square-feet availability indicator and log-scaled area
        if has_sqft:
            sqft = X_transformed['square_feet']
            X_transformed['has_square_feet'] = (~sqft.isna()).astype(int)
            sqft_median = self._median_for(X_transformed, 'square_feet')
            X_transformed['sqft_log'] = np.log1p(sqft.fillna(sqft_median))

        # Feature 2: amenity count and specific amenities
        if 'amenities' in X_transformed.columns:
            amenities_filled = X_transformed['amenities'].fillna('')
            # Count the non-blank entries of the comma-separated list.
            X_transformed['amenity_count'] = amenities_filled.apply(
                lambda x: len([a.strip() for a in x.split(',') if a.strip()]) if x else 0
            )
            # Case-insensitive substring indicators for selected amenities
            X_transformed['has_dishwasher'] = amenities_filled.str.contains('Dishwasher', case=False, na=False).astype(int)
            X_transformed['has_elevator'] = amenities_filled.str.contains('Elevator', case=False, na=False).astype(int)
            X_transformed['has_pool'] = amenities_filled.str.contains('Pool', case=False, na=False).astype(int)
            X_transformed['has_parking'] = amenities_filled.str.contains('Parking|Garage', case=False, na=False).astype(int)

        # Feature 3: location-based cost tiers
        if 'state' in X_transformed.columns:
            high_cost_states = ['NY', 'CA', 'WA', 'MA']
            medium_cost_states = ['DC', 'VA', 'IL']
            X_transformed['high_cost_location'] = X_transformed['state'].isin(high_cost_states).astype(int)
            X_transformed['medium_cost_location'] = X_transformed['state'].isin(medium_cost_states).astype(int)

        # Feature 4: room-based features
        if 'bedrooms' in X_transformed.columns and 'bathrooms' in X_transformed.columns:
            bedrooms_filled = X_transformed['bedrooms'].fillna(0)
            bathrooms_filled = X_transformed['bathrooms'].fillna(0)
            total_rooms = bedrooms_filled + bathrooms_filled
            X_transformed['total_rooms'] = total_rooms
            # Rooms per 100 square feet; falls back to the plain room count
            # when the area is missing, non-positive, or the column is absent.
            if has_sqft:
                sqft = X_transformed['square_feet']
                X_transformed['room_efficiency'] = np.where(
                    sqft.notna() & (sqft > 0),
                    total_rooms / (sqft / 100),
                    total_rooms,
                )
            else:
                X_transformed['room_efficiency'] = total_rooms.astype(float)
            # Studio apartment indicator (0 bedrooms, including missing values)
            X_transformed['is_studio'] = (bedrooms_filled == 0).astype(int)

        # Feature 5: coarse geographic grid from latitude/longitude
        if 'latitude' in X_transformed.columns and 'longitude' in X_transformed.columns:
            lat_filled = X_transformed['latitude'].fillna(
                self._median_for(X_transformed, 'latitude')
            )
            lon_filled = X_transformed['longitude'].fillna(
                self._median_for(X_transformed, 'longitude')
            )
            X_transformed['lat_bin'] = self._geo_bin(lat_filled, 'latitude')
            X_transformed['lon_bin'] = self._geo_bin(lon_filled, 'longitude')

        return X_transformed

# Prepare features and target


def prepare_features(df):
    """Return a copy of ``df`` with the core listing columns imputed.

    ``bedrooms``, ``bathrooms`` and ``square_feet`` have their missing values
    replaced by the column median; missing ``amenities`` become the empty
    string. The input frame is left untouched and all four columns must be
    present.
    """
    prepared = df.copy()

    # Numeric columns: impute with each column's own median
    for column in ('bedrooms', 'bathrooms', 'square_feet'):
        series = prepared[column]
        prepared[column] = series.fillna(series.median())

    # Text column: a missing amenity list is treated as "no amenities"
    prepared['amenities'] = prepared['amenities'].fillna('')

    return prepared

df_prepared = prepare_features(df)

# Columns chosen as model inputs; 'price' is the prediction target
feature_columns = [
    'bedrooms', 'bathrooms', 'square_feet', 'amenities',
    'cityname', 'state', 'latitude', 'longitude',
]
X = df_prepared[feature_columns].copy()
y = df_prepared['price'].copy()

print(f"Features selected: {feature_columns}")
print("Target variable: price")
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Add the engineered columns on top of the selected raw features
feature_engineer = ApartmentFeatureEngineer(use_new_features=True)
X_engineered = feature_engineer.transform(X)

print(f"After feature engineering: {X_engineered.shape}")
print(
    "New features created:",
    [col for col in X_engineered.columns if col not in feature_columns],
)

# Manual preprocessing
def preprocess_data(X_eng, use_engineered_features=True):
    """Encode categorical columns, select model features and impute gaps.

    Parameters
    ----------
    X_eng : pandas.DataFrame
        Feature frame; must contain ``cityname``, ``state``, ``bedrooms``,
        ``bathrooms``, ``square_feet``, ``latitude`` and ``longitude``.
    use_engineered_features : bool, default=True
        When True, every known engineered column that is present in
        ``X_eng`` is kept alongside the basic features.

    Returns
    -------
    tuple
        ``(X_final, le_city, le_state)``: the all-numeric feature frame and
        the two fitted ``LabelEncoder`` instances for city and state.
    """
    X_processed = X_eng.copy()

    le_city = LabelEncoder()
    le_state = LabelEncoder()

    # Values are cast to str first, so a missing city/state becomes the
    # literal label 'nan' and receives its own integer code.
    X_processed['cityname_encoded'] = le_city.fit_transform(X_processed['cityname'].astype(str))
    X_processed['state_encoded'] = le_state.fit_transform(X_processed['state'].astype(str))

    # Select final features for modeling based on what's available
    final_features = ['bedrooms', 'bathrooms', 'square_feet', 'latitude', 'longitude',
                      'cityname_encoded', 'state_encoded']

    if use_engineered_features:
        potential_features = ['amenity_count', 'has_square_feet', 'high_cost_location', 'medium_cost_location',
                              'total_rooms', 'sqft_log', 'has_dishwasher', 'has_elevator', 'has_pool',
                              'has_parking', 'room_efficiency', 'is_studio', 'lat_bin', 'lon_bin']
        # Only include engineered features that exist in the dataframe
        final_features += [feat for feat in potential_features if feat in X_processed.columns]

    X_final = X_processed[final_features].copy()

    # Fill any remaining missing values: median for numeric columns of any
    # width (not only float64/int64), most frequent value otherwise.
    for col in X_final.columns:
        column = X_final[col]
        if not column.isna().any():
            continue
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
        else:
            modes = column.mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 0
        X_final[col] = column.fillna(fill_value)

    return X_final, le_city, le_state

X_processed, label_encoder_city, label_encoder_state = preprocess_data(
    X_engineered,
    use_engineered_features=True,
)

print(f"Final processed features shape: {X_processed.shape}")
print("Final features:", X_processed.columns.tolist())

# Discretize the continuous price into classes for Naive Bayes classification
print("\nCreating price categories for Naive Bayes classification...")

# Two binning schemes are built and printed; the 3-class one is used below
print("Testing different binning strategies...")

# 3-category binning: cut points at the 33rd and 67th price percentiles
price_quantiles_3 = y.quantile([0.33, 0.67]).values
price_bins_3 = [-np.inf, *price_quantiles_3, np.inf]
price_labels_3 = ['Low', 'Medium', 'High']
y_categorical_3 = pd.cut(y, bins=price_bins_3, labels=price_labels_3)

print("3-category distribution:")
print(y_categorical_3.value_counts().sort_index())

# 5-category binning: cut points at the 20/40/60/80th price percentiles
price_quantiles_5 = y.quantile([0.2, 0.4, 0.6, 0.8]).values
price_bins_5 = [-np.inf, *price_quantiles_5, np.inf]
price_labels_5 = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
y_categorical_5 = pd.cut(y, bins=price_bins_5, labels=price_labels_5)

print("\n5-category distribution:")
print(y_categorical_5.value_counts().sort_index())

# The 3-category target is the one carried forward
y_categorical, price_labels = y_categorical_3, price_labels_3

# Hold out 20% of the rows for testing, stratified on the price category
X_train, X_test, y_train, y_test, y_train_cat, y_test_cat = train_test_split(
    X_processed,
    y,
    y_categorical,
    test_size=0.2,
    stratify=y_categorical,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
