In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from google.colab import drive



# Mount Google Drive so the dataset stored there is reachable by path.
drive.mount('/content/drive')
path = '/content/drive/My Drive/BigData_grp_assignment/apartments10.csv'

# sep=None asks the python parsing engine to detect the delimiter itself.
df = pd.read_csv(path, encoding="cp1252", engine="python", sep=None)

# Normalise the headers step by step: trim, lower-case, then collapse every
# run of whitespace into a single underscore.
stripped_names = df.columns.str.strip()
lowered_names = stripped_names.str.lower()
df.columns = lowered_names.str.replace(r"\s+", "_", regex=True)

print(df.shape)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(10000, 22)


Unnamed: 0,id,category,title,body,amenities,bathrooms,bedrooms,currency,fee,has_photo,...,price_display,price_type,square_feet,address,cityname,state,latitude,longitude,source,time
0,5668626895,housing/rent/apartment,"Studio apartment 2nd St NE, Uhland Terrace NE,...","This unit is located at second St NE, Uhland T...",,,0.0,USD,No,Thumbnail,...,$790,Monthly,101,,Washington,DC,38.9057,-76.9861,RentLingo,1577359415
1,5664597177,housing/rent/apartment,Studio apartment 814 Schutte Road,"This unit is located at 814 Schutte Road, Evan...",,,1.0,USD,No,Thumbnail,...,$425,Monthly,106,814 Schutte Rd,Evansville,IN,37.968,-87.6621,RentLingo,1577017063
2,5668626833,housing/rent/apartment,"Studio apartment N Scott St, 14th St N, Arling...","This unit is located at N Scott St, 14th St N,...",,1.0,0.0,USD,No,Thumbnail,...,"$1,390",Monthly,107,,Arlington,VA,38.891,-77.0816,RentLingo,1577359410
3,5659918074,housing/rent/apartment,Studio apartment 1717 12th Ave,"This unit is located at 1717 12th Ave, Seattle...",,1.0,0.0,USD,No,Thumbnail,...,$925,Monthly,116,1717 12th Avenue,Seattle,WA,47.616,-122.3275,RentLingo,1576667743
4,5668626759,housing/rent/apartment,"Studio apartment Washington Blvd, N Cleveland ...","This unit is located at Washington Blvd, N Cle...",,,0.0,USD,No,Thumbnail,...,$880,Monthly,125,,Arlington,VA,38.8738,-77.1055,RentLingo,1577359401


In [None]:
print("\n2. DATA PREPARATION")
print("-" * 40)

# Drop price_display when it is present; the preview above shows it holding
# display strings such as "$790" rather than numbers.
print("Removing 'price_display' ")
df = df.drop(columns=['price_display']) if 'price_display' in df.columns else df

# feature engineering, adding new columns
class ApartmentFeatureEngineer(BaseEstimator, TransformerMixin):
    """Add hand-crafted apartment features to a feature DataFrame.

    Each feature group is only created when the columns it needs are present:

    * ``square_feet``  -> ``has_square_feet``, ``sqft_log``
    * ``amenities``    -> ``amenity_count``, ``has_dishwasher``, ``has_elevator``,
      ``has_pool``, ``has_parking``
    * ``state``        -> ``high_cost_location``, ``medium_cost_location``
    * ``bedrooms`` and ``bathrooms`` -> ``total_rooms``, ``room_efficiency``,
      ``is_studio``
    * ``latitude`` and ``longitude`` -> ``lat_bin``, ``lon_bin``

    Parameters
    ----------
    use_new_features : bool, default=True
        When False, ``transform`` returns an unchanged copy of its input.

    Attributes
    ----------
    medians_ : dict
        Column medians learned by ``fit`` (used to impute missing values).
    geo_bin_edges_ : dict
        Equal-width bin edges learned by ``fit`` for latitude/longitude.

    Notes
    -----
    ``fit`` learns the medians and bin edges so that training and test data are
    encoded with the same statistics. Calling ``transform`` on an unfitted
    instance is still supported: the statistics are then derived from the frame
    being transformed, which is what the class did before it learned state.
    """

    # Number of equal-width bins for the latitude / longitude features.
    N_GEO_BINS = 5
    HIGH_COST_STATES = ('NY', 'CA', 'WA', 'MA')
    MEDIUM_COST_STATES = ('DC', 'VA', 'IL')

    def __init__(self, use_new_features=True):
        self.use_new_features = use_new_features

    def fit(self, X, y=None):
        """Learn imputation medians and geographic bin edges from ``X``.

        Nothing is learned when ``use_new_features`` is False, so ``fit`` stays a
        no-op for the pass-through configuration. ``y`` is ignored.
        Returns ``self``.
        """
        self.medians_ = {}
        self.geo_bin_edges_ = {}
        if not self.use_new_features:
            return self

        for column in ('square_feet', 'latitude', 'longitude'):
            if column in X.columns:
                self.medians_[column] = X[column].median()

        for column in ('latitude', 'longitude'):
            if column not in X.columns:
                continue
            filled = X[column].fillna(self.medians_[column])
            # An all-missing column has no range to bin; leave it to transform.
            if filled.notna().any():
                _, edges = pd.cut(filled, bins=self.N_GEO_BINS, labels=False, retbins=True)
                self.geo_bin_edges_[column] = edges
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended."""
        X_transformed = X.copy()
        if not self.use_new_features:
            return X_transformed

        self._add_size_features(X_transformed)
        self._add_amenity_features(X_transformed)
        self._add_location_features(X_transformed)
        self._add_room_features(X_transformed)
        self._add_geo_bins(X_transformed)
        return X_transformed

    def _median(self, frame, column):
        """Return the learned median for ``column``, else the median of ``frame``."""
        learned = getattr(self, 'medians_', {})
        if column in learned:
            return learned[column]
        return frame[column].median()

    def _add_size_features(self, frame):
        """Add the square-feet availability flag and log-transformed area."""
        if 'square_feet' not in frame.columns:
            return
        square_feet = frame['square_feet']
        frame['has_square_feet'] = (~square_feet.isna()).astype(int)
        # log1p of the (median-imputed) area, to compress its long right tail.
        frame['sqft_log'] = np.log1p(square_feet.fillna(self._median(frame, 'square_feet')))

    @staticmethod
    def _count_amenities(amenities):
        """Count the non-blank, comma-separated entries in an amenities string."""
        return sum(1 for item in amenities.split(',') if item.strip())

    def _add_amenity_features(self, frame):
        """Add the amenity count and indicators for selected amenities."""
        if 'amenities' not in frame.columns:
            return
        amenities_filled = frame['amenities'].fillna('')
        frame['amenity_count'] = amenities_filled.apply(self._count_amenities)
        # Patterns are regular expressions matched case-insensitively.
        indicator_patterns = (
            ('has_dishwasher', 'Dishwasher'),
            ('has_elevator', 'Elevator'),
            ('has_pool', 'Pool'),
            ('has_parking', 'Parking|Garage'),
        )
        for feature_name, pattern in indicator_patterns:
            frame[feature_name] = amenities_filled.str.contains(pattern, case=False, na=False).astype(int)

    def _add_location_features(self, frame):
        """Add indicators for the hard-coded high/medium cost state lists."""
        if 'state' not in frame.columns:
            return
        frame['high_cost_location'] = frame['state'].isin(list(self.HIGH_COST_STATES)).astype(int)
        frame['medium_cost_location'] = frame['state'].isin(list(self.MEDIUM_COST_STATES)).astype(int)

    def _add_room_features(self, frame):
        """Add total rooms, rooms per 100 sq ft, and the studio indicator."""
        if 'bedrooms' not in frame.columns or 'bathrooms' not in frame.columns:
            return
        bedrooms_filled = frame['bedrooms'].fillna(0)
        bathrooms_filled = frame['bathrooms'].fillna(0)
        total_rooms = bedrooms_filled + bathrooms_filled
        frame['total_rooms'] = total_rooms

        if 'square_feet' in frame.columns:
            square_feet = frame['square_feet']
            has_area = square_feet.notna() & (square_feet > 0)
            # Rooms per 100 sq ft; rows without a usable area keep the room count.
            frame['room_efficiency'] = np.where(
                has_area,
                total_rooms / (square_feet / 100),
                total_rooms,
            )
        else:
            # No area column at all: same fallback as a row with missing area.
            frame['room_efficiency'] = total_rooms

        # Studio apartment indicator (0 bedrooms)
        frame['is_studio'] = (bedrooms_filled == 0).astype(int)

    def _geo_bin(self, values, column):
        """Bin ``values`` with the learned edges for ``column`` when available."""
        edges = getattr(self, 'geo_bin_edges_', {}).get(column)
        if edges is None:
            # Unfitted: equal-width bins over the range of the data at hand.
            return pd.cut(values, bins=self.N_GEO_BINS, labels=False)
        # Clip so coordinates outside the fitted range land in the outer bins
        # instead of becoming NaN.
        clipped = values.clip(lower=edges[0], upper=edges[-1])
        return pd.cut(clipped, bins=edges, labels=False, include_lowest=True)

    def _add_geo_bins(self, frame):
        """Add coarse latitude/longitude bin indices."""
        if 'latitude' not in frame.columns or 'longitude' not in frame.columns:
            return
        lat_filled = frame['latitude'].fillna(self._median(frame, 'latitude'))
        lon_filled = frame['longitude'].fillna(self._median(frame, 'longitude'))
        frame['lat_bin'] = self._geo_bin(lat_filled, 'latitude')
        frame['lon_bin'] = self._geo_bin(lon_filled, 'longitude')

# Prepare features and target


def prepare_features(df):
    """Return a copy of ``df`` with the key modelling columns imputed.

    ``bedrooms``, ``bathrooms`` and ``square_feet`` have their missing values
    replaced by the respective column median; missing ``amenities`` become the
    empty string. The input frame itself is not modified.
    """
    prepared = df.copy()

    # Numeric columns: impute with the column median, one column at a time.
    for numeric_col in ('bedrooms', 'bathrooms', 'square_feet'):
        column_median = prepared[numeric_col].median()
        prepared[numeric_col] = prepared[numeric_col].fillna(column_median)

    # Text column: a missing amenities entry is treated as "no amenities listed".
    prepared['amenities'] = prepared['amenities'].fillna('')

    return prepared

# Impute the raw frame, then separate the model inputs from the price target.
df_prepared = prepare_features(df)

# Columns judged informative enough to be used as model inputs.
feature_columns = [
    'bedrooms', 'bathrooms', 'square_feet', 'amenities',
    'cityname', 'state', 'latitude', 'longitude',
]
X = df_prepared.loc[:, feature_columns].copy()
y = df_prepared.loc[:, 'price'].copy()

print("Features selected: {}".format(feature_columns))
print("Target variable: price")
print("Feature matrix shape: {}".format(X.shape))
print("Target vector shape: {}".format(y.shape))

# Derive the engineered columns on top of the selected inputs.
feature_engineer = ApartmentFeatureEngineer(use_new_features=True)
X_engineered = feature_engineer.transform(X)

print("After feature engineering: {}".format(X_engineered.shape))
is_new_column = ~X_engineered.columns.isin(feature_columns)
print("New features created:", X_engineered.columns[is_new_column].tolist())

# Manual preprocessing
def preprocess_data(X_eng, use_engineered_features=True):
    """Encode the categorical columns and build the final numeric feature matrix.

    Parameters
    ----------
    X_eng : pandas.DataFrame
        Frame containing at least ``bedrooms``, ``bathrooms``, ``square_feet``,
        ``latitude``, ``longitude``, ``cityname`` and ``state``; engineered
        columns are picked up when present.
    use_engineered_features : bool, default=True
        When True, every known engineered column found in ``X_eng`` is appended
        to the basic feature set.

    Returns
    -------
    tuple
        ``(X_final, le_city, le_state)``: the selected, NaN-free feature frame
        and the two fitted label encoders for ``cityname`` and ``state``.
    """
    X_processed = X_eng.copy()

    le_city = LabelEncoder()
    le_state = LabelEncoder()

    # astype(str) turns missing values into the literal 'nan', which the
    # encoders then treat as one more category.
    X_processed['cityname_encoded'] = le_city.fit_transform(X_processed['cityname'].astype(str))
    X_processed['state_encoded'] = le_state.fit_transform(X_processed['state'].astype(str))

    # Select final features for modeling based on what's available
    basic_features = ['bedrooms', 'bathrooms', 'square_feet', 'latitude', 'longitude',
                      'cityname_encoded', 'state_encoded']
    final_features = list(basic_features)

    if use_engineered_features:
        # Only include engineered features that actually exist in the dataframe
        potential_features = ['amenity_count', 'has_square_feet', 'high_cost_location', 'medium_cost_location',
                              'total_rooms', 'sqft_log', 'has_dishwasher', 'has_elevator', 'has_pool',
                              'has_parking', 'room_efficiency', 'is_studio', 'lat_bin', 'lon_bin']
        final_features += [feat for feat in potential_features if feat in X_processed.columns]

    X_final = X_processed[final_features].copy()

    # Fill any remaining missing values: median for numeric columns, mode otherwise.
    for col in X_final.columns:
        column = X_final[col]
        if not column.isna().any():
            continue  # nothing to impute, skip the statistic altogether

        # Dtype introspection instead of comparing against 'float64'/'int64', so
        # float32/int32/nullable numeric columns also get the median treatment.
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))
        if is_numeric:
            fill_value = column.median()
        else:
            modes = column.mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 0

        # An all-missing column has no median; fall back to 0 so no NaN survives.
        if pd.isna(fill_value):
            fill_value = 0
        X_final[col] = column.fillna(fill_value)

    return X_final, le_city, le_state

X_processed, label_encoder_city, label_encoder_state = preprocess_data(
    X_engineered,
    use_engineered_features=True,
)

print("Final processed features shape: {}".format(X_processed.shape))
print("Final features:", X_processed.columns.tolist())

# Naive Bayes is used as a classifier here, so the continuous price is bucketed.
print("\nCreating price categories for Naive Bayes classification...")

# Fewer buckets are tried first (3 rather than 5).
print("Testing different binning strategies...")

# Three buckets: cut points at the 33% and 67% price quantiles.
price_quantiles_3 = y.quantile([0.33, 0.67]).values
price_bins_3 = [-np.inf, *price_quantiles_3, np.inf]
price_labels_3 = ['Low', 'Medium', 'High']
y_categorical_3 = pd.cut(y, bins=price_bins_3, labels=price_labels_3)

print("3-category distribution:")
print(y_categorical_3.value_counts().sort_index())

# Five buckets: cut points at the 20/40/60/80% price quantiles.
price_quantiles_5 = y.quantile([0.2, 0.4, 0.6, 0.8]).values
price_bins_5 = [-np.inf, *price_quantiles_5, np.inf]
price_labels_5 = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
y_categorical_5 = pd.cut(y, bins=price_bins_5, labels=price_labels_5)

print("\n5-category distribution:")
print(y_categorical_5.value_counts().sort_index())

# The 3-bucket target is the one carried forward for now.
y_categorical, price_labels = y_categorical_3, price_labels_3

# Split data into train and test sets (80-20 split), stratified on the price
# category so every bucket is represented proportionally in both parts.
# The seed is fixed so the split, and every metric computed from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test, y_train_cat, y_test_cat = train_test_split(
    X_processed, y, y_categorical,
    test_size=0.2,
    random_state=42,
    stratify=y_categorical,
)

# Sanity check: the split keeps every row exactly once.
assert len(X_train) + len(X_test) == len(X_processed)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Scale the features for better Naive Bayes performance; the scaler is fitted
# on the training rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



2. DATA PREPARATION
----------------------------------------
Removing 'price_display' 
Features selected: ['bedrooms', 'bathrooms', 'square_feet', 'amenities', 'cityname', 'state', 'latitude', 'longitude']
Target variable: price
Feature matrix shape: (10000, 8)
Target vector shape: (10000,)
After feature engineering: (10000, 22)
New features created: ['has_square_feet', 'sqft_log', 'amenity_count', 'has_dishwasher', 'has_elevator', 'has_pool', 'has_parking', 'high_cost_location', 'medium_cost_location', 'total_rooms', 'room_efficiency', 'is_studio', 'lat_bin', 'lon_bin']
Final processed features shape: (10000, 21)
Final features: ['bedrooms', 'bathrooms', 'square_feet', 'latitude', 'longitude', 'cityname_encoded', 'state_encoded', 'amenity_count', 'has_square_feet', 'high_cost_location', 'medium_cost_location', 'total_rooms', 'sqft_log', 'has_dishwasher', 'has_elevator', 'has_pool', 'has_parking', 'room_efficiency', 'is_studio', 'lat_bin', 'lon_bin']

Creating price categories for Nai

In [None]:
print("\n3. NAIVE BAYES MODEL TRAINING")
print("-" * 40)

# Fit a Gaussian Naive Bayes classifier on the scaled training features.
print("Training Naive Bayes classifier...")
nb_model = GaussianNB().fit(X_train_scaled, y_train_cat)

# Hard labels and per-class probabilities for the held-out rows.
y_pred_cat = nb_model.predict(X_test_scaled)
y_pred_proba = nb_model.predict_proba(X_test_scaled)

print("Naive Bayes classification completed!")

# How often the predicted bucket equals the true bucket.
accuracy = accuracy_score(y_test_cat, y_pred_cat)
print("\nClassification Accuracy: {:.4f}".format(accuracy))

print("\nClassification Report:")
report_text = classification_report(y_test_cat, y_pred_cat)
print(report_text)

# Turn the predicted buckets back into dollar amounts so that regression-style
# error metrics can be reported alongside the classification metrics.
print("\nConverting categorical predictions to price estimates...")

# Representative price per bucket: the mean training price of that bucket, or
# the overall training mean when the bucket has no training rows.
category_price_map = {
    label: (
        y_train[y_train_cat == label].mean()
        if (y_train_cat == label).sum() > 0
        else y_train.mean()
    )
    for label in price_labels
}

print("Category to price mapping:")
for label_name, label_price in category_price_map.items():
    print(f"  {label_name}: ${label_price:,.2f}")

# One price estimate per test row, looked up from its predicted bucket.
y_pred_price = [category_price_map[predicted_label] for predicted_label in y_pred_cat]

# Regression metrics of the bucket-mean estimates against the true prices.
mse = mean_squared_error(y_test, y_pred_price)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_price)
r2 = r2_score(y_test, y_pred_price)

print("\nRegression Metrics:")
print(f"RMSE: ${rmse:,.2f}")
print(f"MAE: ${mae:,.2f}")
print(f"R² Score: {r2:.4f}")



3. NAIVE BAYES MODEL TRAINING
----------------------------------------
Training Naive Bayes classifier...
Naive Bayes classification completed!

Classification Accuracy: 0.5775

Classification Report:
              precision    recall  f1-score   support

        High       0.71      0.59      0.64       657
         Low       0.55      0.82      0.66       663
      Medium       0.48      0.33      0.39       680

    accuracy                           0.58      2000
   macro avg       0.58      0.58      0.56      2000
weighted avg       0.58      0.58      0.56      2000


Converting categorical predictions to price estimates...
Category to price mapping:
  Low: $823.70
  Medium: $1,278.80
  High: $2,391.14

Regression Metrics:
RMSE: $744.58
MAE: $486.52
R² Score: 0.2162


In [None]:
print("\n4. HYPERPARAMETER TUNING")
print("-" * 40)


# Candidate values for GaussianNB's only tunable setting used here.
param_grid = dict(
    var_smoothing=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
# on the training rows, using all available cores.
grid_search = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train_cat)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# Test different binning strategies
print("\nTesting different binning strategies...")


def evaluate_binning(quantiles, labels, var_smoothing):
    """Fit and score a GaussianNB model for one quantile-based price binning.

    Reads ``y``, ``y_train``, ``y_test``, ``X_train_scaled`` and
    ``X_test_scaled`` from the notebook scope.

    Parameters
    ----------
    quantiles : list of float
        Interior cut points, as quantiles of the full price vector ``y``.
    labels : list of str
        One label per resulting bucket (``len(quantiles) + 1`` entries).
    var_smoothing : float
        Value passed to ``GaussianNB``.

    Returns
    -------
    tuple
        ``(y_train_binned, y_test_binned, fitted_model, test_accuracy)``.
    """
    edges = [-np.inf] + list(y.quantile(quantiles).values) + [np.inf]
    train_binned = pd.cut(y_train, bins=edges, labels=labels)
    test_binned = pd.cut(y_test, bins=edges, labels=labels)

    model = GaussianNB(var_smoothing=var_smoothing)
    model.fit(X_train_scaled, train_binned)
    test_accuracy = accuracy_score(test_binned, model.predict(X_test_scaled))
    return train_binned, test_binned, model, test_accuracy


# Strategy name -> (quantile cut points, bucket labels)
binning_strategies = {
    '3-category': ([0.33, 0.67], ['Low', 'Medium', 'High']),
    '4-category': ([0.25, 0.5, 0.75], ['Low', 'Medium-Low', 'Medium-High', 'High']),
    '5-category': ([0.2, 0.4, 0.6, 0.8], ['Very Low', 'Low', 'Medium', 'High', 'Very High']),
}

best_var_smoothing = grid_search.best_params_['var_smoothing']
binning_runs = {
    name: evaluate_binning(quantiles, labels, best_var_smoothing)
    for name, (quantiles, labels) in binning_strategies.items()
}

# Keep the per-strategy names that the notebook has always exposed.
y_train_cat_3, y_test_cat_3, nb_3, acc_3 = binning_runs['3-category']
y_train_cat_4, y_test_cat_4, nb_4, acc_4 = binning_runs['4-category']
y_train_cat_5, y_test_cat_5, nb_5, acc_5 = binning_runs['5-category']

binning_results = {name: run[3] for name, run in binning_runs.items()}

# The loop variable is deliberately not called `accuracy`, so the accuracy
# computed in the model-training section is not overwritten.
print("Binning strategy results:")
for strategy, strategy_accuracy in binning_results.items():
    print(f"  {strategy}: {strategy_accuracy:.4f}")

# Use the best binning strategy
# NOTE(review): the winner is picked on test-set accuracy, and accuracies for
# different bucket counts have different chance levels (about 1/3 vs 1/5), so
# this comparison favours fewer buckets - confirm this is the intended criterion.
best_binning = max(binning_results.items(), key=lambda x: x[1])
print(f"\nBest binning strategy: {best_binning[0]} with accuracy {best_binning[1]:.4f}")

# Update the categorical variables to use the best binning
y_train_cat, y_test_cat, best_nb_model, _best_accuracy = binning_runs[best_binning[0]]
price_labels = list(binning_strategies[best_binning[0]][1])

# Test with and without feature engineering using the best binning
print(f"\nTesting feature engineering impact with {best_binning[0]} binning...")

# Without feature engineering
feature_engineer_basic = ApartmentFeatureEngineer(use_new_features=False)
X_basic = feature_engineer_basic.transform(X[feature_columns])
X_basic_processed, _, _ = preprocess_data(X_basic, use_engineered_features=False)

# Reuse exactly the rows of the main train/test split. Splitting again would
# shuffle independently of the main split, so the basic feature rows would no
# longer line up with y_train_cat / y_test_cat and the comparison would be
# meaningless.
X_train_basic = X_basic_processed.loc[X_train.index]
X_test_basic = X_basic_processed.loc[X_test.index]
assert X_train_basic.index.equals(y_train_cat.index)
assert X_test_basic.index.equals(y_test_cat.index)

# Separate scaler: re-fitting the shared `scaler` here would silently replace
# the statistics it learned for the engineered feature matrix.
scaler_basic = StandardScaler()
X_train_basic_scaled = scaler_basic.fit_transform(X_train_basic)
X_test_basic_scaled = scaler_basic.transform(X_test_basic)

# Train basic model with best binning
nb_basic = GaussianNB(var_smoothing=grid_search.best_params_['var_smoothing'])
nb_basic.fit(X_train_basic_scaled, y_train_cat)
y_pred_basic = nb_basic.predict(X_test_basic_scaled)

accuracy_basic = accuracy_score(y_test_cat, y_pred_basic)

print(f"Accuracy without feature engineering: {accuracy_basic:.4f}")
print(f"Accuracy with feature engineering: {best_binning[1]:.4f}")
print(f"Feature engineering improvement: {best_binning[1] - accuracy_basic:.4f}")

# Use the best model for final evaluation
final_pred_cat = best_nb_model.predict(X_test_scaled)
final_pred_proba = best_nb_model.predict_proba(X_test_scaled)

# Rebuild the bucket -> price map for the binning that was actually selected.
# The map built during model training only has the 3-category labels, so a
# 4- or 5-category winner would otherwise fail with a KeyError below. For a
# 3-category winner this reproduces the earlier map.
category_price_map = {
    label: (
        y_train[y_train_cat == label].mean()
        if (y_train_cat == label).any()
        else y_train.mean()  # bucket without training rows: overall mean
    )
    for label in price_labels
}
final_pred_price = [category_price_map[pred] for pred in final_pred_cat]

# Final metrics
final_accuracy = accuracy_score(y_test_cat, final_pred_cat)
final_mse = mean_squared_error(y_test, final_pred_price)
final_rmse = np.sqrt(final_mse)
final_mae = mean_absolute_error(y_test, final_pred_price)
final_r2 = r2_score(y_test, final_pred_price)

print(f"\nFinal Tuned Model Performance:")
print(f"Classification Accuracy: {final_accuracy:.4f}")
print(f"RMSE: ${final_rmse:,.2f}")
print(f"MAE: ${final_mae:,.2f}")
print(f"R² Score: {final_r2:.4f}")


4. HYPERPARAMETER TUNING
----------------------------------------
Best parameters: {'var_smoothing': 1e-10}
Best cross-validation accuracy: 0.5717

Testing different binning strategies...
Binning strategy results:
  3-category: 0.5775
  4-category: 0.4495
  5-category: 0.3740

Best binning strategy: 3-category with accuracy 0.5775

Testing feature engineering impact with 3-category binning...
Accuracy without feature engineering: 0.3350
Accuracy with feature engineering: 0.5775
Feature engineering improvement: 0.2425

Final Tuned Model Performance:
Classification Accuracy: 0.5775
RMSE: $744.58
MAE: $486.52
R² Score: 0.2162
