# Import necessary libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load and prepare the data


In [21]:
df = pd.read_csv('/content/Housing.csv')

In [22]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [24]:
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [25]:
categorical_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating',
                      'airconditioning', 'prefarea', 'furnishingstatus']
for col in categorical_columns:
    df[col] = df[col].astype('category')

In [26]:
if df.isnull().sum().sum() > 0:
    print("Missing values found:")
    print(df.isnull().sum())

# Step 2: Feature engineering

In [27]:
def create_features(df):
    df['area_per_bedroom'] = df['area'] / df['bedrooms']
    df['bed_bath_ratio'] = df['bedrooms'] / df['bathrooms'].replace(0, 1)
    df['total_rooms'] = df['bedrooms'] + df['bathrooms']
    df['parking_per_room'] = df['parking'] / (df['bedrooms'] + df['bathrooms']).replace(0, 1)
    df['luxury_score'] = (df['airconditioning'].cat.codes +
                         df['hotwaterheating'].cat.codes * 2 +
                         df['basement'].cat.codes +
                         df['guestroom'].cat.codes +
                         df['prefarea'].cat.codes * 2)
    df['area_per_story'] = df['area'] / df['stories'].replace(0, 1)
    df['amenities_count'] = df[['mainroad', 'guestroom', 'basement',
                               'hotwaterheating', 'airconditioning',
                               'prefarea']].eq('yes').sum(axis=1)
    df['luxury_area'] = df['luxury_score'] * df['area']
    df['stories_parking'] = df['stories'] * df['parking']
    return df

df = create_features(df)

# Step 3: Handle outliers (remove top and bottom 1%)


In [28]:
q_low = df["price"].quantile(0.01)
q_hi = df["price"].quantile(0.99)
df = df[(df["price"] < q_hi) & (df["price"] > q_low)]

# Step 4: Separate features and target with log transformation

In [30]:
X = df.drop('price', axis=1)
y = np.log1p(df['price'])

# Step 5: Define numeric and categorical features


In [31]:
numeric_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking',
                   'area_per_bedroom', 'bed_bath_ratio', 'total_rooms',
                   'parking_per_room', 'luxury_score', 'area_per_story',
                   'amenities_count', 'luxury_area', 'stories_parking']
categorical_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating',
                       'airconditioning', 'prefarea', 'furnishingstatus']

# Step 6: Create preprocessing steps


In [32]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 7: Create model pipelines


In [33]:
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    ))
])

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        random_state=42
    ))
])

xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        n_estimators=500,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

# Step 8: Split data


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 9: Train and evaluate models


In [35]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_test_exp = np.expm1(y_test)
    y_pred_exp = np.expm1(y_pred)
    print(f"\n{model.named_steps['regressor'].__class__.__name__} Performance:")
    print(f"Mean Absolute Error: ${mean_absolute_error(y_test_exp, y_pred_exp):,.2f}")
    print(f"Root Mean Squared Error: ${np.sqrt(mean_squared_error(y_test_exp, y_pred_exp)):,.2f}")
    print(f"R² Score: {r2_score(y_test_exp, y_pred_exp):.2f}")
    return model

# Evaluate models
print("Model Evaluation Results:")
rf_model = evaluate_model(rf_pipeline, X_train, y_train, X_test, y_test)
gb_model = evaluate_model(gb_pipeline, X_train, y_train, X_test, y_test)
xgb_model = evaluate_model(xgb_pipeline, X_train, y_train, X_test, y_test)

Model Evaluation Results:

RandomForestRegressor Performance:
Mean Absolute Error: $866,527.27
Root Mean Squared Error: $1,148,881.99
R² Score: 0.65

GradientBoostingRegressor Performance:
Mean Absolute Error: $884,443.51
Root Mean Squared Error: $1,161,678.30
R² Score: 0.64

XGBRegressor Performance:
Mean Absolute Error: $844,841.63
Root Mean Squared Error: $1,116,792.89
R² Score: 0.67


# Step 10: Select best model


In [36]:
models = {
    'RandomForest': rf_model,
    'GradientBoosting': gb_model,
    'XGBoost': xgb_model
}
best_model_name = max(models.keys(), key=lambda x: r2_score(np.expm1(y_test), np.expm1(models[x].predict(X_test))))
best_model = models[best_model_name]
print(f"\nSelected best model: {best_model_name}")


Selected best model: XGBoost


# Step 10.1: Cross-Validation

In [37]:
def perform_cross_validation(model, X, y, cv=5):
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(np.expm1(y_true), np.expm1(y_pred))), greater_is_better=False)
    r2_scorer = make_scorer(r2_score)
    mae_scores = cross_val_score(model, X, y, cv=cv, scoring=mae_scorer, n_jobs=-1)
    rmse_scores = cross_val_score(model, X, y, cv=cv, scoring=rmse_scorer, n_jobs=-1)
    r2_scores = cross_val_score(model, X, y, cv=cv, scoring=r2_scorer, n_jobs=-1)
    mae_scores = -mae_scores
    rmse_scores = -rmse_scores
    print(f"\nCross-Validation Results ({cv}-fold):")
    print(f"Mean Absolute Error: ${np.mean(mae_scores):,.2f} ± ${np.std(mae_scores):,.2f}")
    print(f"Root Mean Squared Error: ${np.mean(rmse_scores):,.2f} ± ${np.std(rmse_scores):,.2f}")
    print(f"R² Score: {np.mean(r2_scores):.2f} ± {np.std(r2_scores):.2f}")

# Perform cross-validation
print(f"\nPerforming cross-validation for {best_model_name}")
perform_cross_validation(xgb_pipeline, X, y)


Performing cross-validation for XGBoost

Cross-Validation Results (5-fold):
Mean Absolute Error: $0.24 ± $0.09
Root Mean Squared Error: $1,333,867.05 ± $664,495.58
R² Score: -10.27 ± 3.83


# Step 11: Define test input

In [38]:
test_input = {
    'area': 7420,
    'bedrooms': 4,
    'bathrooms': 2,
    'stories': 3,
    'mainroad': 'yes',
    'guestroom': 'no',
    'basement': 'no',
    'hotwaterheating': 'no',
    'airconditioning': 'yes',
    'parking': 2,
    'prefarea': 'yes',
    'furnishingstatus': 'furnished'
}

# Prepare test input
test_df = pd.DataFrame([test_input])
for col in categorical_columns:
    test_df[col] = test_df[col].astype('category')
test_df = create_features(test_df)

# Step 11.1: Prediction Interval for Test Input

In [39]:
def get_prediction_interval(model, X_train, y_train, test_df, n_iterations=50, alpha=0.95):
    predictions = []
    for seed in range(n_iterations):
        temp_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', XGBRegressor(
                n_estimators=500,
                learning_rate=0.01,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed
            ))
        ])
        temp_pipeline.fit(X_train, y_train)
        log_pred = temp_pipeline.predict(test_df)
        pred_price = np.expm1(log_pred)[0]
        predictions.append(pred_price)
    predictions = np.array(predictions)
    lower_bound = np.percentile(predictions, (1 - alpha) / 2 * 100)
    upper_bound = np.percentile(predictions, (1 + alpha) / 2 * 100)
    mean_pred = np.mean(predictions)
    return mean_pred, lower_bound, upper_bound

# Calculate prediction interval
mean_pred, lower_bound, upper_bound = get_prediction_interval(xgb_pipeline, X_train, y_train, test_df)
print("\nTest Prediction with Prediction Interval:")
print(f"Predicted house price (mean): ${mean_pred:,.2f}")
print(f"95% Prediction Interval: ${lower_bound:,.2f} - ${upper_bound:,.2f}")
print(f"Actual house price: $13,300,000.00")
print(f"Difference from mean prediction: ${abs(13300000 - mean_pred):,.2f}")
print(f"Interval width: ${(upper_bound - lower_bound):,.2f}")

# Single prediction
log_pred = best_model.predict(test_df)
predicted_price = np.expm1(log_pred)[0]
print("\nTest Prediction:")
print(f"Predicted house price: ${predicted_price:,.2f}")
print(f"Actual house price: $13,300,000.00")
print(f"Difference: ${abs(13300000 - predicted_price):,.2f}")


Test Prediction with Prediction Interval:
Predicted house price (mean): $6,155,001.50
95% Prediction Interval: $5,991,950.00 - $6,306,432.50
Actual house price: $13,300,000.00
Difference from mean prediction: $7,144,998.50
Interval width: $314,482.50

Test Prediction:
Predicted house price: $6,279,592.50
Actual house price: $13,300,000.00
Difference: $7,020,407.50


# Step 11: Test with the specific example


In [40]:
print("\nTesting model on 5 test samples:")
sample_indices = np.random.choice(len(X_test), 5, replace=False)
for i, idx in enumerate(sample_indices, 1):
    test_sample = X_test.iloc[idx].to_dict()
    actual_price = np.expm1(y_test.iloc[idx])
    sample_df = pd.DataFrame([test_sample])
    for col in categorical_columns:
        sample_df[col] = sample_df[col].astype('category')
    sample_df = create_features(sample_df)
    log_pred = best_model.predict(sample_df)
    predicted_price = np.expm1(log_pred)[0]
    print(f"\nSample {i}:")
    print(f"Actual Price: ${actual_price:,.2f}")
    print(f"Predicted Price: ${predicted_price:,.2f}")
    print(f"Difference: ${abs(actual_price - predicted_price):,.2f}")
    print(f"Percentage Error: {abs(actual_price - predicted_price)/actual_price*100:.2f}%")


Testing model on 5 test samples:

Sample 1:
Actual Price: $6,440,000.00
Predicted Price: $4,107,690.00
Difference: $2,332,310.00
Percentage Error: 36.22%

Sample 2:
Actual Price: $2,450,000.00
Predicted Price: $3,010,826.75
Difference: $560,826.75
Percentage Error: 22.89%

Sample 3:
Actual Price: $5,600,000.00
Predicted Price: $4,275,939.50
Difference: $1,324,060.50
Percentage Error: 23.64%

Sample 4:
Actual Price: $3,675,000.00
Predicted Price: $3,380,311.50
Difference: $294,688.50
Percentage Error: 8.02%

Sample 5:
Actual Price: $6,790,000.00
Predicted Price: $6,041,185.00
Difference: $748,815.00
Percentage Error: 11.03%
