<center><h1>Modeling & Prediction</h1></center>  
<center><h6>AI-Powered Price Optimization</h6></center>  

This notebook represents the third stage of our **Airbnb Smart Pricing project** ETL pipeline. Having prepared and cleaned our dataset, we now focus on building a machine learning model that predicts the optimal nightly price for Airbnb listings. The goal is to develop a model that provides pricing recommendations with real business value. These predictions will feed into the final dashboard and business pitch.


In [1]:
# Load necessary libraries
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import (LinearRegression, Ridge)

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, make_scorer

In [2]:
# Load processed dataset
airbnb = pd.read_csv("../data/processed/airbnb.csv")
airbnb.head(5)

Unnamed: 0,id,host_id,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,...,neighbourhood_cleansed_South Kohala,neighbourhood_cleansed_South Kona,neighbourhood_cleansed_Waianae,neighbourhood_cleansed_Wailuku-Kahului,neighbourhood_cleansed_Waimea-Kekaha,season_Spring,season_Summer,season_Winter,price_diff_from_calendar,price_ratio_calendar
0,5269,7620,20.0274,-155.702,2,1.0,1.0,1.0,128.0,3,...,True,False,False,False,False,False,False,True,-57.0,0.691892
1,7896,21844,20.75684,-156.45631,4,2.0,1.0,2.0,211.0,4,...,False,False,False,False,False,False,False,True,11.0,1.055
2,218378,547507,20.9593,-156.68347,4,1.0,1.0,2.0,259.0,2,...,False,False,False,False,False,False,True,False,-16.0,0.941818
3,266804,1399029,20.88678,-156.50198,4,1.0,2.0,2.0,228.0,7,...,False,False,False,True,False,False,False,True,-151.0,0.601583
4,13899,54660,20.94837,-156.69075,4,1.0,1.0,2.0,280.0,7,...,False,False,False,False,False,False,True,False,-5.0,0.982456


In [3]:
# Load preprocessed dataset
airbnb_preprocessed = pd.read_csv("../data/processed/airbnb_preprocessed.csv")
airbnb_preprocessed.head(5)

Unnamed: 0,id,host_id,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,...,neighbourhood_cleansed_South Kohala,neighbourhood_cleansed_South Kona,neighbourhood_cleansed_Waianae,neighbourhood_cleansed_Wailuku-Kahului,neighbourhood_cleansed_Waimea-Kekaha,season_Spring,season_Summer,season_Winter,price_diff_from_calendar,price_ratio_calendar
0,5269,7620,-1.899539,1.493498,-1.485606,-0.673958,-0.332557,-1.326195,128.0,0.402393,...,1,0,0,0,0,0,0,1,-0.77122,-1.363606
1,7896,21844,-0.705963,0.879013,-0.128143,1.062898,-0.332557,-0.219855,211.0,1.03071,...,0,0,0,0,0,0,0,1,0.209446,0.185083
2,218378,547507,-0.374679,0.693961,-0.128143,-0.673958,-0.332557,-0.219855,259.0,-0.225925,...,0,0,0,0,0,0,1,0,-0.179936,-0.297647
3,266804,1399029,-0.493343,0.841809,-0.128143,-0.673958,0.96887,-0.219855,228.0,2.915661,...,0,0,0,1,0,0,0,1,-2.126846,-1.748781
4,13899,54660,-0.392564,0.68803,-0.128143,-0.673958,-0.332557,-0.219855,280.0,2.915661,...,0,0,0,0,0,0,1,0,-0.021299,-0.124323


In [5]:
# Define target and features
columns_to_drop = ['price', 'id', 'host_id']
X = airbnb_preprocessed.drop(columns=columns_to_drop)
y = airbnb_preprocessed[target]

In [6]:
# Train-test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (7012, 136)
y_train shape: (7012,)
X_val shape: (1403, 136)
y_val shape: (1403,)
X_test shape: (351, 136)
y_test shape: (351,)


#### Regression Models

In [7]:
# Define models
models = [
    LinearRegression(),
    Ridge(),
    RandomForestRegressor()
]

train_scores = {}
val_scores = {}

# Train and evaluate
for model in tqdm(models, desc="Training models", unit="model"):
    name = model.__class__.__name__
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    train_scores[name] = {
        'MAPE': mean_absolute_percentage_error(y_train, y_train_pred),
        'R2': r2_score(y_train, y_train_pred),
        'MAE': mean_absolute_error(y_train, y_train_pred),
        'RMSE': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'CV_MAPE': -cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_percentage_error', cv=5).mean()
    }

    val_scores[name] = {
        'MAPE': mean_absolute_percentage_error(y_val, y_val_pred),
        'R2': r2_score(y_val, y_val_pred),
        'MAE': mean_absolute_error(y_val, y_val_pred),
        'RMSE': np.sqrt(mean_squared_error(y_val, y_val_pred))
    }
    

# Convert to DataFrame for comparison
train_scores_df = pd.DataFrame(train_scores).T
val_scores_df = pd.DataFrame(val_scores).T

print("Training Scores:\n", train_scores_df)
print("\nValidation Scores:\n", val_scores_df)

Training models: 100%|████████████████████████████████████████████████████████████████| 3/3 [02:20<00:00, 46.97s/model]

Training Scores:
                                MAPE        R2           MAE          RMSE  \
LinearRegression       1.072802e-15  1.000000  2.455932e-13  3.091700e-13   
Ridge                  1.444347e-04  1.000000  3.406324e-02  6.701214e-02   
RandomForestRegressor  3.093962e-03  0.999853  7.988534e-01  1.607580e+00   

                            CV_MAPE  
LinearRegression       6.957229e-16  
Ridge                  1.848292e-04  
RandomForestRegressor  8.952182e-03  

Validation Scores:
                            MAPE        R2       MAE      RMSE
LinearRegression       0.000008  1.000000  0.000549  0.020566
Ridge                  0.000137  1.000000  0.033202  0.045875
RandomForestRegressor  0.007977  0.999134  1.957463  3.892752





#### Hyperparameter Tuning

In [8]:
# Define models
models = [
    # LinearRegression(),
    Ridge(),
    RandomForestRegressor(random_state=42)
]

# Define hyperparameter grids
param_grids = {
    # Best Parameters for Ridge: {'alpha': 1.0}
    'Ridge': {
        'alpha': [0.01, 0.1, 1.0, 10.0]
    },
    # Best Parameters for RandomForestRegressor: 
    # {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3,
    # 'n_estimators': 150}
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, None],
        'min_samples_split': [2, 3],
        'min_samples_leaf': [1, 2]
    }
    # LinearRegression has no hyperparameters worth tuning
}

best_scores = {}

# Train and evaluate
for model in tqdm(models, desc="Training models", unit="model"):
    model_name = model.__class__.__name__

    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"\n Best Parameters for {model_name}: {best_params}")
   
    y_train_pred = best_model.predict(X_train)

    best_scores[model_name] = {
        'MAPE': mean_absolute_percentage_error(y_train, y_train_pred),
        'R2': r2_score(y_train, y_train_pred),
        'MAE': mean_absolute_error(y_train, y_train_pred),
        'RMSE': np.sqrt(mean_squared_error(y_train, y_train_pred))
    }

# Convert to DataFrame for comparison
best_scores_df = pd.DataFrame(best_scores).T

print("\n Best Training Scores:\n", best_scores_df)

Training models:   0%|                                                                        | 0/2 [00:00<?, ?model/s]

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Training models:  50%|████████████████████████████████                                | 1/2 [00:03<00:03,  3.53s/model]


 Best Parameters for Ridge: {'alpha': 0.01}
Fitting 5 folds for each of 24 candidates, totalling 120 fits

 Best Parameters for RandomForestRegressor: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


Training models: 100%|███████████████████████████████████████████████████████████████| 2/2 [05:30<00:00, 165.34s/model]


 Best Training Scores:
                            MAPE        R2       MAE      RMSE
Ridge                  0.000002  1.000000  0.000495  0.002017
RandomForestRegressor  0.002987  0.999862  0.762679  1.561429





In [9]:
# Define the best model
models = [
    Ridge(alpha=0.01)
]

test_scores = {}

# Test and evaluate
for model in tqdm(models, desc="Testing models", unit="model"):
    name = model.__class__.__name__
    model.fit(X_train, y_train)

    y_test_pred = model.predict(X_test)
    
    test_scores[name] = {
        'MAPE': mean_absolute_percentage_error(y_test, y_test_pred),
        'R2': r2_score(y_test, y_test_pred),
        'MAE': mean_absolute_error(y_test, y_test_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred))
    }

# Convert to DataFrame for comparison
test_scores_df = pd.DataFrame(test_scores).T

print("Testing Scores:\n", test_scores_df)

Testing models: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.98model/s]

Testing Scores:
            MAPE   R2       MAE      RMSE
Ridge  0.000002  1.0  0.000478  0.000727





In [10]:
# Choose best model based on R²
best_model = models[0]

# For Ridge
coef = pd.Series(best_model.coef_, index=X.columns)
top_coef = coef.abs().sort_values(ascending=False).head(10)

top_coef

avg_price_calendar                          79.405827
price_delta_from_area_avg                   53.809988
avg_price_neighborhood                      50.143359
price_diff_from_calendar                    41.280981
neighbourhood_cleansed_Koloa-Poipu           2.236674
neighbourhood_cleansed_South Kohala          2.080871
neighbourhood_cleansed_Lahaina               1.976399
neighbourhood_cleansed_Kihei-Makena          1.694282
neighbourhood_cleansed_Ewa                   1.604098
neighbourhood_cleansed_North Shore Kauai     1.530333
dtype: float64

In [11]:
# Predicting price for one listing (just as a sample)
sample = X_test.iloc[[9]]
predicted_price = best_model.predict(sample)[0]
print(f"Predicted Price: ${predicted_price:.2f}")

Predicted Price: $223.00


#### Preparing data for visualization

In [33]:
# Use test indices to get corresponding original values
original_test_data = airbnb.loc[X_test.index]

# Create output DataFrame
df_results = original_test_data.copy()
df_results['predicted_price'] = y_test_pred
df_results['error'] = abs(df_results['predicted_price'] - df_results['price'])
df_results["error_pct"] = df_results["error"] / df_results["price"] * 100
df_results

Unnamed: 0,id,host_id,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,...,neighbourhood_cleansed_Wailuku-Kahului,neighbourhood_cleansed_Waimea-Kekaha,season_Spring,season_Summer,season_Winter,price_diff_from_calendar,price_ratio_calendar,predicted_price,error,error_pct
6660,966096905792793006,42612930,20.712165,-156.444286,4,2.0,1.0,2.0,349.0,1,...,False,False,False,False,True,81.0,1.302239,349.000091,0.000091,0.000026
8764,1365607947371843759,666359230,21.274725,-157.820931,4,1.0,1.0,1.0,185.0,2,...,False,False,True,False,False,-25.0,0.880952,185.000211,0.000211,0.000114
3777,54051915,1192698,22.222580,-159.491290,2,1.0,1.0,1.0,237.0,4,...,False,False,False,False,True,-76.0,0.757188,237.000012,0.000012,0.000005
19,200441,980100,20.709860,-156.438280,2,1.0,1.0,2.0,280.0,1,...,False,False,False,True,False,86.0,1.443299,280.000850,0.000850,0.000304
2807,48317896,12747165,20.758360,-156.458010,2,1.0,0.0,1.0,166.0,4,...,False,False,False,False,True,-133.0,0.555184,165.999908,0.000092,0.000055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670,33642835,54660,20.879340,-156.683440,4,1.0,1.0,2.0,280.0,7,...,False,False,False,False,True,51.0,1.222707,280.000763,0.000763,0.000272
5649,811643168070085199,457152913,22.222363,-159.477974,4,1.0,1.0,1.0,224.0,2,...,False,False,False,False,False,15.0,1.071770,223.999976,0.000024,0.000011
736,19048407,31214940,21.871360,-159.443740,6,2.0,2.0,3.0,301.0,2,...,False,False,False,False,True,82.0,1.374429,301.000141,0.000141,0.000047
3145,50247407,97006413,21.872780,-159.449080,4,2.0,2.0,3.0,431.0,1,...,False,False,False,False,True,131.0,1.436667,430.999861,0.000139,0.000032


In [34]:
# Put together neighbourhood_cleansed
neighbourhood_cleansed_cols = [col for col in df_results.columns if col.startswith('neighbourhood_cleansed_')]
df_results['neighbourhood_cleansed'] = df_results[neighbourhood_cleansed_cols].idxmax(axis=1).str.replace('neighbourhood_cleansed_', '')
df_results.drop(columns=neighbourhood_cleansed__cols, inplace=True)

# Put together season
season_cols = [col for col in df_results.columns if col.startswith('season_')]
df_results['season'] = df_results[season_cols].idxmax(axis=1).str.replace('season_', '')
df_results.drop(columns=season_cols, inplace=True)

# Put together room_type
room_type_cols = [col for col in df_results.columns if col.startswith('room_type_')]
df_results['room_type'] = df_results[room_type_cols].idxmax(axis=1).str.replace('room_type_', '')
df_results.drop(columns=room_type_cols, inplace=True)

# Put together property
property_cols = [col for col in df_results.columns if col.startswith('property_')]
df_results['property'] = df_results[property_cols].idxmax(axis=1).str.replace('property_', '')
df_results.drop(columns=property_cols, inplace=True)

In [36]:
# Save for next steps
df_results.to_csv("../data/results/test_results.csv", index=False)

print("Preprocessing complete. Saved results with shape:", df_results.shape)

Preprocessing complete. Saved results with shape: (351, 31)
