In [1]:
# Import required libraries
import sqlite3
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


In [2]:
# Create a connection to the SQL database
conn = sqlite3.connect('../data/airbnb_cleaned.db')

# Load data from the 'airbnb_cleaned' table
query = "SELECT * FROM airbnb_cleaned"
df = pd.read_sql(query, conn)

# Close the connection
conn.close()

In [3]:
# Create a copy of the DataFrame
df_class = df.copy()

In [4]:
# Map numeric ratings to categorical labels
def categorize_reviews(rating):
    """Assign category labels based on numeric review ratings."""
    if rating >= 4.5:
        return "A"
    elif rating >= 4.0:
        return "B"
    elif rating >= 3.5:
        return "C"
    elif rating >= 3.0:
        return "D"
    return "F"

# Apply the categorization function to the DataFrame
df_class['review_category'] = df_class['review rate number'].apply(categorize_reviews)

In [5]:
# Filter out rows where 'review_category' is 'F'
df_filtered = df_class.loc[df_class['review_category'] != 'F'].copy()

# Display the count of remaining categories with a cleaner format
remaining_counts = df_filtered['review_category'].value_counts()

print("Remaining review categories:")
for category, count in remaining_counts.items():
    print(f"- {category}: {count}")

Remaining review categories:
- A: 22921
- B: 22879
- D: 22842


In [6]:
# Calculate the 95th percentile for price and filter out high-price outliers
high_price_threshold = np.percentile(df_filtered['price'], 95)
df_filtered = df_filtered.loc[df_filtered['price'] <= high_price_threshold].copy()

# Display a summary of price statistics with improved formatting
price_stats = df_filtered['price'].describe()
print("Price Statistics (after removing outliers):")
print(price_stats.to_string())

Price Statistics (after removing outliers):
count    65277.000000
mean       596.554606
std        314.228309
min         50.000000
25%        326.000000
50%        595.000000
75%        870.000000
max       1143.000000


In [7]:
# Define features (X) and target variable (y)
X = df_filtered.loc[:, ~df_filtered.columns.isin(['review rate number', 'review_category'])]
y = df_filtered['review_category']

# Display shapes of X and y for verification
print("Features (X) shape:", X.shape)
print("Target variable (y) shape:", y.shape)

Features (X) shape: (65277, 16)
Target variable (y) shape: (65277,)


In [8]:
# Encode categorical features using One-Hot Encoding
X_encoded = pd.get_dummies(X, drop_first=True)

# Display the shape of the DataFrame after encoding
print("Shape of X before encoding:", X.shape)
print("Shape of X after encoding:", X_encoded.shape)

# Replace the original DataFrame with the encoded one
X = X_encoded

Shape of X before encoding: (65277, 16)
Shape of X after encoding: (65277, 240)


In [9]:
# Encode target variable using LabelEncoder
label_encoder = LabelEncoder()
y_numeric = label_encoder.fit_transform(y)

# Split features and target variable into training and test sets
X_train, X_test = train_test_split(
    X, test_size=0.2, random_state=42, stratify=y_numeric)

y_train, y_test = train_test_split(
    y_numeric, test_size=0.2, random_state=42, stratify=y_numeric)

In [10]:
# Initialize and train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = lr.predict(X_test)

# Round predictions and map back to original categories
y_pred_lr_rounded = np.round(y_pred_lr).astype(int)
y_pred_lr_categorized = label_encoder.inverse_transform(y_pred_lr_rounded)

# Evaluate and display model performance
print("\nLinear Regression Results:")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_lr):.4f}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_lr):.4f}")
print(f"R-Squared Score: {r2_score(y_test, y_pred_lr):.4f}")

# Display classification report
print("\nClassification Report:")
print(classification_report(label_encoder.inverse_transform(y_test), y_pred_lr_categorized))


Linear Regression Results:
Mean Absolute Error (MAE): 0.6801
Mean Squared Error (MSE): 0.6696
R-Squared Score: -0.0025

Classification Report:
              precision    recall  f1-score   support

           A       0.67      0.00      0.00      4380
           B       0.33      1.00      0.50      4335
           D       0.53      0.00      0.00      4341

    accuracy                           0.33     13056
   macro avg       0.51      0.33      0.17     13056
weighted avg       0.51      0.33      0.17     13056



In [11]:
# Initialize and train the LightGBM model
lgbm = LGBMClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=10, random_state=42
)
lgbm.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lgbm = lgbm.predict(X_test)

# Evaluate and display model performance
accuracy = accuracy_score(y_test, y_pred_lgbm)
print(f"LightGBM Accuracy: {accuracy:.4f}")

print("\nLightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgbm))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007561 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2237
[LightGBM] [Info] Number of data points in the train set: 52221, number of used features: 168
[LightGBM] [Info] Start training from score -1.092142
[LightGBM] [Info] Start training from score -1.102469
[LightGBM] [Info] Start training from score -1.101258
LightGBM Accuracy: 0.4630

LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.49      0.47      4380
           1       0.46      0.46      0.46      4335
           2       0.47      0.45      0.46      4341

    accuracy                           0.46     13056
   macro avg       0.46      0.46      0.46     13056
weighted avg       0.46      0.46      0.46     13056



In [12]:
# Initialize and train the XGBoost model
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

xgb = XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=10, random_state=42
)
xgb.fit(X_train, y_train_encoded)

# Make predictions and convert back to original labels
y_pred_xgb = xgb.predict(X_test)
y_pred_xgb_labels = label_encoder.inverse_transform(y_pred_xgb)

# Evaluate and display model performance
accuracy = accuracy_score(y_test, y_pred_xgb_labels)
print(f"XGBoost Accuracy: {accuracy:.4f}")

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb_labels))

XGBoost Accuracy: 0.5335

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.54      0.53      4380
           1       0.53      0.52      0.53      4335
           2       0.53      0.54      0.54      4341

    accuracy                           0.53     13056
   macro avg       0.53      0.53      0.53     13056
weighted avg       0.53      0.53      0.53     13056



In [13]:
# Initialize and train the Random Forest model
rf = RandomForestClassifier(
    n_estimators=300, random_state=42
)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf.predict(X_test)

# Evaluate and display model performance
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"\nRandom Forest Accuracy: {accuracy:.4f}")

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.6161

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.63      0.62      4380
           1       0.62      0.60      0.61      4335
           2       0.62      0.62      0.62      4341

    accuracy                           0.62     13056
   macro avg       0.62      0.62      0.62     13056
weighted avg       0.62      0.62      0.62     13056



In [14]:
# Initialize and train the Optimized Random Forest model
rf_optimized = RandomForestClassifier(
    n_estimators=300, max_depth=15, min_samples_split=5, random_state=42
)
rf_optimized.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf_opt = rf_optimized.predict(X_test)

# Evaluate and display model performance
accuracy = accuracy_score(y_test, y_pred_rf_opt)
print(f"Optimized Random Forest Accuracy: {accuracy:.4f}")

print("\nOptimized Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf_opt))

Optimized Random Forest Accuracy: 0.4611

Optimized Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.57      0.49      4380
           1       0.48      0.41      0.44      4335
           2       0.49      0.41      0.44      4341

    accuracy                           0.46     13056
   macro avg       0.47      0.46      0.46     13056
weighted avg       0.47      0.46      0.46     13056



In [15]:
# Perform Feature Importance Analysis
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Display the top 10 most important features
print("Top 10 Features by Importance:")
print(feature_importance_df.head(10))

Top 10 Features by Importance:
                          Feature  Importance
1                            long    0.094528
0                             lat    0.093926
3                           price    0.091898
4                     service fee    0.087338
7               reviews per month    0.082806
9                availability 365    0.079099
6               number of reviews    0.075178
2               Construction year    0.074044
5                  minimum nights    0.055332
8  calculated host listings count    0.039569


In [16]:
# Select most important features based on threshold
important_features = feature_importance_df.loc[
    feature_importance_df['Importance'] >= 0.02, 'Feature'
].tolist()

# Create a new DataFrame with the selected features
X_optimized = X[important_features]

In [17]:
# Retrain the model using the selected features
X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(
    X_optimized, y, test_size=0.2, random_state=42, stratify=y
)

rf.fit(X_train_opt, y_train_opt)

# Make predictions on the test set with the optimized feature set
y_pred_opt = rf.predict(X_test_opt)

# Evaluate and display model performance
accuracy = accuracy_score(y_test_opt, y_pred_opt)
print(f"\nNew Optimized Feature Set Accuracy: {accuracy:.4f}")

print("\nOptimized Feature Set Classification Report:")
print(classification_report(y_test_opt, y_pred_opt))


New Optimized Feature Set Accuracy: 0.6500

Optimized Feature Set Classification Report:
              precision    recall  f1-score   support

           A       0.64      0.66      0.65      4380
           B       0.65      0.64      0.64      4335
           D       0.66      0.65      0.66      4341

    accuracy                           0.65     13056
   macro avg       0.65      0.65      0.65     13056
weighted avg       0.65      0.65      0.65     13056



In [18]:
# Apply SMOTE to handle class imbalance in the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [19]:
# Initialize and train the Random Forest model on balanced data
rf_smote = RandomForestClassifier(
    n_estimators=100, random_state=42
)
rf_smote.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_smote = rf_smote.predict(X_test)

# Evaluate and display model performance
accuracy = accuracy_score(y_test, y_pred_smote)
print(f"\nBalanced Data Accuracy: {accuracy:.4f}")

print("\nBalanced Data Classification Report:")
print(classification_report(y_test, y_pred_smote))


Balanced Data Accuracy: 0.6096

Balanced Data Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.63      0.62      4380
           1       0.61      0.59      0.60      4335
           2       0.62      0.60      0.61      4341

    accuracy                           0.61     13056
   macro avg       0.61      0.61      0.61     13056
weighted avg       0.61      0.61      0.61     13056



In [20]:
# List of model names and their corresponding calculated accuracy scores
model_names = [
    'Linear Regression', 
    'LightGBM', 
    'XGBoost', 
    'Random Forest', 
    'Optimized Random Forest', 
    'Balanced Random Forest', 
    'Optimized Feature Set'
]

# Corresponding pre-calculated accuracy scores
accuracies = [
    accuracy_score(label_encoder.inverse_transform(y_test), y_pred_lr_categorized),
    accuracy_score(y_test, y_pred_lgbm),
    accuracy_score(y_test, y_pred_xgb_labels),
    accuracy_score(y_test, y_pred_rf),
    accuracy_score(y_test, y_pred_rf_opt),
    accuracy_score(y_test, y_pred_smote),
    accuracy_score(y_test_opt, y_pred_opt)
]

In [21]:
# Combine names and accuracies into a DataFrame
comparison_df = pd.DataFrame({'Model': model_names, 'Accuracy': accuracies})

# Sort models by accuracy in descending order
comparison_df = comparison_df.sort_values(by='Accuracy', ascending=False)

# Display the sorted comparison table
print("\nModel Performance Comparison (sorted by accuracy):")
print(comparison_df.to_string(index=False))


Model Performance Comparison (sorted by accuracy):
                  Model  Accuracy
  Optimized Feature Set  0.650046
          Random Forest  0.616115
 Balanced Random Forest  0.609605
                XGBoost  0.533471
               LightGBM  0.463006
Optimized Random Forest  0.461091
      Linear Regression  0.000000
