In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
df = pd.read_csv('../data/airbnb_cleaned.csv')


In [3]:
X = df.drop(columns=['price'])  # Drop 'price' since it's the target
y = df['price']  # Target variable


In [4]:
X = pd.get_dummies(X, drop_first=True)  # One-hot encoding categorical variables


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [7]:
y_pred = rf_model.predict(X_test)


In [8]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R² Score: {r2}")


MAE: 1.4973792829081496
MSE: 254.62799401263118
R² Score: 0.9976765984411429


In [9]:
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importance)


service fee                      9.975193e-01
lat                              2.998669e-04
long                             2.908681e-04
number of reviews                2.759900e-04
Construction year                2.411657e-04
                                     ...     
neighbourhood_Woodrow            1.981727e-11
neighbourhood_Rossville          7.448484e-12
neighbourhood_Fort Wadsworth     6.769255e-12
neighbourhood_Lighthouse Hill    5.993104e-12
neighbourhood_Glen Oaks          5.682394e-12
Length: 242, dtype: float64


In [11]:
# Removing "service fee" to test again the accuracy/efficiency of the model
df_cln = df
X = df_cln.drop(columns=['price'])  # 'price' is the target variable
X = df_cln.drop(columns=['price', 'service fee'])  # Removing 'service fee' as a predictor


In [13]:
print(X_train.select_dtypes(include=['object']).head())


      host_identity_verified neighbourhood group       neighbourhood  \
74288            unconfirmed           Manhattan             Midtown   
88555            unconfirmed            Brooklyn            Bushwick   
20675               verified            Brooklyn          Park Slope   
10881            unconfirmed            Brooklyn  Bedford-Stuyvesant   
93821            unconfirmed            Brooklyn  Bedford-Stuyvesant   

      cancellation_policy        room type  
74288            moderate     Private room  
88555            flexible     Private room  
20675            moderate  Entire home/apt  
10881              strict     Private room  
93821            moderate     Private room  


In [14]:
print(X_train.apply(lambda col: col.astype(str).str.contains('unconfirmed', na=False).sum()))


host_identity_verified            40296
neighbourhood group                   0
neighbourhood                         0
lat                                   0
long                                  0
instant_bookable                      0
cancellation_policy                   0
room type                             0
Construction year                     0
minimum nights                        0
number of reviews                     0
reviews per month                     0
review rate number                    0
calculated host listings count        0
availability 365                      0
dtype: int64


In [16]:
X_train["host_identity_verified"] = X_train["host_identity_verified"].map({"verified": 1, "unconfirmed": 0})
X_test["host_identity_verified"] = X_test["host_identity_verified"].map({"verified": 1, "unconfirmed": 0})


In [21]:
X_train["host_identity_verified"] = X_train["host_identity_verified"].map({"verified": 1, "unconfirmed": 0}).fillna(0)
X_test["host_identity_verified"] = X_test["host_identity_verified"].map({"verified": 1, "unconfirmed": 0}).fillna(0)
print(X_train.dtypes)


host_identity_verified            float64
neighbourhood group                object
neighbourhood                      object
lat                               float64
long                              float64
instant_bookable                     bool
cancellation_policy                object
room type                          object
Construction year                   int64
minimum nights                      int64
number of reviews                   int64
reviews per month                 float64
review rate number                float64
calculated host listings count      int64
availability 365                    int64
dtype: object


In [24]:
X_train.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 80435 entries, 74288 to 15795
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   host_identity_verified          80435 non-null  object 
 1   neighbourhood group             80435 non-null  object 
 2   neighbourhood                   80435 non-null  object 
 3   lat                             80435 non-null  float64
 4   long                            80435 non-null  float64
 5   instant_bookable                80435 non-null  bool   
 6   cancellation_policy             80435 non-null  object 
 7   room type                       80435 non-null  object 
 8   Construction year               80435 non-null  int64  
 9   minimum nights                  80435 non-null  int64  
 10  number of reviews               80435 non-null  int64  
 11  reviews per month               80435 non-null  float64
 12  review rate number              8

In [25]:
X_train = pd.get_dummies(X_train, drop_first=True)

In [26]:
# Define target variable (y) and features (X)
y = df_cln['price']
X = df_cln.drop(columns=['price', 'service fee'])  # Removed 'service fee'

# Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and Train the Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R² Score: {r2}")

ValueError: could not convert string to float: 'unconfirmed'

In [18]:
# Feature Importance Analysis
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)

print("\nFeature Importances:")
print(importances)

AttributeError: 'RandomForestRegressor' object has no attribute 'estimators_'