In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the pre-cleaned dataset from the project's data directory (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='./data/cleaned_df.csv')

In [3]:
# Keep only the predictor columns used for modeling, plus the target column `price`
df_model = df[[
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'minimum_nights',
    'availability_365',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'price',
]]

In [4]:
# One-hot encode the non-numeric columns; numeric columns pass through unchanged.
# NOTE(review): presumably only neighbourhood_group and room_type are encoded -- confirm dtypes.
df_model = pd.get_dummies(data=df_model)

In [7]:
# Separate the target (price) from the predictors, then hold out 20% of the rows for testing
y = df_model['price']
X = df_model.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [9]:
# Fit an ordinary least-squares baseline on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true test prices
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 38885.049155693254
R-squared: 0.12100818229695798


The MSE value of 38885.05 (equivalent to an RMSE of roughly 197 in the units of `price`) indicates that the average squared difference between the predicted and actual prices is quite high. Additionally, the R-squared value of 0.121 indicates that the model explains only about 12.1% of the variance in price on the test set.

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest of 100 trees on the training split; the seed keeps the run reproducible
# (fit() returns the estimator itself, so construction and training can be chained)
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true test prices
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 38624.62924541194
R-squared: 0.12689494276851387


With an MSE of 38624.63 (an RMSE of about 196.5) and an R-squared value of 0.1269, the random forest performs only marginally better than the linear regression (MSE 38885.05, R-squared 0.121). The MSE indicates that the average squared difference between the predicted and actual prices is still high, and the R-squared value means that only around 12.7% of the variance in the price is explained by the model.

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient boosting regressor with 100 boosting stages on the training split; the seed keeps the run reproducible
# (fit() returns the estimator itself, so construction and training can be chained)
model = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true test prices
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 37029.26981247495
R-squared: 0.16295784916300549


The Mean Squared Error (MSE) of 37029.27 (an RMSE of about 192.4) and R-squared value of 0.1630 show that the gradient boosting model performs better on the test set than the linear regression and random forest models above. The gradient boosting algorithm captures more of the variance in the target variable and makes somewhat better predictions, although it still explains only about 16.3% of that variance.

In [12]:
from xgboost import XGBRegressor

# Fit an XGBoost regressor on the training split, leaving every hyperparameter except the seed at its default
# (fit() returns the estimator itself, so construction and training can be chained)
model = XGBRegressor(random_state=42).fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true test prices
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 44890.16611533353
R-squared: -0.014736757891724794


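The Mean Squared Error (MSE) of 44890.17 (an RMSE of about 211.9) and R-squared value of -0.0147 indicate that the XGBoost model performs worst of the four models tried. A negative R-squared means that its predictions on the test set have a larger squared error than simply predicting the mean test price for every row, so with its default hyperparameters the model is not able to explain any of the variance in the target variable.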