In [33]:
import os
import csv
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#load datasets

airbnb_crime = pd.read_csv('cleaned_airbnb_crime.csv')


In [34]:
airbnb_crime.dtypes

id                                                  int64
last_review                                        object
reviews_per_month                                 float64
calculated_host_listings_count                      int64
availability_365                                    int64
neighbourhood_group_cleansed                       object
neighbourhood_cleansed                             object
latitude                                          float64
longitude                                         float64
price                                             float64
minimum_nights                                      int64
number_of_reviews                                   int64
room_type                                          object
bedrooms                                          float64
bathrooms                                         float64
beds                                              float64
review_scores_rating                              float64
review_scores_

In [35]:
X = airbnb_crime[['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'availability_365', 'crime_count',
               'calculated_host_listings_count', 'distance_to_statue_of_liberty', 'distance_to_times_square', 'neighbourhood_group_cleansed', 'room_type','bathrooms', 'bedrooms','reviews_per_month', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value']]
y = airbnb_crime['price']

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
from sklearn.preprocessing import MinMaxScaler

# Select only numeric columns
numeric_features = ['latitude', 'longitude', 'minimum_nights', 
                    'number_of_reviews', 'availability_365', 'crime_count',
                    'calculated_host_listings_count', 
                    'distance_to_statue_of_liberty', 'distance_to_times_square','bathrooms', 'bedrooms','reviews_per_month', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value']

scaler = MinMaxScaler()
scaler.fit(X_train[numeric_features])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Replace numeric columns with scaled versions
X_train_scaled[numeric_features] = scaler.transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])


In [38]:
# Put transformed data into a DataFrame (only numeric features)
X_train_transformed = pd.DataFrame(scaler.transform(X_train[numeric_features]), columns=numeric_features, index=X_train.index)
X_test_transformed = pd.DataFrame(scaler.transform(X_test[numeric_features]), columns=numeric_features, index=X_test.index)

In [39]:
from sklearn.preprocessing import OneHotEncoder

oh_encoder = OneHotEncoder(drop='first')
oh_encoder.fit(airbnb_crime.loc[X_train.index, ['neighbourhood_group_cleansed']])

X_train_encoded = pd.DataFrame.sparse.from_spmatrix(
    oh_encoder.transform(airbnb_crime.loc[X_train.index, ['neighbourhood_group_cleansed']]),
    columns=oh_encoder.get_feature_names_out(),
    index=X_train.index
)
X_test_encoded = pd.DataFrame.sparse.from_spmatrix(
    oh_encoder.transform(airbnb_crime.loc[X_test.index, ['neighbourhood_group_cleansed']]),
    columns=oh_encoder.get_feature_names_out(),
    index=X_test.index
)

X_train_encoded = pd.concat([X_train_encoded, X_train_transformed], axis=1)
X_test_encoded = pd.concat([X_test_encoded, X_test_transformed], axis=1)

In [40]:
# Define the mapping
room_type_map = {
    'Entire home/apt': 4,
    'Hotel room': 3,
    'Private room': 2,
    'Shared room': 1
}

# Apply the mapping to training and test sets
X_train_label = X_train[['room_type']].replace(room_type_map)
X_test_label = X_test[['room_type']].replace(room_type_map)

# Combine with your other encoded features

X_train_t = pd.concat(
    [X_train_encoded.reset_index(drop=True),
     X_train_label.reset_index(drop=True)],
    axis=1
)

X_test_t = pd.concat(
    [X_test_encoded.reset_index(drop=True),
     X_test_label.reset_index(drop=True)],
    axis=1
)



  X_train_label = X_train[['room_type']].replace(room_type_map)
  X_test_label = X_test[['room_type']].replace(room_type_map)


In [41]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor


knn = KNeighborsRegressor(n_neighbors=160)
knn.fit(X_train_t, y_train)
y_pred = knn.predict(X_test_t)
knn.score(X_test_t, y_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("KNN Regression Results")
print("Score: {}".format(knn.score(X_test_t, y_test)))
print("Mean Squared Error: {}".format(mse))
print("Mean Absolute Error: {}".format(mae))
print("Root Mean Absolute Error: {}".format(rmse))
print("R2 score: {}".format(r2))



KNN Regression Results




Score: 0.3447345240481764
Mean Squared Error: 17075.58854057597
Mean Absolute Error: 47.85410683419759
Root Mean Absolute Error: 130.6735954222427
R2 score: 0.3447345240481764


In [42]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train_t, y_train)

y_pred_lr = lr.predict(X_test_t)

# Metrics
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)   # same as lr.score(X_test_t, y_test)

# Print results
print("Linear Regression Results")
print("Score: {}".format(lr.score(X_test_t, y_test)))
print("Mean Squared Error: {}".format(mse_lr))
print("Mean Absolute Error: {}".format(mae_lr)) 
print("Root Mean Squared Error: {}".format(rmse_lr))
print("R2 score: {}".format(r2_lr))



Linear Regression Results
Score: 0.36130604903851615
Mean Squared Error: 16643.75052589363
Mean Absolute Error: 52.798596741623456
Root Mean Squared Error: 129.01066051258567
R2 score: 0.36130604903851615




In [43]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor(
    max_depth=10, # 30, 50, 100 - we can have many more splits when we have continuous variables
    min_samples_split=2,
    max_leaf_nodes=30 # basically equivalent to max_depth but still considered
)


dt_reg.fit(X_train_t, y_train)

y_pred_dt = dt_reg.predict(X_test_t)

r2_dt = dt_reg.score(X_test_t, y_test)

mse_dt = mean_squared_error(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

print("Decision Tree Regression Results")
print("Score: {}".format(dt_reg.score(X_test_t, y_test)))
print("Mean Squared Error: {}".format(mse_dt))
print("Mean Absolute Error: {}".format(mae_dt)) 
print("Root Mean Absolute Error: {}".format(rmse_dt))
print("R2 score: {}".format(r2_dt))



Decision Tree Regression Results
Score: -0.5838378851783064
Mean Squared Error: 41273.293092384985
Mean Absolute Error: 54.315465468123804
Root Mean Absolute Error: 203.15829565239267
R2 score: -0.5838378851783064


In [44]:
from sklearn.ensemble import RandomForestRegressor

rf_boot = RandomForestRegressor(n_estimators=50, max_depth=10, bootstrap=True, random_state=42)
rf_boot.fit(X_train_t, y_train)

y_pred_rf = rf_boot.predict(X_test_t)

# Metrics
r2_rf = rf_boot.score(X_test_t, y_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

# Results
print("Random Forest Regression Results")
print(f"R² Score: {r2_rf:.4f}")
print(f"Mean Squared Error: {mse_rf:.4f}")
print(f"Mean Absolute Error: {mae_rf:.4f}")
print(f"Root Mean Squared Error: {rmse_rf:.4f}")




Random Forest Regression Results
R² Score: 0.3814
Mean Squared Error: 16119.3882
Mean Absolute Error: 45.4416
Root Mean Squared Error: 126.9622




In [45]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Drop columns not useful for prediction (IDs, text)
X = airbnb_crime.drop(columns=['id','price','last_review'])  # keep features
y = airbnb_crime['price']  # target variable

# Handle missing values (replace NaN with median)
X = X.fillna(X.median(numeric_only=True))

# Convert categorical variables to dummies (one-hot encoding)
X = pd.get_dummies(X, drop_first=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define model
model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("XGBoost Regression Results")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")




XGBoost Regression Results
RMSE: 114.24
R² Score: 0.499


In [46]:
# Transform price with log1p (handles zeros safely)
y_log = np.log1p(airbnb_crime['price'])

from sklearn.model_selection import train_test_split

X = airbnb_crime.drop(columns=['id','price','last_review'])
X = X.fillna(X.median(numeric_only=True))
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

import xgboost as xgb

model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

y_pred_log = model.predict(X_test)

# Convert predictions back to original scale
y_pred = np.expm1(y_pred_log)

# If you want to evaluate
from sklearn.metrics import mean_squared_error, r2_score

rmse = np.sqrt(mean_squared_error(np.expm1(y_test), y_pred))
r2 = r2_score(np.expm1(y_test), y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")



RMSE: 106.90
R² Score: 0.561
