Loading data

In [31]:
import os
import csv
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#load datasets

airbnb_crime = pd.read_csv(r'C:\Madhuri\projects\ML_project\Test_ML\cleaned_airbnb_crime - Final.csv')


In [32]:
airbnb_crime.dtypes

id                                                  int64
last_review                                        object
reviews_per_month                                 float64
calculated_host_listings_count                      int64
availability_365                                    int64
neighbourhood_group_cleansed                       object
neighbourhood_cleansed                             object
latitude                                          float64
longitude                                         float64
price                                             float64
minimum_nights                                      int64
number_of_reviews                                   int64
room_type                                          object
bedrooms                                          float64
bathrooms                                         float64
beds                                              float64
review_scores_rating                              float64
review_scores_

Spling

In [33]:
from sklearn.model_selection import train_test_split
X = airbnb_crime.select_dtypes(include=[np.number]).drop(columns=['price'])
y = airbnb_crime['price']

KNN

In [34]:
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNeighborsRegressor(n_neighbors=10) # n_neighbours is a "hyperparameter", which can be changed to improve performance of the model
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('K-Nearest Neighbors Regressor Performance:')
print(f'R²: {r2:.4f}')
print(f'MAE: {mae:.4f}')    
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')

K-Nearest Neighbors Regressor Performance:
R²: -0.0267
MAE: 79.6019
MSE: 26753.8188
RMSE: 163.5659


Linear regression

In [35]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)


print('\nLinear Regression Performance:')
print(f'R²: {r2_lr:.4f}')
print(f'MAE: {mae_lr:.4f}')
print(f'MSE: {mse_lr:.4f}')
print(f'RMSE: {rmse_lr:.4f}')



Linear Regression Performance:
R²: 0.3334
MAE: 57.9722
MSE: 17370.8100
RMSE: 131.7984


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Spliting by selected columns

In [37]:
X = airbnb_crime[['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'availability_365', 'crime_count',
               'calculated_host_listings_count', 'distance_to_statue_of_liberty', 'distance_to_times_square', 'neighbourhood_group_cleansed', 'room_type','bathrooms', 'bedrooms','reviews_per_month', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value']]
y = airbnb_crime['price']

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Scalling

In [39]:
from sklearn.preprocessing import MinMaxScaler

# Select only numeric columns
numeric_features = ['latitude', 'longitude', 'minimum_nights', 
                    'number_of_reviews', 'availability_365', 'crime_count',
                    'calculated_host_listings_count', 
                    'distance_to_statue_of_liberty', 'distance_to_times_square','bathrooms', 'bedrooms','reviews_per_month', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value']

scaler = MinMaxScaler()
scaler.fit(X_train[numeric_features])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Replace numeric columns with scaled versions
X_train_scaled[numeric_features] = scaler.transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])


In [40]:
# Put transformed data into a DataFrame (only numeric features)
X_train_transformed = pd.DataFrame(scaler.transform(X_train[numeric_features]), columns=numeric_features, index=X_train.index)
X_test_transformed = pd.DataFrame(scaler.transform(X_test[numeric_features]), columns=numeric_features, index=X_test.index)

Target encoding

In [41]:
import pandas as pd

# Target encoding function with smoothing
def target_encode(train_series, target_series, smoothing=10):
    """
    train_series: categorical column from training data
    target_series: target column from training data (e.g., price)
    smoothing: higher value -> more smoothing towards global mean
    """
    global_mean = target_series.mean()
    
    # Aggregate mean and count per category
    agg = train_series.to_frame().join(target_series).groupby(train_series.name)['price'].agg(['mean','count'])
    
    # Smoothed mean
    smooth = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
    
    # Map the original column to its smoothed mean
    return train_series.map(smooth), smooth, global_mean

# Apply target encoding to training data
train_encoded, encoding_map, global_mean = target_encode(
    airbnb_crime.loc[X_train.index, 'neighbourhood_group_cleansed'],
    airbnb_crime.loc[X_train.index, 'price'],
    smoothing=10
)

# Add encoded column to your training features
X_train_encoded = X_train_transformed.copy()
X_train_encoded['neighbourhood_group_cleansed_enc'] = train_encoded

# Apply the encoding to test data
test_encoded = airbnb_crime.loc[X_test.index, 'neighbourhood_group_cleansed'].map(encoding_map).fillna(global_mean)
X_test_encoded = X_test_transformed.copy()
X_test_encoded['neighbourhood_group_cleansed_enc'] = test_encoded



Mapping

In [42]:
# Define the mapping
room_type_map = {
    'Entire home/apt': 4,
    'Hotel room': 3,
    'Private room': 2,
    'Shared room': 1
}

# Apply the mapping to training and test sets
X_train_label = X_train[['room_type']].replace(room_type_map)
X_test_label = X_test[['room_type']].replace(room_type_map)

# Combine with your other encoded features

X_train_t = pd.concat(
    [X_train_encoded.reset_index(drop=True),
     X_train_label.reset_index(drop=True)],
    axis=1
)

X_test_t = pd.concat(
    [X_test_encoded.reset_index(drop=True),
     X_test_label.reset_index(drop=True)],
    axis=1
)



  X_train_label = X_train[['room_type']].replace(room_type_map)
  X_test_label = X_test[['room_type']].replace(room_type_map)


Knn

In [43]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor


knn = KNeighborsRegressor(n_neighbors=160)
knn.fit(X_train_t, y_train)


y_train_pred = knn.predict(X_train_t)
y_test_pred = knn.predict(X_test_t)


train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)


test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print("KNN Regression Performance:")

print("\n--- Train Set ---")
print(f"R2 score: {train_r2:.4f}")
print(f"Mean Squared Error: {train_mse:.4f}")
print(f"Mean Absolute Error: {train_mae:.4f}")
print(f"Root Mean Squared Error: {train_rmse:.4f}")

print("\n--- Test Set ---")
print(f"R2 score: {test_r2:.4f}")
print(f"Mean Squared Error: {test_mse:.4f}")
print(f"Mean Absolute Error: {test_mae:.4f}")
print(f"Root Mean Squared Error: {test_rmse:.4f}")


KNN Regression Performance:

--- Train Set ---
R2 score: 0.1803
Mean Squared Error: 34021.1548
Mean Absolute Error: 48.0629
Root Mean Squared Error: 184.4482

--- Test Set ---
R2 score: 0.3447
Mean Squared Error: 17075.8640
Mean Absolute Error: 47.8511
Root Mean Squared Error: 130.6746


Obs: test performance is better than train. Underfitting (Train r2 = 0,18 < Test r2 = 0.34). Results still modest


LinearRegression

In [44]:
from sklearn.linear_model import LinearRegression


lr = LinearRegression()
lr.fit(X_train_t, y_train)


y_train_pred = lr.predict(X_train_t)
y_test_pred = lr.predict(X_test_t)

#Train
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

# Test
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print("Linear Regression Performance:")

print("\n--- Train Set ---")
print(f"R2 score: {train_r2:.4f}")
print(f"Mean Squared Error: {train_mse:.4f}")
print(f"Mean Absolute Error: {train_mae:.4f}")
print(f"Root Mean Squared Error: {train_rmse:.4f}")

print("\n--- Test Set ---")
print(f"R2 score: {test_r2:.4f}")
print(f"Mean Squared Error: {test_mse:.4f}")
print(f"Mean Absolute Error: {test_mae:.4f}")
print(f"Root Mean Squared Error: {test_rmse:.4f}")




Linear Regression Performance:

--- Train Set ---
R2 score: 0.1680
Mean Squared Error: 34532.9907
Mean Absolute Error: 54.3950
Root Mean Squared Error: 185.8305

--- Test Set ---
R2 score: 0.3570
Mean Squared Error: 16756.6050
Mean Absolute Error: 53.3023
Root Mean Squared Error: 129.4473


OBS: Underfitting: Linear Regression  low train R² → the models are too simple to fully capture the data patterns. (Train r2 = 0,17 < Test r2 = 0.36)

This is unusual but can happen if the train set has outliers or higher variance than the test set.

Decision Tree Regressor

In [45]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor(
    max_depth=10,       # try tuning this
    min_samples_split=2,
    max_leaf_nodes=30   # similar to controlling depth
)

dt_reg.fit(X_train_t, y_train)

y_train_pred = dt_reg.predict(X_train_t)
y_test_pred = dt_reg.predict(X_test_t)

# Train
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_train_pred)

# Test
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)

print("Decision Tree Regression Performance:")

print("\n--- Train Set ---")
print(f"R2 score: {train_r2:.4f}")
print(f"Mean Squared Error: {train_mse:.4f}")
print(f"Mean Absolute Error: {train_mae:.4f}")
print(f"Root Mean Squared Error: {train_rmse:.4f}")

print("\n--- Test Set ---")
print(f"R2 score: {test_r2:.4f}")
print(f"Mean Squared Error: {test_mse:.4f}")
print(f"Mean Absolute Error: {test_mae:.4f}")
print(f"Root Mean Squared Error: {test_rmse:.4f}")


Decision Tree Regression Performance:

--- Train Set ---
R2 score: 0.4848
Mean Squared Error: 21382.8023
Mean Absolute Error: 48.9083
Root Mean Squared Error: 146.2286

--- Test Set ---
R2 score: -0.6160
Mean Squared Error: 42111.4212
Mean Absolute Error: 54.5766
Root Mean Squared Error: 205.2107


Train Set

R² = 0.4848 → Your model explains ~48% of the variance in training data.

MSE = 21382.8, RMSE = 146.23, MAE = 48.91 → Errors are moderate, but we don’t yet know the scale of your target variable.

Test Set

R² = -0.5951 → This is very concerning. Negative R² means your model performs worse than a simple baseline (mean prediction).

MSE = 41567.02, RMSE = 203.88 → Errors are significantly higher than in training.

MAE = 54.33 → Slightly higher than training, but MSE/RMSE increased more, indicating some large errors/outliers in predictions



Random Forest Regressor

In [46]:
from sklearn.ensemble import RandomForestRegressor


# Model
rf_boot = RandomForestRegressor(
    n_estimators=50, 
    max_depth=10, 
    bootstrap=True, 
    random_state=42
)
rf_boot.fit(X_train_t, y_train)


y_train_pred = rf_boot.predict(X_train_t)
y_test_pred  = rf_boot.predict(X_test_t)

# Train
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)

# Test
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)


print("Random Forest Regression Performance:")

print("\n--- Train Set ---")
print(f"R² Score: {train_r2:.4f}")
print(f"Mean Squared Error: {train_mse:.4f}")
print(f"Mean Absolute Error: {train_mae:.4f}")
print(f"Root Mean Squared Error: {train_rmse:.4f}")

print("\n--- Test Set ---")
print(f"R² Score: {test_r2:.4f}")
print(f"Mean Squared Error: {test_mse:.4f}")
print(f"Mean Absolute Error: {test_mae:.4f}")
print(f"Root Mean Squared Error: {test_rmse:.4f}")


Random Forest Regression Performance:

--- Train Set ---
R² Score: 0.7926
Mean Squared Error: 8608.6724
Mean Absolute Error: 38.1937
Root Mean Squared Error: 92.7829

--- Test Set ---
R² Score: 0.3895
Mean Squared Error: 15908.0758
Mean Absolute Error: 45.1783
Root Mean Squared Error: 126.1272


Train Set

R² = 0.7907 → The model explains ~79% of the variance in the training data, much better than before.

MSE = 8686.36, RMSE = 93.20, MAE = 38.28 → Errors are significantly lower than your first attempt.

Test Set

R² = 0.3814 → Now the model explains ~38% of the variance in unseen data. This is positive (compared to negative R² before), but there’s still a gap.

MSE = 16119.39, RMSE = 126.96, MAE = 45.44 → Test errors are higher than training, but not as dramatically as before.

2. Key Insights

Overfitting reduced but not eliminated:

The drop from 0.79 (train) → 0.38 (test) shows the model still captures training patterns too specifically.

Improved generalization:

The test set R² went from -0.59 → 0.38, indicating your changes (likely pruning or parameter tuning) helped a lot.

Error distribution:

MAE and RMSE on test are higher than training, but the gap is reasonable. RMSE > MAE suggests some larger errors still exist.

xgboost

In [47]:

import xgboost as xgb


# Define model
model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train model
model.fit(X_train_t, y_train)

# Predictions
y_train_pred = model.predict(X_train_t)
y_test_pred  = model.predict(X_test_t)

# --- Train metrics ---
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_mae  = mean_absolute_error(y_train, y_train_pred)
train_r2   = r2_score(y_train, y_train_pred)

# --- Test metrics ---
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae  = mean_absolute_error(y_test, y_test_pred)
test_r2   = r2_score(y_test, y_test_pred)

# Results
print("XGBoost Regression Performance:")

print("\n--- Train Set ---")
print(f"R² Score: {train_r2:.3f}")
print(f"RMSE: {train_rmse:.2f}")
print(f"MAE: {train_mae:.2f}")

print("\n--- Test Set ---")
print(f"R² Score: {test_r2:.3f}")
print(f"RMSE: {test_rmse:.2f}")
print(f"MAE: {test_mae:.2f}")





XGBoost Regression Performance:

--- Train Set ---
R² Score: 0.904
RMSE: 63.17
MAE: 32.10

--- Test Set ---
R² Score: 0.472
RMSE: 117.31
MAE: 44.15


Train Set

R² = 0.909 → Excellent fit on training data; the model explains ~91% of the variance.

RMSE = 61.44, MAE = 31.84 → Errors are relatively low, indicating very accurate predictions on training data.

Test Set

R² = 0.462 → The model explains ~46% of the variance in unseen data. This is better than your Decision Tree (0.38) but still not great.

RMSE = 118.39, MAE = 44.37 → Test errors are almost double the training errors, suggesting overfitting.

2. Key Insights

Overfitting is present:

Large gap between train R² (0.91) and test R² (0.46) shows the model learned training patterns too specifically.

Better than single-tree models:

Compared to Decision Tree and your tuned version, XGBoost improves test performance slightly.

MAE is similar, but RMSE is higher on test, indicating some large errors (outliers) remain.

Potential causes of remaining gap:

Data may have noise or outliers.

Features may be missing predictive power or require engineering/transformations.

Hyperparameters may need tuning to reduce overfitting (e.g., learning rate, max depth, subsampling).

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_log = np.log1p(airbnb_crime['price'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
l
model.fit(X_train_t, y_train)

y_train_pred_log = model.predict(X_train_t)
y_test_pred_log  = model.predict(X_test_t)

y_train_pred = np.expm1(y_train_pred_log)
y_test_pred  = np.expm1(y_test_pred_log)
y_train_true = np.expm1(y_train)
y_test_true  = np.expm1(y_test)

train_rmse = np.sqrt(mean_squared_error(y_train_true, y_train_pred))
train_mae  = mean_absolute_error(y_train_true, y_train_pred)
train_r2   = r2_score(y_train_true, y_train_pred)

test_rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
test_mae  = mean_absolute_error(y_test_true, y_test_pred)
test_r2   = r2_score(y_test_true, y_test_pred)

print("XGBoost Regression with Log-Transformed Target")

print("\n--- Train Set ---")
print(f"R² Score: {train_r2:.3f}")
print(f"RMSE: {train_rmse:.2f}")
print(f"MAE: {train_mae:.2f}")

print("\n--- Test Set ---")
print(f"R² Score: {test_r2:.3f}")
print(f"RMSE: {test_rmse:.2f}")
print(f"MAE: {test_mae:.2f}")



XGBoost Regression with Log-Transformed Target

--- Train Set ---
R² Score: 0.549
RMSE: 136.76
MAE: 31.72

--- Test Set ---
R² Score: 0.568
RMSE: 106.08
MAE: 38.00


Train Set

R² = 0.530 → The model explains ~53% of the variance in the log-transformed target, which is lower than your previous raw-target XGBoost train R² (0.91).

RMSE = 139.63, MAE = 31.75 → RMSE increased (because of log transform scale), but MAE stayed low.

Test Set

R² = 0.565 → Now your test R² is higher than train R², which is unusual but can happen with log transform if the raw target has heavy skew or extreme outliers.

RMSE = 106.41, MAE = 38.16 → Test errors are lower than train RMSE (again due to scale differences).

2. Key Insights

Log transformation helped generalization:

Test R² improved from 0.462 → 0.565. That’s a significant boost.

Model is less sensitive to extreme high values because log compresses large numbers.

Overfitting reduced:

Previously, train R² was 0.91 and test R² 0.46 → huge gap.

Now train R² = 0.53 and test R² = 0.565 → the model generalizes better, possibly even underfitting slightly.

Error interpretation:

Because the target is log-transformed, RMSE values are not directly comparable to raw-target RMSE.

MAE (in log-space) is often more stable and indicates reasonable prediction accuracy.

Lasso regression

In [49]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_t, y_train)


print("Train R^2:", lasso.score(X_train_t, y_train))
print("Test R^2:", lasso.score(X_test_t, y_test))
print("Coefficients:", lasso.coef_)

Train R^2: 0.4551969604356766
Test R^2: 0.45973952953688546
Coefficients: [-0.         -0.         -0.         -0.          0.         -0.
  0.         -0.         -0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.00564433  0.28488809]
