In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor


In [2]:
#load datasets

airbnb_crime = pd.read_csv(r'C:\Madhuri\projects\ML_project\Test_ML\cleaned_airbnb_crime - Final.csv')

In [None]:

def target_encode(train_series, target_series, smoothing=10):
    global_mean = target_series.mean()
    agg = train_series.to_frame().join(target_series).groupby(train_series.name)[target_series.name].agg(['mean','count'])
    smooth = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
    return train_series.map(smooth), smooth, global_mean

y = np.log1p(airbnb_crime['price'])
X = airbnb_crime.drop(columns=['price'])

# Convert date columns to numeric (days since a reference date)
if 'last_review' in X.columns:
    X['last_review'] = pd.to_datetime(X['last_review'])
    reference_date = X['last_review'].max()
    X['days_since_last_review'] = (reference_date - X['last_review']).dt.days
    X = X.drop(columns=['last_review'])

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Target encode categorical columns
categorical_cols = ['neighbourhood_group_cleansed', 'neighbourhood_cleansed', 'room_type']  
for col in categorical_cols:
    if col in X_train.columns:
        train_encoded, encoding_map, global_mean = target_encode(X_train[col], y_train)
        X_train[col + '_enc'] = train_encoded
        X_test[col + '_enc'] = X_test[col].map(encoding_map).fillna(global_mean)

# Drop original categorical columns
X_train = X_train.drop(columns=[c for c in categorical_cols if c in X_train.columns])
X_test = X_test.drop(columns=[c for c in categorical_cols if c in X_test.columns])

# Keep only numeric columns
X_train = X_train.select_dtypes(include=[np.number])
X_test = X_test.select_dtypes(include=[np.number])

if 'neighbourhood_group_cleansed_enc' in X_train.columns and 'room_type_enc' in X_train.columns:
    X_train['neigh_room_interaction'] = X_train['neighbourhood_group_cleansed_enc'] * X_train['room_type_enc']
    X_test['neigh_room_interaction'] = X_test['neighbourhood_group_cleansed_enc'] * X_test['room_type_enc']

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

param_dist = {
    'n_estimators': [200, 400, 600],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=25,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)
best_model = search.best_estimator_

def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred_train = np.expm1(model.predict(X_train)) 
    y_pred_test = np.expm1(model.predict(X_test))
    
    y_train_actual = np.expm1(y_train)
    y_test_actual = np.expm1(y_test)
    
    print('--- Train Set ---')
    print('R² Score:', r2_score(y_train_actual, y_pred_train))
    print('RMSE:', np.sqrt(mean_squared_error(y_train_actual, y_pred_train)))
    print('MAE:', mean_absolute_error(y_train_actual, y_pred_train))
    
    print('\n--- Test Set ---')
    print('R² Score:', r2_score(y_test_actual, y_pred_test))
    print('RMSE:', np.sqrt(mean_squared_error(y_test_actual, y_pred_test)))
    print('MAE:', mean_absolute_error(y_test_actual, y_pred_test))

evaluate_model(best_model, X_train, y_train, X_test, y_test)



Fitting 3 folds for each of 25 candidates, totalling 75 fits
--- Train Set ---
R² Score: 0.49976648522357203
RMSE: 144.08929840055987
MAE: 31.632898851195815

--- Test Set ---
R² Score: 0.540241574867182
RMSE: 109.457127196927
MAE: 37.35639466505199


In [None]:
from sklearn.linear_model import LassoCV

# LassoCV will try multiple alpha values and pick the best one
lasso_cv = LassoCV(alphas=np.logspace(-4, 2, 50), 
                   cv=5,  
                   max_iter=10000,
                   random_state=42)

lasso_cv.fit(X_train, y_train)

print("Best alpha (regularization strength):", lasso_cv.alpha_)

# Predict using the tuned model
y_train_pred = lasso_cv.predict(X_train)
y_test_pred = lasso_cv.predict(X_test)

# Evaluation metrics
print("Train R2:", r2_score(y_train, y_train_pred))
print("Test R2:", r2_score(y_test, y_test_pred))

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Best alpha (regularization strength): 0.0001
Train R2: 0.6507107460250772
Test R2: 0.6718972998642503


  model = cd_fast.enet_coordinate_descent(


In [None]:
# This runs for 9 mins
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {'alpha': np.logspace(-4, 2, 50)}

lasso = Lasso(max_iter=10000, random_state=42)

grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best alpha:", grid_search.best_params_['alpha'])
print("Best CV score:", grid_search.best_score_)

# Use best model
best_lasso = grid_search.best_estimator_
y_test_pred = best_lasso.predict(X_test)

print("Test R2:", r2_score(y_test, y_test_pred))


Best alpha: 0.0001
Best CV score: 0.6494196703855215
Test R2: 0.6718972998642503


  model = cd_fast.enet_coordinate_descent(
