In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
from geopy.distance import geodesic
import seaborn as sns
from sklearn.neighbors import BallTree
from collections import Counter
import ast
import math

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

In [3]:
df=pd.read_parquet('/Users/mjymjy007/Documents/GitHub/Fall-2025-Team-Big-Data/data/nyc/nyc_final_modeling_dataset.parquet')

In [4]:
room_type_dummies=pd.get_dummies(df['room_type'])
df=pd.concat([df,room_type_dummies],axis=1)
min_count = 100
freq = df['property_type'].value_counts()
valid_types = freq[freq >= min_count].index
print("Keeping property_type types:", list(valid_types))

# Step 3: Filter the dataframe
df = df[df['property_type'].isin(valid_types)].copy()

# Step 4: Recreate dummies only for valid room types
property_type_dummies = pd.get_dummies(df['property_type'])
df = pd.concat([df, property_type_dummies], axis=1)

bathrooms_type_dummies = pd.get_dummies(df['bathrooms_type'])
df = pd.concat([df, bathrooms_type_dummies], axis=1)

Keeping property_type types: ['Entire rental unit', 'Private room in rental unit', 'Private room in home', 'Entire home', 'Private room in townhouse', 'Room in hotel', 'Entire condo', 'Entire guest suite', 'Private room in condo', 'Entire townhouse', 'Room in boutique hotel', 'Entire loft', 'Private room in guest suite', 'Entire serviced apartment', 'Room in aparthotel', 'Private room in loft', 'Shared room in rental unit', 'Private room in bed and breakfast', 'Private room in casa particular', 'Entire place', 'Entire guesthouse', 'Private room in hostel', 'Private room in serviced apartment', 'Private room in guesthouse', 'Private room']


In [5]:
df = df.dropna(subset=['target_price'])
df = df[df['target_price'] > 0]

In [6]:
lower = df['target_price'].quantile(0.05)
upper = df['target_price'].quantile(0.95)

# Keep only rows within [5th, 95th] percentile
df = df[(df['target_price'] >= lower) & (df['target_price'] <= upper)].copy()

In [7]:
df['price_per_person'] = df['target_price'] / df['accommodates']

In [8]:
airbnb_coords = np.radians(df[['latitude', 'longitude']].values)
stops=pd.read_csv('/Users/mjymjy007/Documents/GitHub/Fall-2025-Team-Big-Data/main/stops.txt')
stop_coords   = np.radians(stops[['stop_lat', 'stop_lon']].values)

# Build BallTree on stops
tree = BallTree(stop_coords, metric='haversine')

# Query nearest stop for each Airbnb
dist, ind = tree.query(airbnb_coords, k=1)

# Convert from radians to km (Earth radius ≈ 6371 km)
df['public_distance_km'] = dist.flatten() * 6371

In [9]:

df_train, df_test = train_test_split(df,test_size=.2,random_state=42)

In [10]:
features_group1=['bedrooms', 'beds','bathrooms_numeric', 'private','shared','Entire rental unit', 'Private room in rental unit', 'Private room in home', 'Entire home', 'Private room in townhouse', 'Room in hotel', 'Entire condo', 'Entire guest suite', 'Private room in condo', 'Entire townhouse', 'Room in boutique hotel', 'Entire loft', 'Private room in guest suite', 'Entire serviced apartment', 'Room in aparthotel', 'Private room in loft', 'Shared room in rental unit', 'Private room in bed and breakfast', 'Private room in casa particular', 'Entire place', 'Entire guesthouse', 'Private room in hostel', 'Private room in serviced apartment', 'Private room in guesthouse', 'Private room','accommodates','Entire home/apt', 'Hotel room', 'Private room', 'Shared room','estimated_occupancy_rate']
features_group2=['public_distance_km','estimated_occupancy_rate']
features_group3=['review_scores_rating', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication','review_scores_location', 'review_scores_value','host_is_superhost', 'host_response_rate', 'host_acceptance_rate', 'host_identity_verified', 'instant_bookable','estimated_occupancy_rate']

In [11]:
feature_groups = {
    "Group1": features_group1,
    "Group2": features_group2,
    "Group3": features_group3,
    "All": list(set(features_group1 + features_group2 + features_group3))
}

In [None]:
y_train = df_train['target_price']  # or 'price_per_person'
y_test = df_test['target_price']

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# feature_groups is a dictionary: {'group1': features1, ...}
for name, features in feature_groups.items():
    print(f"\n===== Using features: {name} =====")
    X_train = df_train[features]
    X_test = df_test[features]
    
    # --- Random Forest ---
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring='r2')
    print(f"RandomForest -> CV R²: {rf_scores.mean():.4f} ± {rf_scores.std():.4f}")
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    print(f"RandomForest -> Test R²: {r2_score(y_test, y_pred_rf):.4f}")
    print(f"RandomForest -> Test MSE: {mean_squared_error(y_test, y_pred_rf):.2f}")
    
    # --- LassoCV ---
    lasso_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso', LassoCV(alphas=np.logspace(-3, 3, 13), cv=cv, random_state=42))
    ])
    lasso_scores = cross_val_score(lasso_pipeline, X_train, y_train, cv=cv, scoring='r2')
    print(f"LassoCV      -> CV R²: {lasso_scores.mean():.4f} ± {lasso_scores.std():.4f}")
    lasso_pipeline.fit(X_train, y_train)
    y_pred_lasso = lasso_pipeline.predict(X_test)
    print(f"LassoCV      -> Test R²: {r2_score(y_test, y_pred_lasso):.4f}")
    print(f"LassoCV      -> Test MSE: {mean_squared_error(y_test, y_pred_lasso):.2f}")
    
    # --- RidgeCV ---
    ridge_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('ridgecv', RidgeCV(alphas=np.logspace(-3, 3, 13), cv=cv))
    ])
    ridge_scores = cross_val_score(ridge_pipeline, X_train, y_train, cv=cv, scoring='r2')
    print(f"RidgeCV      -> CV R²: {ridge_scores.mean():.4f} ± {ridge_scores.std():.4f}")
    ridge_pipeline.fit(X_train, y_train)
    y_pred_ridge = ridge_pipeline.predict(X_test)
    print(f"RidgeCV      -> Test R²: {r2_score(y_test, y_pred_ridge):.4f}")
    print(f"RidgeCV      -> Test MSE: {mean_squared_error(y_test, y_pred_ridge):.2f}")
  


===== Using features: Group1 =====
RandomForest -> CV R²: 0.4761 ± 0.0051
RandomForest -> Test R²: 0.4817
RandomForest -> Test MSE: 4519.53


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gra

LassoCV      -> CV R²: 0.3885 ± 0.0035


  model = cd_fast.enet_coordinate_descent(


LassoCV      -> Test R²: 0.3849
LassoCV      -> Test MSE: 5363.06
RidgeCV      -> CV R²: 0.3885 ± 0.0035
RidgeCV      -> Test R²: 0.3849
RidgeCV      -> Test MSE: 5363.49
KNN -> Best k: 20, CV R²: 0.4555
KNN -> Test R²: 0.4574
KNN -> Test MSE: 4731.58

===== Using features: Group2 =====
RandomForest -> CV R²: 0.4542 ± 0.0096
RandomForest -> Test R²: 0.5094
RandomForest -> Test MSE: 4277.56
LassoCV      -> CV R²: 0.0279 ± 0.0016
LassoCV      -> Test R²: 0.0235
LassoCV      -> Test MSE: 8514.86
RidgeCV      -> CV R²: 0.0279 ± 0.0016
RidgeCV      -> Test R²: 0.0235
RidgeCV      -> Test MSE: 8514.87
KNN -> Best k: 3, CV R²: 0.0961
KNN -> Test R²: 0.1503
KNN -> Test MSE: 7408.58

===== Using features: Group3 =====
RandomForest -> CV R²: 0.4484 ± 0.0031
RandomForest -> Test R²: 0.4880
RandomForest -> Test MSE: 4464.65
LassoCV      -> CV R²: 0.0767 ± 0.0041
LassoCV      -> Test R²: 0.0683
LassoCV      -> Test MSE: 8124.32
RidgeCV      -> CV R²: 0.0767 ± 0.0041
RidgeCV      -> Test R²: 0.0683


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gra

LassoCV      -> CV R²: 0.4168 ± 0.0057


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


LassoCV      -> Test R²: 0.4133
LassoCV      -> Test MSE: 5115.50
RidgeCV      -> CV R²: 0.4168 ± 0.0058
RidgeCV      -> Test R²: 0.4133
RidgeCV      -> Test MSE: 5115.94


or set the environment variable OPENBLAS_NUM_THREADS to 128 or lower


In [None]:
y_train = df_train['price_per_person']  # or 'price_per_person'
y_test = df_test['price_per_person']

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# feature_groups is a dictionary: {'group1': features1, ...}
for name, features in feature_groups.items():
    print(f"\n===== Using features: {name} =====")
    X_train = df_train[features]
    X_test = df_test[features]
    
    # --- Random Forest ---
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring='r2')
    print(f"RandomForest -> CV R²: {rf_scores.mean():.4f} ± {rf_scores.std():.4f}")
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    print(f"RandomForest -> Test R²: {r2_score(y_test, y_pred_rf):.4f}")
    print(f"RandomForest -> Test MSE: {mean_squared_error(y_test, y_pred_rf):.2f}")
    
    # --- LassoCV ---
    lasso_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso', LassoCV(alphas=np.logspace(-3, 3, 13), cv=cv, random_state=42))
    ])
    lasso_scores = cross_val_score(lasso_pipeline, X_train, y_train, cv=cv, scoring='r2')
    print(f"LassoCV      -> CV R²: {lasso_scores.mean():.4f} ± {lasso_scores.std():.4f}")
    lasso_pipeline.fit(X_train, y_train)
    y_pred_lasso = lasso_pipeline.predict(X_test)
    print(f"LassoCV      -> Test R²: {r2_score(y_test, y_pred_lasso):.4f}")
    print(f"LassoCV      -> Test MSE: {mean_squared_error(y_test, y_pred_lasso):.2f}")
    
    # --- RidgeCV ---
    ridge_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('ridgecv', RidgeCV(alphas=np.logspace(-3, 3, 13), cv=cv))
    ])
    ridge_scores = cross_val_score(ridge_pipeline, X_train, y_train, cv=cv, scoring='r2')
    print(f"RidgeCV      -> CV R²: {ridge_scores.mean():.4f} ± {ridge_scores.std():.4f}")
    ridge_pipeline.fit(X_train, y_train)
    y_pred_ridge = ridge_pipeline.predict(X_test)
    print(f"RidgeCV      -> Test R²: {r2_score(y_test, y_pred_ridge):.4f}")
    print(f"RidgeCV      -> Test MSE: {mean_squared_error(y_test, y_pred_ridge):.2f}")
  

In [None]:
y_train = np.log(df_train['price_per_person'])  # or 'price_per_person'
y_test = np.log(df_test['price_per_person'])

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# feature_groups is a dictionary: {'group1': features1, ...}
for name, features in feature_groups.items():
    print(f"\n===== Using features: {name} =====")
    X_train = df_train[features]
    X_test = df_test[features]
    
    # --- Random Forest ---
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring='r2')
    print(f"RandomForest -> CV R²: {rf_scores.mean():.4f} ± {rf_scores.std():.4f}")
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    print(f"RandomForest -> Test R²: {r2_score(y_test, y_pred_rf):.4f}")
    print(f"RandomForest -> Test MSE: {mean_squared_error(y_test, y_pred_rf):.2f}")
    
    # --- LassoCV ---
    lasso_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso', LassoCV(alphas=np.logspace(-3, 3, 13), cv=cv, random_state=42))
    ])
    lasso_scores = cross_val_score(lasso_pipeline, X_train, y_train, cv=cv, scoring='r2')
    print(f"LassoCV      -> CV R²: {lasso_scores.mean():.4f} ± {lasso_scores.std():.4f}")
    lasso_pipeline.fit(X_train, y_train)
    y_pred_lasso = lasso_pipeline.predict(X_test)
    print(f"LassoCV      -> Test R²: {r2_score(y_test, y_pred_lasso):.4f}")
    print(f"LassoCV      -> Test MSE: {mean_squared_error(y_test, y_pred_lasso):.2f}")
    
    # --- RidgeCV ---
    ridge_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('ridgecv', RidgeCV(alphas=np.logspace(-3, 3, 13), cv=cv))
    ])
    ridge_scores = cross_val_score(ridge_pipeline, X_train, y_train, cv=cv, scoring='r2')
    print(f"RidgeCV      -> CV R²: {ridge_scores.mean():.4f} ± {ridge_scores.std():.4f}")
    ridge_pipeline.fit(X_train, y_train)
    y_pred_ridge = ridge_pipeline.predict(X_test)
    print(f"RidgeCV      -> Test R²: {r2_score(y_test, y_pred_ridge):.4f}")
    print(f"RidgeCV      -> Test MSE: {mean_squared_error(y_test, y_pred_ridge):.2f}")
    