In [1]:
import warnings
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import os


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
warnings.filterwarnings('ignore')

In [2]:
def RandomForestRegression(directory, param, target='price_num'):
    """Fit a random forest on one listings file and report how well it fits.

    NOTE(review): a later cell in this notebook defines another function with
    the same name; once that cell runs it replaces this definition. Keeping a
    single definition and passing ``target`` explicitly would avoid the clash.

    Parameters
    ----------
    directory : str
        Path to a gzip-compressed listings CSV (read with ``pd.read_csv``).
    param : dict or None
        Keyword arguments for ``RandomForestRegressor``. Any key that is not
        supplied falls back to the defaults below (n_estimators=100,
        max_depth=7, min_samples_split=10, min_samples_leaf=2,
        random_state=42).
    target : str, default 'price_num'
        Column to predict. Besides the raw CSV columns, the derived columns
        ``price_num``, ``taken_30``, ``taken_60``, ``sales_30`` and
        ``sales_60`` are available.

    Returns
    -------
    tuple
        ``(feature_importances, train_r2, test_r2)`` where
        ``feature_importances`` is a ``pd.Series`` indexed by feature name and
        sorted in descending order, and the two scores are R^2 on the training
        and held-out (20%) rows.
    """
    numeric_features = [
        'latitude', 'longitude', 'accommodates', 'beds',
        'minimum_nights', 'maximum_nights',
        'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
        'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_location',
        'review_scores_value',
        'calculated_host_listings_count',
        'calculated_host_listings_count_entire_homes',
        'calculated_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms',
        'reviews_per_month', 'host_days_since', 'host_desc_len',
    ]
    # A feature can never also be the target.
    numeric_features = [col for col in numeric_features if col != target]

    df = pd.read_csv(directory, compression='gzip')

    # Strip '$' and ',' from the price text. Raw string: '\$' inside a plain
    # literal is an invalid escape sequence.
    df['price_num'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # Derived columns that may be used as the target.
    df['taken_30'] = 30 - df['availability_30']
    df['taken_60'] = 60 - df['availability_60']
    df['sales_30'] = df['taken_30'] * df['price_num']
    df['sales_60'] = df['taken_60'] * df['price_num']

    # Host features. The reference date is fixed; presumably it is the date
    # the data was captured -- TODO confirm.
    reference_date = pd.to_datetime('2024-04-04')
    df['host_days_since'] = (reference_date - pd.to_datetime(df['host_since'])).dt.days
    df['host_desc_len'] = [0 if pd.isna(text) else len(text) for text in df['host_about']]

    # Drop high-side outliers of the target (above Q3 + 1.5 * IQR). Rows with
    # a missing target fail the comparison and are dropped as well.
    q1 = df[target].quantile(0.25)
    q3 = df[target].quantile(0.75)
    upper_bound = q3 + 1.5 * (q3 - q1)
    df_filtered = df[df[target] <= upper_bound]

    # Keep the ten most frequent neighbourhoods and "entire ..." property types.
    top_10 = df_filtered['neighbourhood_cleansed'].value_counts().head(10).index
    in_top_10 = df_filtered['neighbourhood_cleansed'].isin(top_10)
    is_entire = df_filtered['property_type'].str.contains('entire', case=False, na=False)
    subset = df_filtered[in_top_10 & is_entire]

    model_df = subset[[target] + numeric_features + ['neighbourhood_cleansed']].dropna()

    # One-hot encode the neighbourhood; the dummy columns are appended after
    # the numeric features.
    X = pd.get_dummies(model_df.drop(columns=[target]), columns=['neighbourhood_cleansed'])
    y = model_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only so that no statistic of the
    # held-out rows reaches the model; the target is left unscaled.
    scaler = StandardScaler().fit(X_train[numeric_features])

    def _scale(frame):
        # Return a copy of `frame` with its numeric feature columns standardized.
        scaled = frame.copy()
        scaled[numeric_features] = scaler.transform(frame[numeric_features])
        return scaled

    X_train, X_test = _scale(X_train), _scale(X_test)

    # Caller-supplied hyperparameters override the defaults.
    hyperparams = {
        'n_estimators': 100,
        'max_depth': 7,
        'min_samples_split': 10,
        'min_samples_leaf': 2,
        'random_state': 42,
    }
    hyperparams.update(param or {})
    model = RandomForestRegressor(**hyperparams)
    model.fit(X_train, y_train)

    feature_importances = pd.Series(
        model.feature_importances_, index=X_train.columns
    ).sort_values(ascending=False)

    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))

    return feature_importances, train_r2, test_r2

In [3]:
def RandomForestRegression(directory, param, target='taken_30'):
    """Fit a random forest on one listings file and report how well it fits.

    By default the model predicts ``taken_30`` (30 minus ``availability_30``).

    NOTE(review): an earlier cell in this notebook defines a function with the
    same name (predicting ``price_num``); this definition replaces it when the
    cell runs. Keeping a single definition and passing ``target`` explicitly
    would avoid the clash.

    Parameters
    ----------
    directory : str
        Path to a gzip-compressed listings CSV (read with ``pd.read_csv``).
    param : dict or None
        Keyword arguments for ``RandomForestRegressor``. Any key that is not
        supplied falls back to the defaults below (n_estimators=100,
        max_depth=7, min_samples_split=10, min_samples_leaf=2,
        random_state=42).
    target : str, default 'taken_30'
        Column to predict. Besides the raw CSV columns, the derived columns
        ``price_num``, ``taken_30``, ``taken_60``, ``sales_30`` and
        ``sales_60`` are available.

    Returns
    -------
    tuple
        ``(feature_importances, train_r2, test_r2)`` where
        ``feature_importances`` is a ``pd.Series`` indexed by feature name and
        sorted in descending order, and the two scores are R^2 on the training
        and held-out (20%) rows.
    """
    numeric_features = [
        'latitude', 'longitude', 'accommodates', 'beds',
        'minimum_nights', 'maximum_nights',
        'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
        'review_scores_rating', 'review_scores_accuracy',
        'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_location',
        'review_scores_value',
        'calculated_host_listings_count',
        'calculated_host_listings_count_entire_homes',
        'calculated_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms',
        'reviews_per_month', 'host_days_since', 'host_desc_len',
    ]
    # A feature can never also be the target.
    numeric_features = [col for col in numeric_features if col != target]

    df = pd.read_csv(directory, compression='gzip')

    # Strip '$' and ',' from the price text. Raw string: '\$' inside a plain
    # literal is an invalid escape sequence.
    df['price_num'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # Derived columns that may be used as the target.
    df['taken_30'] = 30 - df['availability_30']
    df['taken_60'] = 60 - df['availability_60']
    df['sales_30'] = df['taken_30'] * df['price_num']
    df['sales_60'] = df['taken_60'] * df['price_num']

    # Host features. The reference date is fixed; presumably it is the date
    # the data was captured -- TODO confirm.
    reference_date = pd.to_datetime('2024-04-04')
    df['host_days_since'] = (reference_date - pd.to_datetime(df['host_since'])).dt.days
    df['host_desc_len'] = [0 if pd.isna(text) else len(text) for text in df['host_about']]

    # Drop high-side outliers of the target (above Q3 + 1.5 * IQR). Rows with
    # a missing target fail the comparison and are dropped as well.
    q1 = df[target].quantile(0.25)
    q3 = df[target].quantile(0.75)
    upper_bound = q3 + 1.5 * (q3 - q1)
    df_filtered = df[df[target] <= upper_bound]

    # Keep the ten most frequent neighbourhoods and "entire ..." property types.
    top_10 = df_filtered['neighbourhood_cleansed'].value_counts().head(10).index
    in_top_10 = df_filtered['neighbourhood_cleansed'].isin(top_10)
    is_entire = df_filtered['property_type'].str.contains('entire', case=False, na=False)
    subset = df_filtered[in_top_10 & is_entire]

    model_df = subset[[target] + numeric_features + ['neighbourhood_cleansed']].dropna()

    # One-hot encode the neighbourhood; the dummy columns are appended after
    # the numeric features.
    X = pd.get_dummies(model_df.drop(columns=[target]), columns=['neighbourhood_cleansed'])
    y = model_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only so that no statistic of the
    # held-out rows reaches the model; the target is left unscaled.
    scaler = StandardScaler().fit(X_train[numeric_features])

    def _scale(frame):
        # Return a copy of `frame` with its numeric feature columns standardized.
        scaled = frame.copy()
        scaled[numeric_features] = scaler.transform(frame[numeric_features])
        return scaled

    X_train, X_test = _scale(X_train), _scale(X_test)

    # Caller-supplied hyperparameters override the defaults.
    hyperparams = {
        'n_estimators': 100,
        'max_depth': 7,
        'min_samples_split': 10,
        'min_samples_leaf': 2,
        'random_state': 42,
    }
    hyperparams.update(param or {})
    model = RandomForestRegressor(**hyperparams)
    model.fit(X_train, y_train)

    feature_importances = pd.Series(
        model.feature_importances_, index=X_train.columns
    ).sort_values(ascending=False)

    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))

    return feature_importances, train_r2, test_r2

In [4]:
# Root folder that holds one sub-folder per city, each with a listings.csv.gz.
directory_path = './data/'
# os.listdir returns entries in arbitrary order; sort so runs are reproducible.
entries = sorted(os.listdir(directory_path))

# Hyperparameters handed to RandomForestRegression for every city.
params = {
    'n_estimators': 100,
    'max_depth': 7,
    'min_samples_split': 10,
    'min_samples_leaf': 2
}

# results[city] -> {'feature_importance': Series, 'train_r2': float, 'test_r2': float}
results = {}

# Fit one model per city folder that actually contains a listings file.
for entry in entries:
    full_path = os.path.join(directory_path, entry, 'listings.csv.gz')

    # Stray files or folders without data would otherwise abort the whole run.
    if not os.path.isfile(full_path):
        print(f"{entry} skipped (no listings.csv.gz)")
        continue

    feature, train_r2, test_r2 = RandomForestRegression(full_path, params)

    results[entry] = {
        'feature_importance': feature,
        'train_r2': train_r2,
        'test_r2': test_r2,
    }

    print(f"{entry} complete")


LosAngeles complete
NewYork complete
Paris complete
Rome complete
Seattle complete
Singapore complete
WashingtonDC complete


In [5]:
# Held-out R^2 for every city, in the order the cities were processed.
[(city, outcome['test_r2']) for city, outcome in results.items()]

[('LosAngeles', 0.1030757061958073),
 ('NewYork', 0.1449784997969492),
 ('Paris', 0.140543386034386),
 ('Rome', 0.12810739876601962),
 ('Seattle', 0.16527351814092373),
 ('Singapore', 0.23078877481849147),
 ('WashingtonDC', 0.07617059004075877)]

In [6]:
# Training R^2 for every city, in the order the cities were processed.
[(city, outcome['train_r2']) for city, outcome in results.items()]

[('LosAngeles', 0.21754317781838794),
 ('NewYork', 0.27588499254896226),
 ('Paris', 0.1947165751975366),
 ('Rome', 0.20688008671902325),
 ('Seattle', 0.41635011212861595),
 ('Singapore', 0.6718576388061162),
 ('WashingtonDC', 0.3862194438123988)]

In [7]:
# Ten largest feature importances for every city.
[(city, outcome['feature_importance'].head(10)) for city, outcome in results.items()]

[('LosAngeles',
  calculated_host_listings_count    0.113060
  reviews_per_month                 0.104788
  latitude                          0.089332
  host_days_since                   0.080341
  minimum_nights                    0.066419
  review_scores_value               0.063612
  longitude                         0.058923
  number_of_reviews_ltm             0.046604
  review_scores_accuracy            0.044014
  maximum_nights                    0.038819
  dtype: float64),
 ('NewYork',
  minimum_nights                                 0.174774
  host_days_since                                0.074633
  number_of_reviews_ltm                          0.066715
  maximum_nights                                 0.062280
  latitude                                       0.061333
  longitude                                      0.056496
  calculated_host_listings_count_entire_homes    0.055767
  number_of_reviews_l30d                         0.053741
  reviews_per_month                   