In [42]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split, KFold
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, Ridge
from scipy import stats

# Seed passed as random_state to the K-fold shuffling, the random forest and
# the permutation importance so that repeated runs give the same results.
SEED = 42

In [43]:
# Load the raw listings table from the gzip-compressed CSV next to the notebook.
df = pd.read_csv('listings.csv.gz')
# Alternative uncompressed location, kept for reference:
#df = pd.read_csv('../data/listings.csv')

In [44]:
# Columns removed before modelling, grouped by what their names describe.
# The concatenation keeps the exact same columns in the exact same order.
irrelevant_columns = (
    # listing identifiers, scrape metadata and free text
    [
        'id', 'listing_url', 'scrape_id', 'last_scraped', 'source',
        'name', 'description', 'neighborhood_overview', 'picture_url',
    ]
    # host profile fields
    + [
        'host_id', 'host_url', 'host_name', 'host_location', 'host_about',
        'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
        'host_total_listings_count', 'host_verifications',
    ]
    # neighbourhood / property descriptors
    + [
        'neighbourhood', 'neighbourhood_group_cleansed', 'property_type',
        'bathrooms_text', 'amenities',
    ]
    # min/max night aggregates
    + [
        'minimum_minimum_nights', 'maximum_minimum_nights',
        'minimum_maximum_nights', 'maximum_maximum_nights',
        'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    ]
    # calendar, review and licence fields
    + [
        'calendar_updated', 'calendar_last_scraped', 'number_of_reviews_l30d',
        'first_review', 'license',
    ]
    # per-room-type host listing counts
    + [
        'calculated_host_listings_count_entire_homes',
        'calculated_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms',
    ]
)

# Work on a reduced copy; the raw frame `df` is left untouched.
df_cleaned = df.drop(columns=irrelevant_columns)

In [45]:
# Rows without a price cannot serve as a regression target, so drop them first.
df_cleaned = df_cleaned.dropna(subset=['price'])

# Remove the dollar sign and the thousands separators, then cast to float.
# The pattern is a raw string: '\$' in a normal literal is an invalid escape
# sequence and triggers a Deprecation/SyntaxWarning on recent Python versions.
df_cleaned['price'] = (
    df_cleaned['price']
    .replace({r'\$': '', ',': ''}, regex=True)
    .astype(float)
)

In [46]:
def convert_percentage_to_decimal(percentage_str):
    """Convert a percentage such as ``'95%'`` into a decimal fraction (``0.95``).

    Parameters
    ----------
    percentage_str : str, float, int or None
        * A string ending in ``'%'`` is parsed as a percentage and divided by 100
          (fractional percentages such as ``'87.5%'`` are accepted).
        * A missing value (``None`` / ``NaN``) is mapped to ``0.0``; this keeps
          the notebook's existing treatment of missing rates.
        * A value that is already numeric (or a numeric string without ``'%'``)
          is returned unchanged as a float, so applying the function twice to
          the same column does not wipe out the converted values.

    Returns
    -------
    float
        The decimal fraction.

    Raises
    ------
    ValueError
        If a string cannot be parsed as a number.
    """
    # Missing entries -> 0.0 (same outcome as before for NaN inputs).
    if pd.isna(percentage_str):
        return 0.0

    if isinstance(percentage_str, str):
        text = percentage_str.strip()
        if text.endswith('%'):
            # float() instead of int() so that '87.5%' does not raise.
            return float(text.rstrip('%')) / 100
        value = float(text)
    else:
        value = float(percentage_str)

    # A textual 'nan' parses to NaN; treat it like any other missing value.
    return 0.0 if value != value else value

In [47]:
# Convert both host rate columns with the same element-wise helper.
for rate_column in ('host_response_rate', 'host_acceptance_rate'):
    df_cleaned[rate_column] = df_cleaned[rate_column].apply(convert_percentage_to_decimal)

In [48]:
# Drop every column that is still stored with object dtype, keeping only the
# non-object columns for the numeric analysis below.
df_cleaned = df_cleaned.select_dtypes(exclude=['object'])

In [51]:
# Remove every row that still has a missing value in any remaining column.
df_cleaned = df_cleaned.dropna()

<class 'pandas.core.frame.DataFrame'>
Index: 6348 entries, 0 to 9995
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   host_response_rate              6348 non-null   float64
 1   host_acceptance_rate            6348 non-null   float64
 2   host_listings_count             6348 non-null   int64  
 3   latitude                        6348 non-null   float64
 4   longitude                       6348 non-null   float64
 5   accommodates                    6348 non-null   int64  
 6   bathrooms                       6348 non-null   float64
 7   bedrooms                        6348 non-null   float64
 8   beds                            6348 non-null   float64
 9   price                           6348 non-null   float64
 10  minimum_nights                  6348 non-null   int64  
 11  maximum_nights                  6348 non-null   int64  
 12  availability_30                 6348 no

#### Feature Selection

In [52]:
def scale(df):
    """Return a copy of ``df`` with every column except 'price' min-max scaled.

    A fresh ``MinMaxScaler`` is fitted on all non-price columns of ``df``.
    The result has a new default integer index, the scaled feature columns in
    their original order, and the untouched 'price' values appended as the
    last column.

    Raises ``ValueError`` if ``df`` has no 'price' column.
    """
    feature_names = df.columns.tolist()
    feature_names.remove('price')

    # Fit and transform in one step; the output is a plain ndarray.
    scaled_values = MinMaxScaler().fit_transform(df[feature_names])
    scaled = pd.DataFrame(scaled_values, columns=feature_names)

    # Positional assignment (.values) because the new frame has a fresh index.
    scaled['price'] = df['price'].values
    return scaled

In [63]:
def feature_selection(df, n_splits=30, n_repeats=50, random_state=None):
    """Score every feature for predicting 'price' with three fold-averaged metrics.

    For each K-fold split a random forest and an OLS model are fitted on the
    training part. Three quantities are accumulated and averaged over folds:

    * the random forest's ``feature_importances_``,
    * the OLS p-value of each feature (intercept excluded),
    * the permutation importance of the forest, measured on the test part.

    Note: features are min-max scaled with ``scale`` on the full frame before
    splitting, so the scaler sees the test folds as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame without missing values that contains a 'price' column.
    n_splits : int, default 30
        Number of shuffled K-fold splits.
    n_repeats : int, default 50
        Number of shuffles per feature in the permutation importance.
    random_state : int or None, default None
        Seed for the K-fold, the forest and the permutation importance.
        ``None`` falls back to the notebook-level ``SEED``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rf_df, ols_df, perm_df)``:
        ``rf_df`` has columns 'Feature' and 'RF Feature Importance' (descending),
        ``ols_df`` is indexed by feature name with column 'OLS P-Value'
        (ascending, rounded to 5 decimals),
        ``perm_df`` has columns 'Feature' and 'Permutation Importance' (descending).
    """
    if random_state is None:
        random_state = SEED

    # scale the features first
    df = scale(df)

    X = df.drop('price', axis=1).copy()
    y = df['price'].copy()
    feature_names = X.columns

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rf_importances = np.zeros(X.shape[1])
    ols_pvalues = np.zeros(X.shape[1])
    perm_importances = np.zeros(X.shape[1])

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        rf_model = RandomForestRegressor(random_state=random_state)
        rf_model.fit(X_train, y_train)
        rf_importances += rf_model.feature_importances_

        ols_model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
        # Pick the p-values by feature name (this leaves out the intercept) and
        # add them as a plain ndarray. Adding the pandas Series in place made
        # the accumulator's type - and therefore the index of the returned
        # frame - depend on the installed numpy/pandas versions.
        ols_pvalues += ols_model.pvalues.reindex(feature_names).to_numpy()

        perm_importance = permutation_importance(
            rf_model, X_test, y_test, n_repeats=n_repeats, random_state=random_state
        )
        perm_importances += perm_importance.importances_mean

    # average scores
    n_folds = kf.get_n_splits()
    rf_importances /= n_folds
    ols_pvalues = (ols_pvalues / n_folds).round(5)
    perm_importances /= n_folds

    importance_df1 = pd.DataFrame({
        'Feature': feature_names,
        'RF Feature Importance': rf_importances,
    }).sort_values(by='RF Feature Importance', ascending=False)

    # Explicit feature-name index so each p-value row is always labelled.
    importance_df_ols = pd.DataFrame(
        {'OLS P-Value': ols_pvalues},
        index=feature_names,
    ).sort_values(by='OLS P-Value', ascending=True)

    perm_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Permutation Importance': perm_importances
    }).sort_values(by='Permutation Importance', ascending=False)

    return importance_df1, importance_df_ols, perm_importance_df

In [64]:
# Run the cross-validated scoring once; this fits one random forest and one
# OLS model per fold, so the cell can take a while.
df_rf, df_ols, df_rf_perm = feature_selection(df_cleaned)

In [65]:
# Fold-averaged random-forest feature importances, highest first.
df_rf

Unnamed: 0,Feature,RF Feature Importance
9,minimum_nights,0.097574
10,maximum_nights,0.092554
7,bedrooms,0.088508
4,longitude,0.08731
3,latitude,0.086387
25,reviews_per_month,0.079716
5,accommodates,0.070075
15,number_of_reviews,0.06332
13,availability_90,0.044718
14,availability_365,0.038345


Decide which features to keep by setting a threshold on the importance score (for example 0.01 or 0.05). The importances are averaged over the K folds to reduce the influence of any single random split, which gives a more representative view.

In [66]:
# Fold-averaged OLS p-values per feature, smallest first.
df_ols

Unnamed: 0,OLS P-Value
latitude,0.0
longitude,0.0
accommodates,0.0
bedrooms,0.0
host_response_rate,6e-05
bathrooms,0.00011
review_scores_rating,0.00349
maximum_nights,0.00848
review_scores_value,0.03779
availability_60,0.03927


Keep the features whose OLS p-value, averaged over the folds, is below 0.05.

In [67]:
# Fold-averaged permutation importances of the random forest, highest first.
df_rf_perm

Unnamed: 0,Feature,Permutation Importance
15,number_of_reviews,0.564527
7,bedrooms,0.456494
10,maximum_nights,0.324794
9,minimum_nights,0.32069
25,reviews_per_month,0.223678
5,accommodates,0.142367
4,longitude,0.074279
3,latitude,0.068563
14,availability_365,0.059456
6,bathrooms,0.058178


"The concept is really straightforward: We measure the importance of a feature by calculating the increase in the model’s prediction error after permuting the feature. A feature is “important” if shuffling its values increases the model error, because in this case the model relied on the feature for the prediction. A feature is “unimportant” if shuffling its values leaves the model error unchanged, because in this case the model ignored the feature for the prediction" -- Analysis done on the test set