In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from keras import models, layers, optimizers, regularizers
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])


### From our work on the seattle data, we've determined what the best columns are. We'll keep those for this dataset

In [2]:
df = pd.read_csv('san_francisco_listings.csv')
df = df.rename(columns={'neighbourhood_cleansed':'neighbourhood_group'})
print(df.columns)
cols_to_keep = ['host_response_time', 'host_response_rate', 'host_has_profile_pic',
       'neighbourhood_group', 'zipcode', 'market', 'smart_location',
       'latitude', 'longitude', 'is_location_exact', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price',
       'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'calendar_updated',
       'has_availability', 'availability_90', 'number_of_reviews', 'first_review', 'last_review',
       'requires_license', 'instant_bookable', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification', 'security_deposit', 'cleaning_fee']


df = df[cols_to_keep]

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       ...
       'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
      dtype='object', length=106)


In [3]:
len(df.columns)


35

In [6]:
print(df.isna().sum())

host_response_time                   927
host_response_rate                   927
host_has_profile_pic                   8
neighbourhood_group                    0
zipcode                              245
market                                21
smart_location                         0
latitude                               0
longitude                              0
is_location_exact                      0
property_type                          0
room_type                              0
accommodates                           0
bathrooms                             12
bedrooms                               4
beds                                   9
bed_type                               0
price                                  0
guests_included                        0
extra_people                           0
minimum_nights                         0
maximum_nights                         0
calendar_updated                       0
has_availability                       0
availability_90 

In [7]:
useless = ['host_response_time', 'host_response_rate', 'zipcode', 
           'first_review', 'last_review', 'cleaning_fee']
df = df.drop(useless, axis=1)

In [8]:
df.columns

Index(['host_has_profile_pic', 'neighbourhood_group', 'market',
       'smart_location', 'latitude', 'longitude', 'is_location_exact',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'price', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'calendar_updated',
       'has_availability', 'availability_90', 'number_of_reviews',
       'requires_license', 'instant_bookable', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'security_deposit'],
      dtype='object')

### Now, we'll perform the same feature engineering and test the same neural networks to confirm neural nets

In [11]:
for col in ['bathrooms', 'bedrooms', 'beds']:
    df[col].fillna(df[col].mean(), inplace=True)

df.price = df.price.str[1:-3]
df.price = df.price.str.replace(",", "")
df.price = df.price.str.replace(".", "") # decimal at end
df.price = df.price.astype('int64')

df.extra_people = df.extra_people.str[1:-3]
df.extra_people = df.extra_people.str.replace(",", "")
df.extra_people = df.extra_people.str.replace(".", "")
df.extra_people = df.extra_people.astype('int64')

# df.cleaning_fee = df.cleaning_fee.str[1:-3]
# df.cleaning_fee = df.cleaning_fee.str.replace(",", "")
# df.cleaning_fee = df.cleaning_fee.str.replace(".", "")
# df.cleaning_fee = df.cleaning_fee.astype('float64')

  df.price = df.price.str.replace(".", "") # decimal at end
  df.extra_people = df.extra_people.str.replace(".", "")


In [12]:
# for col in ['bathrooms', 'bedrooms', 'beds']:
#     df[col].fillna(df[col].mean(), inplace=True)
    
# df.price = df.price.str[1:-3]
# df.price = df.price.str.replace(",", "")
# df.price = df.price.str.replace(".", "") # decimal at end
# df.price = df.price.astype('int64')

# df.extra_people = df.extra_people.str[1:-3]
# df.extra_people = df.extra_people.str.replace(",", "")
# df.extra_people = df.extra_people.str.replace(".", "")
# df.extra_people = df.extra_people.astype('int64')

In [13]:
df['property_type'] = df.property_type.replace({
    'Townhouse': 'House',
    'Bungalow': 'House',
    'Loft': 'Apartment',
    'Chalet': 'Cabin',
    'Tiny house': 'House',
    'Earth house': 'House',
    'Condominium': 'Apartment',
    'Boutique hotel': 'Hotel',
    'Aparthotel': 'Hotel',
    'Serviced apartment': 'Apartment'
    })

In [14]:
transformed_df = pd.get_dummies(df)

# to_drop = ['beds',
#            'bedrooms',
#            'guests_included',
#            'room_type_Private room']
# to_drop.extend(list(transformed_df.columns[transformed_df.columns.str.endswith('nan')]))
# transformed_df = transformed_df.drop(to_drop, axis=1, inplace=False)

numerical_columns = ['accommodates', 'availability_90', 'bathrooms', 'extra_people',
                     'maximum_nights', 'minimum_nights', 'number_of_reviews',
                     'price']

numerical_columns = [i for i in numerical_columns if i not in ['availability_90']]# Removing items not to be transformed

for col in numerical_columns:
    transformed_df[col] = transformed_df[col].astype('float64').replace(0.0, 0.01) # Replacing 0s with 0.01 because log(0) undefined
    transformed_df[col] = np.log(transformed_df[col])

In [15]:
X = transformed_df.drop('price', axis=1)
y = transformed_df.price

scaler = StandardScaler()
print(X.columns)
X = pd.DataFrame(scaler.fit_transform(X), columns=list(X.columns))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

Index(['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'guests_included', 'extra_people', 'minimum_nights',
       'maximum_nights',
       ...
       'security_deposit_$850.00 ', 'security_deposit_$899.00 ',
       'security_deposit_$900.00 ', 'security_deposit_$95.00 ',
       'security_deposit_$950.00 ', 'security_deposit_$960.00 ',
       'security_deposit_$975.00 ', 'security_deposit_$990.00 ',
       'security_deposit_$995.00 ', 'security_deposit_$999.00 '],
      dtype='object', length=266)


In [16]:
def nn_model_evaluation(model, skip_epochs=0, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    print("Training MSE:", round(mean_squared_error(y_train, y_train_pred),4))
    print("Validation MSE:", round(mean_squared_error(y_test, y_test_pred),4))
    print("\nTraining r2:", round(r2_score(y_train, y_train_pred),4))
    print("Validation r2:", round(r2_score(y_test, y_test_pred),4))

### Now, we can test the same neural networks to see what happens

In [17]:
def ModelResults(classifier):
    classifier.fit(X_train, y_train)
    train_predict = classifier.predict(X_train)
    test_predict = classifier.predict(X_test)
    print("\nR^2 Train:", r2_score(y_train, train_predict))
    print("R^2 Test:", r2_score(y_test, test_predict))
    
ETR = ExtraTreesRegressor(max_depth=17, n_estimators=100, n_jobs=-1)
RFR = RandomForestRegressor(max_depth=17, n_estimators=100, n_jobs=-1)
GBR = GradientBoostingRegressor(n_estimators=200, learning_rate=0.2)
ModelResults(ETR)
ModelResults(RFR)
ModelResults(GBR)


R^2 Train: 0.958037509857866
R^2 Test: 0.7191792936689214

R^2 Train: 0.9382484553733835
R^2 Test: 0.725731831965202

R^2 Train: 0.8061763128584237
R^2 Test: 0.7137513726471116


In [14]:
nn4 = models.Sequential()
nn4.add(layers.Dense(128, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l1(0.01), activation='relu'))
nn4.add(layers.Dense(256, kernel_regularizer=regularizers.l1(0.01), activation='relu'))
nn4.add(layers.Dense(256, kernel_regularizer=regularizers.l1(0.01), activation='relu'))
nn4.add(layers.Dense(512, kernel_regularizer=regularizers.l1(0.01), activation='relu'))
nn4.add(layers.Dense(1, activation='linear'))

# Compiling the model
nn4.compile(loss='mean_squared_error',
            optimizer='SGD',
            metrics=['mean_squared_error'])

nn4_history = nn4.fit(X_train,
                  y_train,
                  epochs=200,
                  batch_size=256,
                  validation_split = 0.1)

nn_model_evaluation(nn4)

Train on 5839 samples, validate on 649 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200


Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200


Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200


Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
Training MSE: 0.2115
Validation MSE: 0.1803

Training r2: 0.6075
Validation r2: 0.6395


In [11]:
nn4 = models.Sequential()
nn4.add(layers.Dense(128, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l1(0.01), activation='relu'))
nn4.add(layers.Dense(256, kernel_regularizer=regularizers.l1(0.01), activation='relu'))
nn4.add(layers.Dense(512, kernel_regularizer=regularizers.l1(0.01), activation='relu'))
nn4.add(layers.Dense(1, activation='relu'))

# Compiling the model
nn4.compile(loss='mean_squared_error',
            optimizer='adam',
            metrics=['mean_squared_error'])

nn4_history = nn4.fit(X_train,
                  y_train,
                  epochs=150,
                  batch_size=256,
                  validation_split = 0.1)

nn_model_evaluation(nn4)

Train on 5839 samples, validate on 649 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150


Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150


Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150


Epoch 148/150
Epoch 149/150
Epoch 150/150
Training MSE: 0.1965
Validation MSE: 0.1669

Training r2: 0.6352
Validation r2: 0.6663


In [12]:
nn4 = models.Sequential()
nn4.add(layers.Dense(128, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l1(0.02), activation='relu'))
nn4.add(layers.Dense(256, kernel_regularizer=regularizers.l1(0.02), activation='relu'))
nn4.add(layers.Dense(512, kernel_regularizer=regularizers.l1(0.02), activation='relu'))
nn4.add(layers.Dense(1, activation='relu'))

# Compiling the model
nn4.compile(loss='mean_squared_error',
            optimizer='adam',
            metrics=['mean_squared_error'])

nn4_history = nn4.fit(X_train,
                  y_train,
                  epochs=150,
                  batch_size=256,
                  validation_split = 0.1)

nn_model_evaluation(nn4)

Train on 5839 samples, validate on 649 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150


Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150


Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150
Training MSE: 0.2129
Validation MSE: 0.1812

Training r2: 0.6049
Validation r2: 0.6376
