In [4]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

drive.mount('/content/drive')
data = pd.read_csv('/content/drive/My Drive/BT4222 Data Mining/Code & Data/cleaned_data.csv', na_filter=False)
data.drop(['Unnamed: 0', 'property_type'], axis=1, inplace=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,latitude,longitude,room_type,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,availability_30,number_of_reviews,review_scores_rating,instant_bookable,reviews_per_month,state,has_license,bathrooms,bathroom_type,num_of_amenities,essentials,luxury,appliances,comfort,entertainment,security,furniture,miscellaneous,description_sentiment,neighborhood_overview_sentiment,host_gender,sentiment_mean_score
0,within an hour,extremely responsive,low,0.0,1.0,32.80751,-117.2576,Entire home/apt,8,2.0,3.0,225.0,4,365,0,88,4.7 to 4.8,0,0.59,San Diego,0,2.0,baths,21,2,2,3,1,2,2,0,9,Slightly Positive,,male,0.879871
1,within an hour,extremely responsive,average,0.0,1.0,32.74217,-117.21931,Private room,1,1.0,1.0,113.0,1,21,20,149,4.3 to 4.4,0,1.02,San Diego,0,1.0,shared,29,4,3,2,2,2,2,1,13,Slightly Positive,Slightly Positive,female,0.79753
2,within an hour,extremely responsive,extremely high,1.0,1.0,32.79783,-117.25416,Entire home/apt,7,1.0,5.0,258.0,6,365,0,162,4.7 to 4.8,1,1.2,San Diego,0,2.5,baths,41,2,4,8,1,1,3,1,21,Neutral,Slightly Positive,female,0.843751
3,within an hour,extremely responsive,extremely high,1.0,1.0,32.80751,-117.25728,Entire home/apt,8,1.0,6.0,336.0,6,90,16,183,4.7 to 4.8,1,1.38,San Diego,0,2.0,baths,52,4,5,8,1,2,3,3,26,Slightly Positive,Neutral,female,0.860047
4,within an hour,extremely responsive,high,1.0,1.0,32.81301,-117.26856,Entire home/apt,3,2.0,3.0,333.0,5,120,6,296,4.9 to 5.0,0,2.08,San Diego,0,1.0,bath,59,6,3,7,1,2,2,2,36,Neutral,Slightly Positive,female,0.903197


In [6]:
data.dtypes

host_response_time                  object
host_response_rate                  object
host_acceptance_rate                object
host_is_superhost                  float64
host_identity_verified             float64
latitude                           float64
longitude                          float64
room_type                           object
accommodates                         int64
bedrooms                           float64
beds                               float64
price                              float64
minimum_nights                       int64
maximum_nights                       int64
availability_30                      int64
number_of_reviews                    int64
review_scores_rating                object
instant_bookable                     int64
reviews_per_month                  float64
state                               object
has_license                          int64
bathrooms                          float64
bathroom_type                       object
num_of_amen

In [18]:
numeric_columns = []
categorical_columns = []
boolean_columns = ['host_is_superhost', 'host_identity_verified', 'has_license', 'instant_bookable']
ordered_columns = ['host_response_time', 'host_response_rate', 'host_acceptance_rate', 'review_scores_rating', 'description_sentiment', 'neighborhood_overview_sentiment']

column_types = data.dtypes
for i in range(len(column_types)):
  if column_types[i] == 'object' and column_types.index[i] not in ordered_columns:
    categorical_columns.append(column_types.index[i])
  elif column_types.index[i] not in boolean_columns and column_types[i] != 'object':
    numeric_columns.append(column_types.index[i])

print(numeric_columns)
print(categorical_columns)
print(boolean_columns)
print(ordered_columns)

['latitude', 'longitude', 'accommodates', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'availability_30', 'number_of_reviews', 'reviews_per_month', 'bathrooms', 'num_of_amenities', 'essentials', 'luxury', 'appliances', 'comfort', 'entertainment', 'security', 'furniture', 'miscellaneous', 'sentiment_mean_score']
['room_type', 'state', 'bathroom_type', 'host_gender']
['host_is_superhost', 'host_identity_verified', 'has_license', 'instant_bookable']
['host_response_time', 'host_response_rate', 'host_acceptance_rate', 'review_scores_rating', 'description_sentiment', 'neighborhood_overview_sentiment']


In [8]:
data[data.select_dtypes('object').columns].nunique().reset_index(name='cardinality')

Unnamed: 0,index,cardinality
0,host_response_time,5
1,host_response_rate,7
2,host_acceptance_rate,7
3,room_type,4
4,review_scores_rating,34
5,state,8
6,bathroom_type,6
7,description_sentiment,8
8,neighborhood_overview_sentiment,8
9,host_gender,3


In [43]:
from sklearn.model_selection import train_test_split

train, other = train_test_split(data, train_size=0.7)
val, test = train_test_split(other, test_size=0.5)
train.reset_index(inplace=True)
val.reset_index(inplace=True)
test.reset_index(inplace=True)

In [44]:
X_train = train.drop(['price'], axis=1)
y_train = train['price']

X_val = val.drop(['price'], axis=1)
y_val = val['price']

X_test = test.drop(['price'], axis=1)
y_test = test['price']

# Using One-Hot Encoding

In [51]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

enc_X_train = pd.DataFrame()
enc_X_val = pd.DataFrame()
enc_X_test = pd.DataFrame()

for column in X_train.columns:
  if column in categorical_columns:
    enc = OneHotEncoder(handle_unknown='ignore')
    enc_col_train = enc.fit_transform(X_train[[column]])
    feature_names = enc.get_feature_names_out()
    enc_X_train[feature_names] = pd.DataFrame.sparse.from_spmatrix(enc_col_train, columns=feature_names)

    enc_col_val = enc.transform(X_val[[column]])
    enc_X_val[feature_names] = pd.DataFrame.sparse.from_spmatrix(enc_col_val, columns=feature_names)

    enc_col_test = enc.transform(X_val[[column]])
    enc_X_test[feature_names] = pd.DataFrame.sparse.from_spmatrix(enc_col_test, columns=feature_names)
  elif column in numeric_columns:
    scaler = StandardScaler()
    transformed_col_train = scaler.fit_transform(X_train[[column]])
    enc_X_train[column] = transformed_col_train

    transformed_col_val = scaler.transform(X_val[[column]])
    enc_X_val[column] = transformed_col_val

    transformed_col_test = scaler.transform(X_test[[column]])
    enc_X_test[column] = transformed_col_test
  elif column in boolean_columns:
    enc_X_train[column] = X_train[[column]]
    enc_X_val[column] = X_val[[column]]
    enc_X_test[column] = X_test[[column]]

for column in ordered_columns:
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    enc_col_train = enc.fit_transform(X_train[[column]])
    enc_X_train[column] = enc_col_train

    enc_col_val = enc.transform(X_val[[column]])
    enc_X_val[column] = enc_col_val
    
    enc_col_test = enc.transform(X_test[[column]])
    enc_X_test[column] = enc_col_test

In [52]:
enc_X_train.head()

Unnamed: 0,host_is_superhost,host_identity_verified,latitude,longitude,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,accommodates,bedrooms,beds,minimum_nights,maximum_nights,availability_30,number_of_reviews,instant_bookable,reviews_per_month,state_Los Angeles,state_Oakland,state_Pacific Grove,state_San Diego,state_San Francisco,state_San Mateo County,state_Santa Clara County,state_Santa Cruz County,has_license,bathrooms,bathroom_type_NA,bathroom_type_bath,bathroom_type_baths,bathroom_type_half-bath,bathroom_type_private,bathroom_type_shared,num_of_amenities,essentials,luxury,appliances,comfort,entertainment,security,furniture,miscellaneous,host_gender_female,host_gender_male,host_gender_unknown,sentiment_mean_score,host_response_time,host_response_rate,host_acceptance_rate,review_scores_rating,description_sentiment,neighborhood_overview_sentiment
0,0.0,1.0,-0.352883,0.355082,1.0,0.0,0.0,0.0,1.015209,1.180123,0.412326,-0.368554,-0.048206,0.319855,-0.512596,1,-0.190145,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.430007,0.0,0.0,1.0,0.0,0.0,0.0,0.133493,1.032952,-0.784872,0.711204,0.774702,-0.388282,-0.405614,-0.761956,0.025229,0.0,0.0,1.0,0.734743,4.0,2.0,2.0,21.0,4.0,0.0
1,0.0,1.0,-0.304823,0.240874,0.0,0.0,1.0,0.0,-0.029406,0.279666,-0.1522,0.492833,0.130383,-1.010235,0.048695,0,-0.423814,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.430007,0.0,0.0,0.0,0.0,0.0,1.0,-0.342971,-1.010104,1.354526,-0.442271,0.774702,0.885145,-0.405614,-0.761956,-0.432887,1.0,0.0,0.0,0.868816,0.0,0.0,0.0,30.0,4.0,2.0
2,1.0,1.0,-1.06757,1.00047,1.0,0.0,0.0,0.0,2.059824,1.180123,0.412326,-0.434815,-0.127396,0.497201,-0.288079,1,0.831479,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.430007,0.0,0.0,1.0,0.0,0.0,0.0,2.447745,1.032952,-0.071739,1.095696,0.774702,0.885145,0.754637,3.625324,2.888454,1.0,0.0,0.0,0.724163,4.0,1.0,2.0,30.0,4.0,2.0
3,0.0,1.0,1.754781,-1.753646,1.0,0.0,0.0,0.0,1.363414,2.08058,1.541379,-0.401685,-0.127396,0.319855,-0.525069,1,-0.733562,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.430007,0.0,0.0,1.0,0.0,0.0,0.0,1.018354,1.032952,0.641393,0.711204,0.774702,0.885145,-0.405614,1.870412,0.826932,0.0,1.0,0.0,-1.696726,4.0,1.0,2.0,31.0,4.0,2.0
4,1.0,1.0,-0.292896,0.254712,1.0,0.0,0.0,0.0,-0.725817,-0.620791,-0.716726,-0.467945,-0.126926,-0.832889,-0.21324,0,1.505316,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,-0.558052,0.0,1.0,0.0,0.0,0.0,0.0,0.405758,1.032952,0.641393,-0.826763,0.774702,-0.388282,-0.405614,0.992956,0.483345,0.0,0.0,1.0,0.50771,4.0,1.0,2.0,30.0,4.0,5.0


In [53]:
enc_X_train.shape

(56423, 52)

In [73]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout, BatchNormalization

model1 = Sequential([
    Input(shape=(enc_X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

model1.compile(optimizer='adam', 
              loss='mae',
              metrics=['mae', 'mse'])

In [74]:
model1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 32)                1696      
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,729
Trainable params: 1,729
Non-trainable params: 0
_________________________________________________________________


In [75]:
EPOCHS = 50
BATCH_SIZE = 256
STEPS = X_train.shape[0] // BATCH_SIZE

history = model1.fit(
  enc_X_train, y_train,
  validation_data=(enc_X_val, y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [76]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model1.predict(enc_X_val)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2_square = r2_score(y_val, y_pred)
print('\nMAE:', mae)
print('\nMSE:', mse)
print('\nRMSE:', rmse)
print('\nR2 Square', r2_square)


MAE: 129.54906219706348

MSE: 464604.3284060094

RMSE: 681.6189026178847

R2 Square 0.15403280828148302


# Log-Price vs Price

In [60]:
log_y_train = np.log(y_train)
log_y_val = np.log(y_val)

In [77]:
history = model1.fit(
  enc_X_train, log_y_train,
  validation_data=(enc_X_val, log_y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [78]:
log_y_pred = model1.predict(enc_X_val)
mae = mean_absolute_error(log_y_val, log_y_pred)
mse = mean_squared_error(log_y_val, log_y_pred)
rmse = np.sqrt(mean_squared_error(log_y_val, log_y_pred))
r2_square = r2_score(log_y_val, log_y_pred)
print('\nMAE:', mae)
print('\nMSE:', mse)
print('\nRMSE:', rmse)
print('\nR2 Square', r2_square)


MAE: 0.4370185998041734

MSE: 1.1546995696528966

RMSE: 1.0745694810727209

R2 Square -0.5610477444052846


Seems like log-price does not perform as well as price.

# Hyperparameter Tuning: Number of Nodes
## 256 Nodes

In [79]:
model2 = Sequential([
    Input(shape=(enc_X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dense(1)
])

model2.compile(optimizer='adam', 
              loss='mae',
              metrics=['mae', 'mse'])

In [80]:
model2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 256)               13568     
                                                                 
 dense_9 (Dense)             (None, 1)                 257       
                                                                 
Total params: 13,825
Trainable params: 13,825
Non-trainable params: 0
_________________________________________________________________


In [81]:
history = model2.fit(
  enc_X_train, y_train,
  validation_data=(enc_X_val, y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [82]:
y_pred = model2.predict(enc_X_train)
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
r2_square = r2_score(y_train, y_pred)
print('\nMAE:', mae)
print('\nMSE:', mse)
print('\nRMSE:', rmse)
print('\nR2 Square', r2_square)


MAE: 117.4770989439889

MSE: 454769.5019682801

RMSE: 674.3660000091049

R2 Square 0.18037412754517101


## 512 Nodes

In [83]:
model3 = Sequential([
    Input(shape=(enc_X_train.shape[1],)),
    Dense(512, activation='relu'),
    Dense(1)
])

model3.compile(optimizer='adam', 
              loss='mae',
              metrics=['mae', 'mse'])

In [84]:
model3.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 512)               27136     
                                                                 
 dense_11 (Dense)            (None, 1)                 513       
                                                                 
Total params: 27,649
Trainable params: 27,649
Non-trainable params: 0
_________________________________________________________________


In [85]:
history = model3.fit(
  enc_X_train, y_train,
  validation_data=(enc_X_val, y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [86]:
y_pred = model3.predict(enc_X_val)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2_square = r2_score(y_val, y_pred)
print('\nMAE:', mae)
print('\nMSE:', mse)
print('\nRMSE:', rmse)
print('\nR2 Square', r2_square)


MAE: 120.41449211505119

MSE: 433733.7233754506

RMSE: 658.5846364556727

R2 Square 0.21024304449248


# Number of Layers

In [87]:
model4 = Sequential([
    Input(shape=(enc_X_train.shape[1],)),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(1)
])

model4.compile(optimizer='adam', 
              loss='mae',
              metrics=['mae', 'mse'])

In [88]:
model4.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 512)               27136     
                                                                 
 batch_normalization (BatchN  (None, 512)              2048      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_13 (Dense)            (None, 512)               262656    
                                                                 
 batch_normalization_1 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 512)              

In [89]:
history = model4.fit(
  enc_X_train, y_train,
  validation_data=(enc_X_val, y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [90]:
y_pred = model4.predict(enc_X_val)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2_square = r2_score(y_val, y_pred)
print('\nMAE:', mae)
print('\nMSE:', mse)
print('\nRMSE:', rmse)
print('\nR2 Square', r2_square)


MAE: 114.14885271704173

MSE: 327523.00311460893

RMSE: 572.29625467463

R2 Square 0.4036350971617487


# Archive

In [None]:
for column in columns_to_category:
  X_train[column] = X_train[column].astype('category')
  X_test[column] = X_test[column].astype('category')

In [None]:
enc_X_train = X_train.copy()
enc_X_test = X_test.copy()
for column in categorical_columns:
  enc_X_train[column] = enc_X_train[column].cat.codes
  enc_X_test[column] = enc_X_test[column].cat.codes

In [None]:
from sklearn.preprocessing import StandardScaler

for column in numeric_columns:
  scaler = StandardScaler()
  enc_X_train[column] = scaler.fit_transform(enc_X_train[[column]])
  enc_X_test[column] = scaler.transform(enc_X_test[[column]])

In [None]:
enc_X_train.head()

Unnamed: 0,description_sentiment,neighborhood_overview_sentiment,host_response_time,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,essentials,luxury,appliances,entertainment,security,comfort,furniture,miscellaneous,availability_30,number_of_reviews,review_scores_rating,instant_bookable,bathroom_qty,bathroom_type
0,2,0,4,0,-0.199155,1,1,-1.230307,1.124547,19,0,-0.030868,0.192831,-0.161844,0.010727,-0.783664,-0.440834,-0.385913,-1.569216,-0.800473,0.994949,0.140698,0.944069,-0.027127,27,1,1,1
1,5,0,4,0,-0.20402,1,0,-0.306263,0.122492,19,0,-0.030868,0.192831,-0.718963,-1.523132,-0.069257,-1.20968,-0.385913,-0.408914,-0.800473,-0.760963,-0.661392,1.032936,-0.52511,33,0,1,1
2,5,0,4,0,-0.20402,1,1,-1.043569,0.970018,15,0,-0.030868,0.192831,-0.161844,-0.500559,0.645151,-0.056411,-0.385913,-0.408914,-0.800473,0.116993,-0.661392,-0.47781,-0.22632,29,0,1,1
3,5,0,4,1,-0.192669,1,1,-0.36389,0.227839,17,0,-0.726006,-0.671228,-0.718963,-0.500559,-0.069257,0.328012,-0.385913,-0.408914,0.77572,-0.760963,-0.317639,-1.011014,-0.375715,32,1,1,1
4,2,0,4,1,-0.19429,1,1,-0.548818,0.538765,53,2,-1.073575,-0.671228,-0.718963,1.544587,-0.783664,-0.056411,-0.385913,-1.569216,-0.800473,0.994949,0.599035,-0.566677,-0.52511,33,0,1,4


In [None]:
enc_X_train.shape

(64440, 28)

In [None]:
from sklearn.model_selection import train_test_split

enc_X_val, enc_X_test, enc_y_val, enc_y_test = train_test_split(enc_X_test, y_test, test_size=0.5)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout, BatchNormalization

model1 = Sequential([
    Input(shape=(enc_X_train.shape[1],)),
    Dense(15, activation='relu'),
    Dense(1)
])

model1.compile(optimizer='adam', 
              loss='mae',
              metrics=['mae', 'mse'])

In [None]:
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 15)                435       
                                                                 
 dense_1 (Dense)             (None, 1)                 16        
                                                                 
Total params: 451
Trainable params: 451
Non-trainable params: 0
_________________________________________________________________


In [None]:
EPOCHS = 50
BATCH_SIZE = 256
STEPS = X_train.shape[0] // BATCH_SIZE

history = model1.fit(
  enc_X_train, y_train,
  validation_data=(enc_X_val, enc_y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
model1.evaluate(enc_X_test, enc_y_test)



[143.04856872558594, 143.04856872558594, 346189.3125]

In [None]:
model2 = Sequential([
    Input(shape=(enc_X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(1)
])

model2.compile(optimizer='adam', 
              loss='mae',
              metrics=['mae', 'mse'])

history = model2.fit(
  enc_X_train, y_train,
  validation_data=(enc_X_val, enc_y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
model3 = Sequential([
    Input(shape=(enc_X_train.shape[1],)),
    Dense(15, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(15, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(15, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(15, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(1)
])

model3.compile(optimizer='adam', 
              loss='mae',
              metrics=['mae', 'mse'])

EPOCHS=100

history = model3.fit(
  enc_X_train, y_train,
  validation_data=(enc_X_val, enc_y_val),
  steps_per_epoch = STEPS,
  batch_size = BATCH_SIZE,
  epochs=EPOCHS)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
model3.evaluate(enc_X_test, enc_y_test)



[141.98683166503906, 141.98683166503906, 350076.59375]

In [None]:
from sklearn.linear_model import LinearRegression

new_X_train = X_train[['accommodates', 'bedrooms']]
model = LinearRegression()
model.fit(new_X_train, y_train)
y_pred = model.predict(new_X_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
r2_square = r2_score(y_train, y_pred)
print('\nMAE:', mae)
print('\nMSE:', mse)
print('\nRMSE:', rmse)
print('\nR2 Square', r2_square)


MAE: 155.88386534742628

MSE: 245488.75695002434

RMSE: 495.4682199193247

R2 Square 0.16960915135652954


# Using Keras Example

In [None]:
def df_to_dataset(dataframe, target, shuffle=True, batch_size=32):
  df = dataframe.copy()
  labels = target
  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
  ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [None]:
batch_size = 5
train_ds = df_to_dataset(X_train, y_train, batch_size=batch_size)

  import sys


In [None]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of description_sentiments:', train_features['description_sentiment'])
print('A batch of targets:', label_batch )

Every feature: ['description_sentiment', 'neighborhood_overview_sentiment', 'host_response_time', 'host_is_superhost', 'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bedrooms', 'beds', 'essentials', 'luxury', 'appliances', 'entertainment', 'security', 'comfort', 'furniture', 'miscellaneous', 'availability_30', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'bathroom_qty', 'bathroom_type']
A batch of description_sentiments: tf.Tensor(
[[b'Neutral']
 [b'Neutral']
 [b'Neutral']
 [b'Neutral']
 [b'Neutral']], shape=(5, 1), dtype=string)
A batch of targets: tf.Tensor([ 50. 259.  68.  35. 201.], shape=(5,), dtype=float64)


In [None]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
  normalizer = layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [None]:
beds_col = train_features['beds']
layer = get_normalization_layer('beds', train_ds)
layer(beds_col)

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[-0.7189815],
       [-0.1618574],
       [-0.7189815],
       [-0.7189815],
       [-0.1618574]], dtype=float32)>

In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

In [None]:
test_bathroom_type_col = train_features['bathroom_type']
test_type_layer = get_category_encoding_layer(name='bathroom_type',
                                              dataset=train_ds,
                                              dtype='string')
test_type_layer(test_bathroom_type_col)

<tf.Tensor: shape=(5, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

In [None]:
batch_size = 256
train_ds = df_to_dataset(X_train, y_train, batch_size=batch_size)
test_ds = df_to_dataset(X_test, y_test, shuffle=False, batch_size=batch_size)

  import sys


In [None]:
all_inputs = []
encoded_features = []

# Numerical features.
for header in numeric_columns:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

In [None]:
for header in categorical_columns:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  encoding_layer = get_category_encoding_layer(name=header,
                                               dataset=train_ds,
                                               dtype='string')
  encoded_categorical_col = encoding_layer(categorical_col)
  all_inputs.append(categorical_col)
  encoded_features.append(encoded_categorical_col)

In [None]:
all_features = tf.keras.layers.concatenate(encoded_features)
all_features

<KerasTensor: shape=(None, 462) dtype=float32 (created by layer 'concatenate')>

In [None]:
x = tf.keras.layers.Dense(309, activation="relu")(all_features)
x = tf.keras.layers.Dense(309, activation="relu")(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.MeanSquaredError())
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 description_sentiment (InputLa  [(None, 1)]         0           []                               
 yer)                                                                                             
                                                                                                  
 neighborhood_overview_sentimen  [(None, 1)]         0           []                               
 t (InputLayer)                                                                                   
                                                                                                  
 host_response_time (InputLayer  [(None, 1)]         0           []                               
 )                                                                                            

In [None]:
history = model.fit(train_ds, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
pred = model.predict(test_ds)



In [None]:
model.evaluate(test_ds)



161709.125

In [None]:
y_test

0         334.0
1         105.0
2         329.0
3         151.0
4         180.0
          ...  
16117     116.0
16118     456.0
16119    5999.0
16120      99.0
16121     100.0
Name: price, Length: 16122, dtype: float64