In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [17]:
# Read the CSV hosted on GitHub; index_col=0 makes the file's first column the row index.
DATA_URL = 'https://raw.githubusercontent.com/bw-airbnb-team-3/data-engineering/master/grpdf.csv'
df = pd.read_csv(DATA_URL, index_col=0)

In [18]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,price,beds,bedrooms,bathrooms,zipcode,neighbourhood,property_type,room_type,accommodates,guests_included,minimum_nights,instant_bookable
0,122.0,3,2.0,2.0,90230,Culver City,Condominium,Entire home/apt,6,3,30,0
1,168.0,3,3.0,1.0,91505,Burbank,House,Entire home/apt,6,6,2,1
2,79.0,1,1.0,1.5,90046,Hollywood,Apartment,Private room,1,1,30,1
3,140.0,1,1.0,1.0,90405,Santa Monica,Apartment,Private room,1,1,1,0
4,80.0,1,1.0,1.0,90706,Bellflower,Apartment,Entire home/apt,2,1,2,0


In [19]:
# List the column labels of the loaded frame.
df.columns

Index(['price', 'beds', 'bedrooms', 'bathrooms', 'zipcode', 'neighbourhood',
       'property_type', 'room_type', 'accommodates', 'guests_included',
       'minimum_nights', 'instant_bookable'],
      dtype='object')

In [20]:
# (row count, column count) of the loaded frame.
df.shape

(40125, 12)

In [32]:
# Count the missing values in each column.
df.isna().sum()

price               0
beds                0
bedrooms            0
bathrooms           0
zipcode             0
neighbourhood       0
property_type       0
room_type           0
accommodates        0
guests_included     0
minimum_nights      0
instant_bookable    0
dtype: int64

In [25]:
# Frequency of each distinct value in the 'bedrooms' column.
df['bedrooms'].value_counts()

1.0     23235
2.0      6902
0.0      4308
3.0      3277
4.0      1434
5.0       627
6.0       193
7.0        86
8.0        30
9.0        14
10.0        9
11.0        6
12.0        3
21.0        1
Name: bedrooms, dtype: int64

In [24]:
# Replace missing bedroom counts with 0.
# `fillna` is used instead of `replace(np.NaN, 0)` because the `np.NaN` alias
# was removed in NumPy 2.0 and raises AttributeError there.
df['bedrooms'] = df['bedrooms'].fillna(0)

In [27]:
# Frequency of each distinct value in the 'bathrooms' column.
df['bathrooms'].value_counts()

1.0     26690
2.0      6102
1.5      2601
3.0      1282
2.5      1224
3.5       540
4.0       410
4.5       299
5.0       191
5.5       151
0.0       111
0.5       106
6.0       103
6.5        65
8.0        58
7.0        46
7.5        28
11.0       27
8.5        18
10.0       13
9.0        12
9.5         6
11.5        4
10.5        3
13.0        3
12.5        2
12.0        2
13.5        1
16.0        1
15.0        1
Name: bathrooms, dtype: int64

In [28]:
# Replace missing bathroom counts with 0.
# `fillna` is used instead of `replace(np.NaN, 0)` because the `np.NaN` alias
# was removed in NumPy 2.0 and raises AttributeError there.
df['bathrooms'] = df['bathrooms'].fillna(0)

In [31]:
# Label rows that have no neighbourhood value with an explicit placeholder string.
df['neighbourhood'] = df['neighbourhood'].fillna('No information given')

In [None]:
# descript_feats = ['summary', 'space', 'description']

# bool_feats = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'is_location_exact',
#               'require_guest_profile_picture', 'require_guest_phone_verification']

# date_feats = ['host_since']


In [13]:
# !pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
[K     |████████████████████████████████| 102kB 2.6MB/s ta 0:00:01
Installing collected packages: category-encoders
Successfully installed category-encoders-2.1.0


In [33]:
import category_encoders as ce
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
train, test = train_test_split(df, test_size=0.25, train_size=0.75, random_state=42)

target = "price"

# Separate the feature columns from the target column for each split.
Xtrain, ytrain = train.drop(columns=target), train[target]
Xtest, ytest = test.drop(columns=target), test[target]

# Fit the ordinal encoder on the training features only, then apply the
# fitted encoder to the test features.
encoder = ce.OrdinalEncoder()
Xtrainencoded = encoder.fit_transform(Xtrain)
Xtestencoded = encoder.transform(Xtest)

# Standardise using statistics learned from the training features only.
scaler = StandardScaler()
Xtrainscaled = scaler.fit_transform(Xtrainencoded)
Xtestscaled = scaler.transform(Xtestencoded)


In [34]:
# Print the scaled training matrix's shape, then display its first five rows.
print(Xtrainscaled.shape)
Xtrainscaled[:5]

(30093, 11)


array([[ 1.21946705,  0.48970109,  0.51819421,  0.52269842, -1.05585051,
        -0.827191  , -0.7003191 ,  0.48760161, -0.52849331, -0.37049225,
        -0.91970462],
       [ 2.43445255,  4.03214701,  2.01220803,  0.52288533, -1.02462978,
        -0.57874392, -0.7003191 ,  4.58226019,  4.98070342,  0.47467945,
         1.08730562],
       [ 0.6119743 ,  1.37531257,  1.01619881,  0.52339265, -0.99340905,
        -0.33029683, -0.7003191 ,  1.60432668,  2.22610505,  0.89726529,
         1.08730562],
       [-0.60301119, -1.28152186, -0.47781501,  0.5263298 , -0.96218833,
        -0.33029683, -0.7003191 , -0.62912346, -0.52849331,  0.94421928,
        -0.91970462],
       [-0.60301119, -0.39591039, -0.47781501, -1.88750225, -0.9309676 ,
        -0.33029683, -0.7003191 , -0.62912346, -0.52849331, -0.46440022,
        -0.91970462]])

In [35]:
# Print the training target's shape, then display its first five entries.
print(ytrain.shape)
ytrain[:5]

(30093,)


19661    1450.0
355       799.0
29974     400.0
36018      85.0
9418      115.0
Name: price, dtype: float64

In [36]:
# Convert the targets to plain NumPy arrays.
# np.asarray is used instead of `.to_numpy()` so the cell can be re-run:
# on a second run ytrain/ytest are already ndarrays, which have no
# `.to_numpy` method, whereas np.asarray returns them unchanged.
ytrain = np.asarray(ytrain)
ytest = np.asarray(ytest)

print(ytrain.shape)
ytrain[:5]

(30093,)


array([1450.,  799.,  400.,   85.,  115.])

In [37]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

In [60]:
def create_model(input_dim=11):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(128, relu) -> Dense(64, relu) -> Dense(1, linear),
    compiled with the Adam optimizer and mean absolute error as both the
    loss and the reported metric.

    Parameters
    ----------
    input_dim : int, default 11
        Number of input features. The default matches the previously
        hard-coded value, so calling ``create_model()`` with no arguments
        behaves exactly as before.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dense(64, activation='relu'))
    # Single linear unit: the network outputs one unbounded value per row.
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
    return model

# Wrap the Keras builder so scikit-learn utilities can drive it.
model = KerasRegressor(build_fn=create_model, verbose=0)

# Candidate hyperparameters: one batch size, three epoch counts.
param_grid = {
    'batch_size': [15],
    'epochs': [200, 300, 400],
}

# Run the grid search in a single process (n_jobs=1) on the scaled training data.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(Xtrainscaled, ytrain)

# Report the best configuration, then the mean/std test score of every candidate.
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")

In [61]:
# Train the final model with the hyperparameters chosen by the grid search.
# Previously epochs/batch_size were hard-coded here, so the search result was
# ignored. 20% of the training rows are held out for validation during fitting.
model.fit(Xtrainscaled, ytrain, validation_split=0.2, **grid_result.best_params_)


<tensorflow.python.keras.callbacks.History at 0x7f9e0ec32410>

In [62]:
from sklearn.metrics import mean_absolute_error

# Predict on the scaled test features and report the mean absolute error
# against the held-out targets.
predictions = model.predict(Xtestscaled)

mean_absolute_error(y_true=ytest, y_pred=predictions)

105.46802314845593