In [1]:
import pandas as pd
import xgboost as xgb

In [2]:
data = pd.read_csv('./data/listings_summary.csv')

In [3]:
# Credit to https://www.kaggle.com/jrw2200/smart-pricing-with-xgb-rfr-interpretations
null_vals = data.isnull()
print(null_vals.sum().sort_values(ascending=False))

xl_picture_url                      22552
jurisdiction_names                  22552
thumbnail_url                       22552
medium_url                          22552
host_acceptance_rate                22552
square_feet                         22106
license                             20914
monthly_price                       19893
weekly_price                        18871
notes                               15337
host_response_rate                  12895
host_response_time                  12894
interaction                         12146
access                              11715
host_about                          11363
house_rules                         11103
neighborhood_overview               11012
transit                              9516
security_deposit                     9361
space                                8532
cleaning_fee                         7146
host_neighbourhood                   5094
review_scores_value                  4435
review_scores_checkin             

In [56]:

# Initial dataset to test xgboost
dataset = data[[
    'price',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'square_feet',
    'guests_included',
    'extra_people',
    'availability_30',
    'availability_60',
    'availability_90',
    'instant_bookable',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed'
]]

In [57]:
# Credit to https://stackoverflow.com/questions/32464280/converting-currency-with-to-numbers-in-python-pandas
# Convert price to numerical data type
dataset['price'] = data['price'].replace('[\$,]', '', regex=True).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [58]:

# Convert square feet values of 0 to nulls
count = 0
zero_indices = []
for i, row in dataset.iterrows():
    if row['square_feet'] == 0:
        zero_indices.append(i)
        count += 1
        
for i in zero_indices:
    dataset['square_feet'][i] = None
    
null_vals = dataset.isnull()
print(null_vals.sum().sort_values(ascending=False))
print(count)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


square_feet                     22229
beds                               40
bathrooms                          32
bedrooms                           18
latitude                            0
longitude                           0
property_type                       0
room_type                           0
accommodates                        0
neighbourhood_group_cleansed        0
neighbourhood_cleansed              0
guests_included                     0
extra_people                        0
availability_30                     0
availability_60                     0
availability_90                     0
instant_bookable                    0
price                               0
dtype: int64
123


In [59]:
label = dataset['price']
features = dataset.drop('price', axis=1)
print(features.dtypes)

latitude                        float64
longitude                       float64
property_type                    object
room_type                        object
accommodates                      int64
bathrooms                       float64
bedrooms                        float64
beds                            float64
square_feet                     float64
guests_included                   int64
extra_people                     object
availability_30                   int64
availability_60                   int64
availability_90                   int64
instant_bookable                 object
neighbourhood_cleansed           object
neighbourhood_group_cleansed     object
dtype: object


In [60]:
# Use the label encoder to convert categorical data to numerical data
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for feature in features:
    if features[feature].dtype == 'object':
        # .astype(str) Credit to https://stackoverflow.com/questions/46406720/labelencoder-typeerror-not-supported-between-instances-of-float-and-str
        features[feature] = encoder.fit_transform(features[feature].astype(str))
print(features.dtypes)

object
True
latitude                        float64
longitude                       float64
property_type                     int32
room_type                         int32
accommodates                      int64
bathrooms                       float64
bedrooms                        float64
beds                            float64
square_feet                     float64
guests_included                   int64
extra_people                      int32
availability_30                   int64
availability_60                   int64
availability_90                   int64
instant_bookable                  int32
neighbourhood_cleansed            int32
neighbourhood_group_cleansed      int32
dtype: object


In [61]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split
features_train, features_test, label_train, label_test = train_test_split(features, label, train_size=0.8, test_size=0.2)

In [62]:
# xgboost matrices : https://xgboost.readthedocs.io/en/latest/python/python_intro.html
training_set = xgb.DMatrix(features_train, label=label_train)
testing_set = xgb.DMatrix(features_test, label=label_test)

In [63]:
# Standard parameters
# link to docs: https://xgboost.readthedocs.io/en/latest/parameter.html

params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.3
}

evals = [(testing_set, 'eval'), (training_set, 'train')]

rounds = 10

In [64]:
# Create the training model
model = xgb.train(params, training_set, rounds, evals)

[0]	eval-rmse:198.95004	train-rmse:181.07890
[1]	eval-rmse:174.33640	train-rmse:151.59882
[2]	eval-rmse:161.06218	train-rmse:136.79370
[3]	eval-rmse:153.39098	train-rmse:123.66845
[4]	eval-rmse:149.49338	train-rmse:113.50694
[5]	eval-rmse:146.47311	train-rmse:108.64426
[6]	eval-rmse:145.16409	train-rmse:105.53800
[7]	eval-rmse:145.28490	train-rmse:94.04858
[8]	eval-rmse:145.16605	train-rmse:91.08363
[9]	eval-rmse:145.33783	train-rmse:88.56729


In [65]:
import statistics
# Test the model
predictions = model.predict(testing_set)
labels = testing_set.get_label()

print(labels)
print(predictions)

# Measure the difference between testing data and results for accuracy
diffs = labels - predictions
distance = []
for diff in diffs:
    distance.append(abs(diff))

print(sum(distance)/len(distance))
print(statistics.median(distance))

[ 39.  49. 100. ...  25.  19.  40.]
[63.315113 37.450256 91.10319  ... 39.082695 31.891462 62.33912 ]
25.41834570788576
12.314811706542969


[ 65.  35.  60. ... 100.  40.  45.]
[60.043514 25.374012 86.96003  ... 94.99803  43.850475 31.784338]


[  4.9564857   9.625988  -26.96003   ...   5.0019684  -3.8504753
  13.215662 ]


[  4.9564857   9.625988  -26.96003   ...   5.0019684  -3.8504753
  13.215662 ]
[4.9564857, 9.625988, 26.96003, 22.85477, 50.828773, 26.416775, 6.0341988, 0.374012, 12.784338, 17.85477, 7.872307, 14.723515, 11.231461, 129.77538, 35.61106, 9.925083, 2.6050835, 43.866776, 38.276485, 19.567066, 3.9773903, 3.7938232, 15.206177, 74.56763, 14.605083, 21.681969, 38.000122, 20.240318, 12.854771, 186.89331, 5.567066, 45.57711, 18.215662, 10.899796, 6.7235146, 12.074917, 11.074917, 58.89263, 6.074917, 40.276485, 10.034199, 56.39956, 2.7255783, 20.899796, 2.4914017, 23.825226, 25.520905, 23.825226, 39.953003, 25.240318, 4.074917, 16.784338, 1.9250832, 25.998032, 9.074917, 47.479095, 8.683662, 21.231823, 10.826847, 3.768177, 9.074917, 3.2764854, 9.074917, 9.173153, 1.2764854, 162.24695, 9.840904, 15.394917, 49.713196, 17.074917, 21.185806, 19.07956, 8.768539, 3.215662, 63.463753, 13.8667755, 30.001968, 29.725578, 25.14523, 35.33584, 8.215662, 5.0399704, 30.001968, 27.717827, 14.71587, 6.1576767, 8.