## Linear Regressor (real data)

### We have to predict price!

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
plt.style.use("seaborn-colorblind")
from sklearn.model_selection import train_test_split

In [14]:
# Columns to read from listings.csv: the predictors plus the 'price' column
# that is later turned into the regression target.
used_features = [
    'property_type',
    'room_type',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'accommodates',
    'host_total_listings_count',
    'number_of_reviews',
    'review_scores_value',
    'neighbourhood_cleansed',
    'cleaning_fee',
    'minimum_nights',
    'security_deposit',
    'host_is_superhost',
    'instant_bookable',
    'price',
]

In [15]:
# Load the actual listings data, restricted to the columns named in used_features.
boston = pd.read_csv(filepath_or_buffer='listings.csv', usecols=used_features)
print(boston.shape)
boston.head(n=7)

(3585, 17)


Unnamed: 0,host_is_superhost,host_total_listings_count,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,security_deposit,cleaning_fee,minimum_nights,number_of_reviews,review_scores_value,instant_bookable
0,f,1,Roslindale,House,Entire home/apt,4,1.5,2.0,3.0,Real Bed,$250.00,,$35.00,2,0,,f
1,f,1,Roslindale,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$65.00,$95.00,$10.00,2,36,9.0,t
2,t,1,Roslindale,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$65.00,,,3,41,10.0,f
3,f,1,Roslindale,House,Private room,4,1.0,1.0,2.0,Real Bed,$75.00,$100.00,$50.00,1,1,10.0,f
4,t,1,Roslindale,House,Private room,2,1.5,1.0,2.0,Real Bed,$79.00,,$15.00,2,29,10.0,f
5,t,2,Roslindale,Condominium,Private room,2,1.0,1.0,1.0,Real Bed,$75.00,,$30.00,2,8,10.0,f
6,f,5,Roslindale,Apartment,Entire home/apt,3,1.0,1.0,2.0,Real Bed,$100.00,,,1,57,9.0,f


### Remove the dollar sign from prices with a lambda function

In [16]:
# Convert the currency columns from strings such as "$250.00" to floats and
# fill missing values with the column median.
# NOTE(review): the median is computed on the whole frame, before the
# train/test split further down, so held-out rows influence the imputed
# value -- confirm this is acceptable.
for feature in ["cleaning_fee", "security_deposit", "price"]:
    column = boston[feature]
    # Parse only while the column still holds raw strings, so that re-running
    # this cell after the conversion does not fail calling .replace on floats.
    if not pd.api.types.is_numeric_dtype(column):
        column = column.map(lambda x: x.replace("$", '').replace(",", ''), na_action='ignore').astype(float)
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: that form is chained in-place assignment and is not
    # guaranteed to update the parent frame.
    boston[feature] = column.fillna(column.median())

### Fill in NAs

In [17]:
# Fill missing values in these numeric columns with the column median.
# The filled series is assigned back to the frame; fillna(inplace=True) on a
# column selection is chained in-place assignment and is not guaranteed to
# update the parent frame.
for feature in ["bathrooms", "bedrooms", "beds", "review_scores_value"]:
    boston[feature] = boston[feature].fillna(boston[feature].median())

In [18]:
# Missing property types are replaced with the literal 'Apartment'.
# Assigned back rather than fillna(inplace=True) on a column selection, which
# is not guaranteed to update the parent frame.
boston['property_type'] = boston['property_type'].fillna('Apartment')

### Response variable

In [19]:
# Keep only listings priced strictly between 50 and 500, then use the natural
# log of the price as the regression target.
boston = boston.loc[boston["price"].gt(50) & boston["price"].lt(500)]
target = np.log(boston["price"])

### Predictors

In [21]:
# Predictor frame: every loaded column except the price.
features = boston.drop(columns=['price'])
features.head()  # the currency columns no longer carry a $ sign

Unnamed: 0,host_is_superhost,host_total_listings_count,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,security_deposit,cleaning_fee,minimum_nights,number_of_reviews,review_scores_value,instant_bookable
0,f,1,Roslindale,House,Entire home/apt,4,1.5,2.0,3.0,Real Bed,250.0,35.0,2,0,9.0,f
1,f,1,Roslindale,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,95.0,10.0,2,36,9.0,t
2,t,1,Roslindale,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,250.0,50.0,3,41,10.0,f
3,f,1,Roslindale,House,Private room,4,1.0,1.0,2.0,Real Bed,100.0,50.0,1,1,10.0,f
4,t,1,Roslindale,House,Private room,2,1.5,1.0,2.0,Real Bed,250.0,15.0,2,29,10.0,f


In [22]:
# Split predictors and target, holding out 25% of the rows for testing;
# random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.25,
)

In [23]:
# Names of the numeric predictor columns.
numeric_columns = [
    'host_total_listings_count',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'security_deposit',
    'cleaning_fee',
    'minimum_nights',
    'number_of_reviews',
    'review_scores_value',
]

In [24]:
# Names of the categorical predictor columns, i.e. those holding strings.
categorical_columns = [
    'host_is_superhost',
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'bed_type',
    'instant_bookable',
]

### Convert numerical data to feature columns

In [25]:
# Build one TensorFlow numeric feature column per name in numeric_columns,
# then print the first one as a sanity check.
numeric_features = [
    tf.feature_column.numeric_column(key=name)
    for name in numeric_columns
]
print(numeric_features[0])

_NumericColumn(key='host_total_listings_count', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)


### Convert categorical data to feature columns

In [28]:
# Build one vocabulary-list categorical column per name in categorical_columns.
# Each vocabulary is the set of distinct values found in that column of
# `features` (the full predictor frame, not only the training split).
categorical_features = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        vocabulary_list=features[name].unique(),
        key=name,
    )
    for name in categorical_columns
]

In [29]:
# All feature columns handed to the linear model: numeric first, then categorical.
linear_features = [*numeric_features, *categorical_features]

### Create the training input function (x = predictors, y = response)

In [31]:
# Training input function: shuffled batches of 32 rows from the training split.
# num_epochs is None here; the amount of training is set by the `steps`
# argument passed to train().
training_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    num_epochs=None,
    shuffle=True,
    batch_size=32,
)

In [32]:
# Testing input function: one unshuffled pass over the test split in
# batches of 32 rows.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    num_epochs=1,
    shuffle=False,
    batch_size=32,
)

### Linear regressor estimator

In [33]:
# Linear regression estimator over the combined feature columns. Checkpoints
# are written under the "linear_regressor" directory.
# NOTE(review): an existing checkpoint in that directory is presumably picked
# up on a re-run -- confirm, and clear the directory if a fresh fit is wanted.
linear_regressor = tf.estimator.LinearRegressor(
    model_dir="linear_regressor",
    feature_columns=linear_features,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_regressor', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001A5A3A9E358>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [34]:
# Fit the model for 2000 steps on batches from the training input function.
linear_regressor.train(
    steps=2000,
    input_fn=training_input_fn,
)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into linear_regressor\model.ckpt.
INFO:tensorflow:loss = 803.6519, step = 1
INFO:tensorflow:global_step/sec: 281.65
INFO:tensorflow:loss = 21.72494, step = 101 (0.358 sec)
INFO:tensorflow:global_step/sec: 596.83
INFO:tensorflow:loss = 27.212425, step = 201 (0.168 sec)
INFO:tensorflow:global_step/sec: 572.956
INFO:tensorflow:loss = 6.1725473, step = 301 (0.174 sec)
INFO:tensorflow:global_step/sec: 579.521
INFO:tensorflow:loss = 5.5804176, step = 401 (0.175 sec)
INFO:ten

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x1a5a3a9e940>

### evaluate on testing data

In [39]:
# Score the trained model on the test split; the returned metrics dict is
# displayed as the cell output.
linear_regressor.evaluate(
    input_fn=eval_input_fn,
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-17-20:36:31
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_regressor\model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-17-20:36:31
INFO:tensorflow:Saving dict for global step 2000: average_loss = 0.14607833, global_step = 2000, label/mean = 4.9801445, loss = 4.595849, prediction/mean = 4.912559
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: linear_regressor\model.ckpt-2000


{'average_loss': 0.14607833,
 'label/mean': 4.9801445,
 'loss': 4.595849,
 'prediction/mean': 4.912559,
 'global_step': 2000}

### predict prices

In [43]:
# Predict on the testing data and keep the first element of each result's
# 'predictions' entry. The model was fit on log(price), so these are log-prices.
pred = [
    result['predictions'][0]
    for result in linear_regressor.predict(input_fn=eval_input_fn)
]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_regressor\model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [44]:
# Exponentiate the predictions to undo the log applied to the target.
prices = np.exp(np.asarray(pred))
print(prices)

[ 81.930855  85.52491   91.36684  127.816414 126.743454 199.11696
 185.19173  340.20566   83.295876  74.64323  245.02878   78.68419
 246.14174  313.14203  142.32222  238.76483   87.48643  160.66743
 230.47704   65.19992  171.34329  145.12602   72.522194  55.92858
 121.525276  85.6732    84.205475 205.68658  178.85602   98.09506
 166.44803   70.54665   89.345665 131.22133   80.33851  104.59043
 159.52928   80.60765  200.5039   240.04774  180.5203   196.22249
 117.72474  102.63856  140.61818   56.119427  75.35565  199.11192
 105.17593  108.608055 101.75667  178.9305   175.9917   205.91965
 214.26411   41.56269  111.47384   75.03964  204.20244   77.979225
 156.66266   64.61817  226.15784  266.96262  325.49915  145.12602
 115.20181  362.0455   113.6179    89.60071  187.8493   128.48425
  80.51158  102.310585  82.50418  162.16316  161.8963    97.773445
 183.96588  189.17444   81.193436 144.25215  253.51363   81.548225
 149.66136  101.90117  184.84229  137.49753  144.828    282.69812
  86.33