# Median House Price Prediction using Census Data of California

### Importing Libraries

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import shutil

In [2]:
tf.logging.set_verbosity(tf.logging.INFO)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

### Reading Data

In [3]:
data_df = pd.read_csv("https://storage.googleapis.com/ml_universities/california_housing_train.csv", sep=',')

In [4]:
data_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66900.0
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80100.0
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85700.0
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73400.0
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65500.0


In [5]:
data_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207300.9
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,115983.8
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,14999.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119400.0
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180400.0
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265000.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500001.0


### Splitting into train and test data

In [14]:
np.random.seed(1234)
train_records = np.random.rand(len(data_df)) < 0.8
train_df = data_df[train_records]
test_df = data_df[~train_records]
print(train_df.shape)
print(test_df.shape)

(13628, 9)
(3372, 9)


### Adding extra features

In [15]:
def add_new_features(df):
    df['num_rooms'] = df['total_rooms']/df['households']
    return(df)

### Input Function

In [16]:
def make_input_function(df, num_epochs):
    return tf.estimator.inputs.pandas_input_fn(batch_size=128,
                                        num_epochs=num_epochs,
                                        x=add_new_features(df),
                                        y=df['median_house_value'],
                                        shuffle=True,
                                        num_threads=1,
                                        queue_capacity=1000)

### Creating Feature Columns

In [20]:
def create_features():
    return [
        tf.feature_column.numeric_column('housing_median_age'),
        tf.feature_column.bucketized_column(tf.feature_column.numeric_column('latitude'), boundaries = np.arange(32.0, 42, 1).tolist()),
        tf.feature_column.numeric_column('num_rooms'),
        tf.feature_column.numeric_column('median_income')
    ]

### Creating Regressor

In [33]:
def train_and_evaluate(output_dir, num_train_steps):
    estimator = tf.estimator.LinearRegressor(feature_columns=create_features(), model_dir=output_dir)
    train_spec = tf.estimator.TrainSpec(input_fn = make_input_function(train_df, 8),
                                       max_steps = num_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn = make_input_function(test_df, 1), 
                                    steps = None, 
                                    start_delay_secs = 1, # start evaluating after N seconds, 
                                    throttle_secs = 10) 
    tf.estimator.train_and_evaluate(estimator=estimator,train_spec=train_spec,eval_spec=eval_spec)

In [34]:
OUTDIR = '../Output'
shutil.rmtree(OUTDIR, ignore_errors = True)
train_and_evaluate(OUTDIR, 5000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4d1d32bd90>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '../Output', '_save_summary_steps': 100}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 10 secs (eval_spec.throttle_secs) or training is finished.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into ../Output/model.ckpt.
INFO:tensorflow:loss = 6.14635e+12, step = 1
INFO:tensorflow:global_step/sec: 402.912
INFO:tensorflow:loss = 8.19713e+12, step = 101 (0.250 sec)
INFO:tensorflow:global_step/sec: 429.173
INFO:tensorflow:loss = 6.89695e+12, step = 201 (0.233 sec)
INFO:tensorflow:global_step/sec: 451.522
INFO:tensorflow:loss = 6.7968e+12, step = 301 (0.221 sec)
INFO:tensorflow:global_step/sec: 466.276
INFO:tensorflow:loss = 4.22282e+12, step = 401 (0.214 sec)
INFO:tensorflow:global_step/sec: 465.307
INFO:tensorflow:loss = 5.32459e+12, step = 501 (0.215 sec)
INFO:tensorflow:global_step/sec: 460.45
INFO:tensorflow:loss = 1.1303e+13, step = 601 (0.223 sec)
INFO:tensorflow:global_step/sec: 464.46
INFO:tensorflow:loss = 9.88629e+12, step = 701 (0.210 sec)
INFO:tensorflow:global_step/sec: 469.799
INFO:tensorflow:loss = 1.16774e+13, step = 801 (0.213 sec)
INFO:tensorflow:Saving checkpoints for 852 into

INFO:tensorflow:Loss for final step: 4.61929e+12.
INFO:tensorflow:Starting evaluation at 2018-06-24-15:13:37
INFO:tensorflow:Restoring parameters from ../Output/model.ckpt-5000
INFO:tensorflow:Finished evaluation at 2018-06-24-15:13:37
INFO:tensorflow:Saving dict for global step 5000: average_loss = 5.45271e+10, global_step = 5000, loss = 6.80983e+12
