### "TensorFlow machine learning with California housing data"

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from matplotlib import pyplot as plt
%tensorflow_version 1.x
import tensorflow as tf
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from sklearn import metrics
from tensorflow.python.data import Dataset
from __future__ import print_function
import math

#### Answer the following question by providing Python code:

In [None]:
# Bring the data: load the California housing dataset through scikit-learn's
# fetch helper.
# NOTE(review): in the visible cells only the 'DESCR' entry of this object is
# used; the modelling cells read 'housing.csv' instead — confirm this is intended.
housing_data = fetch_california_housing()

In [None]:
# Read the description: print the dataset's 'DESCR' entry to review the
# attribute list and provenance before modelling.
print(housing_data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [None]:

# Load the housing table from the local comma-separated file, then show it
# as the cell's output.
df = pd.read_csv(
    'housing.csv',
    sep=",",
)
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.2,37.9,41.0,880.0,129.0,322.0,126.0,8.3,452600.0,NEAR BAY
1,-122.2,37.9,21.0,7099.0,1106.0,2401.0,1138.0,8.3,358500.0,NEAR BAY
2,-122.2,37.9,52.0,1467.0,190.0,496.0,177.0,7.3,352100.0,NEAR BAY
3,-122.2,37.9,52.0,1274.0,235.0,558.0,219.0,5.6,341300.0,NEAR BAY
4,-122.2,37.9,52.0,1627.0,280.0,565.0,259.0,3.8,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.1,39.5,25.0,1665.0,374.0,845.0,330.0,1.6,78100.0,INLAND
20636,-121.2,39.5,18.0,697.0,150.0,356.0,114.0,2.6,77100.0,INLAND
20637,-121.2,39.4,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.3,39.4,18.0,1860.0,409.0,741.0,349.0,1.9,84700.0,INLAND


In [None]:
# Preprocessing: impute missing total_bedrooms values with the column median.
# The filled Series is assigned back to the frame rather than calling
# fillna(..., inplace=True) on the selected column: the in-place form is a
# chained assignment, which recent pandas versions warn about and which leaves
# the frame unchanged once Copy-on-Write is enabled.
df["total_bedrooms"] = df["total_bedrooms"].fillna(df["total_bedrooms"].median())

# Confirm that no column has missing values left.
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, to get a feel for the scale of each feature and of the target.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.6,35.6,28.6,2635.8,536.8,1425.5,499.5,3.9,206855.8
std,2.0,2.1,12.6,2181.6,419.4,1132.5,382.3,1.9,115395.6
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,14999.0
25%,-121.8,33.9,18.0,1447.8,297.0,787.0,280.0,2.6,119600.0
50%,-118.5,34.3,29.0,2127.0,435.0,1166.0,409.0,3.5,179700.0
75%,-118.0,37.7,37.0,3148.0,643.2,1725.0,605.0,4.7,264725.0
max,-114.3,42.0,52.0,39320.0,6445.0,35682.0,6082.0,15.0,500001.0


2). Build a machine learning model with TensorFlow. 
- Preprocess the data if necessary.
- Build a linear regression model.
- Train the model.
- Calculate the error metrics such as MSE and RMSE (in-sample and out-of-sample). Target: RMSE < 1.

In [None]:
# Define the input feature: a single-column DataFrame holding total_rooms.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# because the training and prediction cells refer to it by this name.
input = df[["total_rooms"]]

In [None]:
# Configure a numeric feature column for total_rooms. The key is the same
# string as the DataFrame column name, which is also the key my_input_fn uses
# when it turns the features into a dict.
feature_columns = [tf.feature_column.numeric_column("total_rooms")]

In [None]:
# Define the target: the median_house_value column is the label to predict.
targets = df["median_house_value"]

In [None]:
# Gradient descent optimizer with a very small learning rate, wrapped so that
# gradients are clipped to a maximum norm of 5.0 before being applied.
optim = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=0.0000001),
    5.0,
)

In [None]:
# Build the linear regression estimator from the configured feature column
# and the clipped gradient-descent optimizer.
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=optim,
)

In [None]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build a tf.data input pipeline and return its next-batch tensors.

    Args:
      features: Mapping-like object (e.g. a pandas DataFrame) whose items are
        feature name -> column of values.
      targets: Array-like of labels, sliced in step with `features`.
      batch_size: Number of examples per batch.
      shuffle: Whether to shuffle the examples.
      num_epochs: Number of passes over the data, passed straight to
        `Dataset.repeat`; None repeats indefinitely.

    Returns:
      A `(features, labels)` tuple produced by the dataset's one-shot
      iterator, suitable for use as an estimator `input_fn` result.
    """
    # Convert every feature column into a NumPy array keyed by its name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    ds = Dataset.from_tensor_slices((features, targets))

    # Shuffle individual examples *before* batching and repeating. Shuffling
    # after batch()/repeat() would only reorder whole batches and would mix
    # elements from different epochs together.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)

    ds = ds.batch(batch_size).repeat(num_epochs)

    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [None]:
# Train for 100 steps, drawing batches from my_input_fn with its default
# batch size, shuffling and repetition settings.
linear_regressor.train(
    input_fn=lambda: my_input_fn(input, targets),
    steps=100,
)

In [None]:
# Making predictions: one unshuffled pass over the data so that the predicted
# values line up row-for-row with `targets`.
prediction_input_fn = lambda: my_input_fn(input, targets, num_epochs=1, shuffle=False)
predictions = np.array([
    item['predictions'][0]
    for item in linear_regressor.predict(input_fn=prediction_input_fn)
])

In [None]:
# Mean Squared Error and Root Mean Squared Error.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
# ground-truth targets go first and the model predictions second.
# NOTE(review): these are in-sample figures — the model is trained and
# evaluated on the same rows; no hold-out split is made in the visible cells.
mean_squared_error = metrics.mean_squared_error(targets, predictions)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Mean Squared Error : %0.3f" % mean_squared_error)
print("Root Mean Squared Error: %0.3f" % root_mean_squared_error)

Mean Squared Error : 56104716191.571
Root Mean Squared Error: 236864.341
