# Logistic Regression with TensorFlow2

**Objectives:**
  * Define the median house value predictor (from the preceding exercises) as a binary classification model

The [data](https://developers.google.com/machine-learning/crash-course/california-housing-data-description) is based on 1990 census data from California.

## Setup
In this first cell, we'll load the necessary libraries.

In [1]:
import math
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
if tf.__version__[0] == "1":
    !pip install tensorflow==2.0.0-alpha0
print(tf.__version__)

2.0.0-alpha0


In [40]:
data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")
data.shape

(17000, 9)

In [41]:
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [42]:
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Preprocess

In [0]:
data["rooms_per_person"] = data["total_rooms"] / data["population"]
data["median_house_value_is_high"] = (
    data["median_house_value"] > 265000).astype(float)

## Split dataset

In [46]:
data = data.sample(frac=1).reset_index(drop=True)
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person,median_house_value_is_high
0,-121.9,37.28,34.0,4613.0,749.0,2050.0,725.0,5.3922,302900.0,2.250244,1.0
1,-120.57,35.11,18.0,2920.0,556.0,1068.0,552.0,3.5242,156800.0,2.734082,0.0
2,-117.26,34.15,33.0,2271.0,389.0,1100.0,380.0,3.5978,88300.0,2.064545,0.0
3,-118.18,33.98,24.0,1880.0,642.0,2646.0,605.0,2.1836,162000.0,0.710506,0.0
4,-118.39,33.97,44.0,1097.0,186.0,513.0,185.0,6.235,361400.0,2.138402,1.0


In [47]:
train_df = data.loc[:12000, :]
train_df.shape

(12001, 11)

In [48]:
validation_df = data.loc[12000:, :]
validation_df.shape

(5000, 11)

In [0]:
CATEGORICAL_COLUMNS = []
NUMERIC_COLUMNS = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "rooms_per_person"
]

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
  vocabulary = train_df[feature_name].unique()
  feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
  feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

In [0]:
def make_input_fn(data_df, label_df, num_epochs, shuffle, batch_size=32):
  def input_function():
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      ds = ds.shuffle(1000)
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds
  return input_function

## Training

In [0]:
train_targets = train_df.loc[:, "median_house_value_is_high"]
train_features = train_df.drop(["median_house_value", "median_house_value_is_high"], axis=1)
train_input_fn = make_input_fn(train_features, train_targets, num_epochs=10, shuffle=True)

In [52]:
linear_classif = tf.estimator.LinearClassifier(feature_columns=feature_columns)
linear_classif.train(train_input_fn, steps=100)
result = linear_classif.evaluate(train_input_fn)
print(result)

W0503 22:16:41.479200 140621165668224 estimator.py:1799] Using temporary folder as model directory: /tmp/tmpupu_xa2v
Exception ignored in: <generator object EstimatorV2.predict at 0x7fe4acf37a98>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 640, in predict
    for key, value in six.iteritems(preds_evaluated)
  File "/usr/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 5486, in get_controller
    yield g
  File "/usr/lib/python3.6/contextlib.py", line 99, in __exit__
    self.gen.throw(type, value, traceback)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 5301, in get_controller
    type(default))
AssertionError: Nesting violated for default stack of <class 'tensorflow.python.framework.ops.Graph'> objects


{'accuracy': 0.38946754, 'accuracy_baseline': 0.74910426, 'auc': 0.599224, 'auc_precision_recall': 0.29563883, 'average_loss': 32.82614, 'label/mean': 0.25089577, 'loss': 32.814045, 'precision': 0.287849, 'prediction/mean': 0.84776974, 'recall': 0.9724344, 'global_step': 100}


In [53]:
train_predictions = linear_classif.predict(make_input_fn(
    train_features, 
    train_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
train_probs = [i["probabilities"][1] for i in train_predictions]
roc_area = metrics.roc_auc_score(train_targets, train_probs)
print("Area under ROC: {}".format(roc_area))

Area under ROC: 0.6367372286044977


## Validation

In [56]:
validation_targets = validation_df.loc[:, "median_house_value_is_high"]
validation_features = validation_df.drop(["median_house_value", "median_house_value_is_high"], axis=1)
validation_predictions = linear_classif.predict(make_input_fn(
    validation_features, 
    validation_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
validation_probs = [i["probabilities"][1] for i in validation_predictions]
roc_area = metrics.roc_auc_score(validation_targets, validation_probs)
print("Area under ROC: {}".format(roc_area))

Area under ROC: 0.6498431744787099


## Testing

In [57]:
test_data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_test.csv", sep=",")
test_data.shape

(3000, 9)

In [58]:
test_data["rooms_per_person"] = test_data["total_rooms"] / test_data["population"]
test_data["median_house_value_is_high"] = (
    test_data["median_house_value"] > 265000).astype(float)
test_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_person,median_house_value_is_high
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-119.5892,35.63539,28.845333,2599.578667,529.950667,1402.798667,489.912,3.807272,205846.275,1.96632,0.248667
std,1.994936,2.12967,12.555396,2155.593332,415.654368,1030.543012,365.42271,1.854512,113119.68747,1.09993,0.432312
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22500.0,0.035955,0.0
25%,-121.81,33.93,18.0,1401.0,291.0,780.0,273.0,2.544,121200.0,1.513472,0.0
50%,-118.485,34.27,29.0,2106.0,437.0,1155.0,409.5,3.48715,177650.0,1.922678,0.0
75%,-118.02,37.69,37.0,3129.0,636.0,1742.75,597.25,4.656475,263975.0,2.294729,0.0
max,-114.49,41.92,52.0,30450.0,5419.0,11935.0,4930.0,15.0001,500001.0,33.843373,1.0


In [59]:
test_targets = test_data.loc[:, "median_house_value_is_high"]
test_features = test_data.drop(["median_house_value", "median_house_value_is_high"], axis=1)
test_predictions = linear_classif.predict(make_input_fn(
    test_features, 
    test_targets, 
    num_epochs=1, 
    shuffle=False)) # generator
test_probs = [i["probabilities"][1] for i in test_predictions]
roc_area = metrics.roc_auc_score(test_targets, test_probs)
print("Area under ROC: {}".format(roc_area))

Area under ROC: 0.6448075033720215
