In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Project constants
LEARN_SIZE = 50  # number of leading dataset rows kept for the evaluation
POLY_DEGREE = 2  # exponent applied by the polynomial feature transformer
LEARNING_RATE = 0.001  # step size handed to the gradient descent optimizer
DISPLAY_STEP = 200  # training progress is printed every DISPLAY_STEP steps
NUM_STEPS = 3000  # number of optimizer steps per regression run

In [None]:
# Immediate execution
# Switch TensorFlow to eager mode; this has to run before any other TensorFlow call.
tf.enable_eager_execution()
# Short alias for the eager helpers (Variable, implicit_gradients) used further down.
tfe = tf.contrib.eager

# Load dataset

The data comes from the Google Machine Learning Crash Course.

In [None]:
# Download the California housing training set (comma-separated values).
housing_df = pd.read_csv(
    "https://storage.googleapis.com/ml_universities/california_housing_train.csv",
    sep=",",
)

Let's look over the dataset

In [None]:
# Preview the first rows of the dataset.
housing_df.head()

In [None]:
# Summary statistics of the dataset columns.
housing_df.describe()

It also seems reasonable to evaluate the dataset features. The dataset is very large, so for evaluation purposes we limit it to the first few rows.

In [None]:
# Keep only the first LEARN_SIZE rows. The explicit copy makes housing_df an
# independent frame, so the column assignments and the in-place dropna that
# follow do not operate on a slice of the full dataset (which makes pandas
# emit SettingWithCopyWarning).
housing_df = housing_df.iloc[:LEARN_SIZE].copy()

It is reasonable to start with a data check. Is all the data numerical? Does it contain missing values? Pandas offers a handy way to check.

In [None]:
# Report the frame size before any cleaning.
print('Shape before missing and corrupt values check: ', housing_df.shape)

# Coerce every column to a numeric dtype; entries that cannot be parsed become NaN.
for column_name in housing_df.columns:
    converted = pd.to_numeric(housing_df[column_name], errors='coerce')
    housing_df[column_name] = converted

# Drop every row holding a NaN, whether it was missing originally or produced
# by the coercion above.
housing_df.dropna(axis=0, how='any', inplace=True)

print('Shape after missing and corrupt values check: ', housing_df.shape)

So, all values are appropriate.

We can continue with the Tensorflow data processing now. We may prepare all source dataset columns (numeric) as Feature Columns.

In [None]:
# Column labels in DataFrame order; the three lists below share this order.
column_names = list(housing_df)

# One numeric feature column per DataFrame column.
feature_columns = [tf.feature_column.numeric_column(name) for name in column_names]

# One single-entry feature dict per column, mapping the column name to its values as a tensor.
features = [{name: tf.convert_to_tensor(housing_df[name])} for name in column_names]

# Input-layer tensor for each (feature dict, feature column) pair.
nets = [
    tf.feature_column.input_layer(feature, column)
    for feature, column in zip(features, feature_columns)
]

Some numeric values perform better in categorical form. For example, we can convert the 'housing_median_age' field into a bucketized form. This operation yields a one-hot-encoded matrix in which each value is assigned to one of the predefined bins.

In [None]:
# Position of the age column; features and feature_columns follow the DataFrame column order.
age_index = list(housing_df.columns).index('housing_median_age')

# Four boundaries split the age values into five bins.
age_boundaries = [10, 20, 30, 40]
bucketized_column = tf.feature_column.bucketized_column(
    source_column=feature_columns[age_index],
    boundaries=age_boundaries,
)

# One-hot encoded bin membership for every row.
net_age_bucket = tf.feature_column.input_layer(features[age_index], bucketized_column)
print(net_age_bucket)

A more condensed representation will also help in our further calculations. Decoding the one-hot matrix gives a single flat category per row (the index of the bin the value falls into).

In [None]:
# Index of the active bin in each one-hot row.
active_bin = tf.argmax(net_age_bucket, axis=1)
# Cast to float32 and reshape into a single column.
bucket_index = tf.reshape(tf.cast(active_bin, tf.float32), [-1, 1])
print(bucket_index)

TensorFlow lets us transform feature data directly while it is being read. For example, we can obtain log-transformed data right away.

In [None]:
def log_transformer(x):
    """Return the element-wise natural logarithm of x as a float32 tensor."""
    log_values = tf.log(x)
    return tf.cast(log_values, dtype=tf.float32)


# Numeric column that applies the log transform when the feature is read.
log_feature_column_age = tf.feature_column.numeric_column(
    'housing_median_age',
    normalizer_fn=log_transformer,
)
net_age_log = tf.feature_column.input_layer(features[age_index], log_feature_column_age)
net_age_log

Data conversion may include a popular polynomial conversion. We will get the polynomially transformed (squared) data in the next cell.

In [None]:
def poly_transformer(x):
    """Raise x element-wise to the power POLY_DEGREE and return it as float32."""
    powered = tf.pow(x, POLY_DEGREE)
    return tf.cast(powered, dtype=tf.float32)


# Numeric column that applies the polynomial transform when the feature is read.
poly_feature_column_age = tf.feature_column.numeric_column(
    'housing_median_age',
    normalizer_fn=poly_transformer,
)
net_age_poly = tf.feature_column.input_layer(features[age_index], poly_feature_column_age)
net_age_poly

Normalization is one of the most popular data transformation methods. TensorFlow makes it easy to prepare a feature column with zero mean and unit standard deviation.

In [None]:
# Mean and standard deviation of the age column, as computed by pandas.
age_series = housing_df['housing_median_age']
val_mean = age_series.mean()
val_std = age_series.std()


def scaler(x):
    """Standardize x: cast to float32, subtract val_mean, divide by val_std."""
    centered = tf.cast(x, dtype=tf.float32) - val_mean
    return centered / val_std


# Numeric column that standardizes the age values when the feature is read.
scale_feature_column_age = tf.feature_column.numeric_column(
    'housing_median_age',
    normalizer_fn=scaler,
)
net_age_scale = tf.feature_column.input_layer(features[age_index], scale_feature_column_age)
net_age_scale

We should also mention more complex data processing methods such as feature crossing. It lets us combine related variables, like longitude and latitude, so that they are processed together.

In [None]:
def get_quantile_based_boundaries(feature_values, num_buckets):
    """Return bucket boundaries placed at evenly spaced quantiles.

    Args:
        feature_values: pandas Series holding the feature values.
        num_buckets: requested number of buckets; the quantile levels
            1/num_buckets, 2/num_buckets, ..., (num_buckets - 1)/num_buckets
            are evaluated.

    Returns:
        A strictly increasing list of floats. It holds at most
        num_buckets - 1 values; fewer are returned when several quantile
        levels map to the same feature value.
    """
    quantile_levels = np.arange(1.0, num_buckets) / num_buckets
    quantiles = feature_values.quantile(quantile_levels)
    # Repeated feature values make neighbouring quantiles coincide, and
    # bucketized_column only accepts strictly increasing boundaries.
    # np.unique drops the duplicates and returns the values in ascending order.
    return np.unique(quantiles.values).tolist()

# Bucketize longitude and latitude, using quantile-based boundaries for 10 buckets each.
longitude_bucket_feature_column = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('longitude'),
    boundaries=get_quantile_based_boundaries(housing_df['longitude'], 10))

latitude_bucket_feature_column = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('latitude'),
    boundaries=get_quantile_based_boundaries(housing_df['latitude'], 10))

# Cross the two bucketized columns; the second argument (50) is the hash bucket size.
crossed_lat_lon_feature_column = tf.feature_column.crossed_column(
    [longitude_bucket_feature_column, latitude_bucket_feature_column], 50)

# Look the feature dicts up by column name (as done for age_index) instead of
# by hard-coded position, so the cell does not depend on the CSV column order.
longitude_index = list(housing_df).index('longitude')
latitude_index = list(housing_df).index('latitude')

net_longitude = tf.feature_column.input_layer(
    features[longitude_index], longitude_bucket_feature_column)
net_latitude = tf.feature_column.input_layer(
    features[latitude_index], latitude_bucket_feature_column)

A detailed dataset evaluation may include grouping by some features. What if we want to estimate the average dataset values for rows that share the same housing median age?

In [None]:
# Average of the remaining columns over the rows sharing the same housing_median_age.
housing_df.groupby(['housing_median_age']).mean()

Alternatively, we may change the dataset representation by computing a moving average to smooth out fluctuations in a variable.

In [None]:
# Order the rows by age so the rolling window runs over neighbouring ages.
housing_df_sorted = housing_df.sort_values(by='housing_median_age')

# Mean over a window of up to 5 rows; min_periods=1 gives a value from the first row on.
age_window = housing_df_sorted['housing_median_age'].rolling(window=5, min_periods=1)
housing_df_sorted['h_m_age_rolling'] = age_window.mean()
housing_df_sorted

Let's run some tests with our processed data. Does the feature processing actually improve the prediction?

First we need to define a really simple linear regression using Tensorflow.

In [None]:
# Definition of linear equation
a = tfe.Variable(np.random.randn())
b = tfe.Variable(np.random.randn())

def linear_regression(inputs):
    return inputs * a + b

In [None]:
# Regression objective as minimization of error
def mean_square_fn(model_fn, inputs, labels):
    n_samples = int(tf.size(labels))
    return tf.reduce_sum(tf.pow(model_fn(inputs) - labels, 2)) / (2 * n_samples)

In [None]:
# Training optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
grad = tfe.implicit_gradients(mean_square_fn)

In [None]:
# Main regression routine
def make_regression(x, y):
    for step in range(NUM_STEPS):
        optimizer.apply_gradients(grad(linear_regression, x, y))
        if (step + 1) % DISPLAY_STEP == 0 or step == 0:
            print("Epoch:", '%04d' % (step + 1), "cost=",
                  "{:.9f}".format(mean_square_fn(linear_regression, x, y)),
                  "a=", a.numpy(), "b=", b.numpy())
    return a, b

In [None]:
# Plot results
def make_plot(x, y):
    plt.plot(x, y, 'ro', label='Original')
    plt.plot(x, np.array(a * x + b), label='Fitted')
    plt.legend()
    plt.show()

What if we want to estimate median income using the source median housing age?

In [None]:
# Position of the income column within nets (DataFrame column order).
income_index = list(housing_df.columns).index('median_income')

# Regress median income on the unprocessed median housing age and plot the fit.
age_inputs, income_labels = nets[age_index], nets[income_index]
a, b = make_regression(age_inputs, income_labels)
make_plot(age_inputs, income_labels)

Scaling is also worth trying to improve the accuracy, but unfortunately it does not help in this case...

In [None]:
# Repeat the regression with the standardized age as input and plot the fit.
a, b = make_regression(net_age_scale, nets[income_index])
make_plot(net_age_scale, nets[income_index])

Bucketizing the values into uniform bins does not look impressive in this case either.

In [None]:
# Repeat the regression with the age bin index as input and plot the fit.
a, b = make_regression(bucket_index, nets[income_index])
make_plot(bucket_index, nets[income_index])