In [None]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
from sklearn import metrics
import seaborn as sns
import tensorflow as tf
from tensorflow.python.data import Dataset

# Notebook-wide display / logging configuration.
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 90
pd.options.display.float_format = '{:.1f}'.format

# Seed for the row shuffle below, so the row order is the same on every run
# of the notebook instead of depending on the global NumPy RNG state.
RANDOM_SEED = 42

train_df = pd.read_csv("./train.csv", sep=",")

test_df = pd.read_csv("./test.csv", sep=",")

# Shuffle the training rows. A dedicated RandomState keeps the permutation
# reproducible without touching NumPy's global random state.
train_df = train_df.reindex(
    np.random.RandomState(RANDOM_SEED).permutation(train_df.index))

In [None]:
# Show the first rows followed by the per-column summary statistics.
display.display(train_df.head(), train_df.describe())

In [None]:
# Number of missing values in each column.
print(train_df.isnull().sum())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: passing its return value to display() only rendered a stray None.
train_df.info()

In [None]:
# Summary statistics of the SalePrice column.
train_df.loc[:, 'SalePrice'].describe()

In [None]:
# Histogram of construction years in the training data.
train_df['YearBuilt'].hist()
# Only the last expression of a cell is rendered, so the two summaries are
# combined into one table; evaluated as separate statements, the training-set
# summary was computed and silently discarded.
pd.concat(
    [train_df['YearBuilt'].describe(), test_df['YearBuilt'].describe()],
    axis=1,
    keys=['train', 'test'])

In [None]:
# Distribution of the raw sale price (trailing ';' hides the Axes repr).
sns.distplot(a=train_df['SalePrice']);

In [None]:
# Distribution of the natural log of the sale price.
sns.distplot(train_df['SalePrice'].pipe(np.log));

In [None]:
# Skewness and kurtosis of the raw sale price.
print("Skewness: %f" % (train_df['SalePrice'].skew(),))
print("Kurtosis: %f" % (train_df['SalePrice'].kurtosis(),))

In [None]:
# Pairwise correlations between the numeric columns. The numeric columns are
# selected explicitly: older pandas dropped non-numeric columns from corr()
# silently, while pandas 2.x raises an error when they are present.
train_df.select_dtypes(include=[np.number]).corr()

In [None]:
def preprocess_features(df):
  """Return a copy of the columns used as model inputs.

  Args:
    df: DataFrame that contains the `OverallQual`, `GrLivArea` and
      `YearBuilt` columns.

  Returns:
    A new DataFrame with exactly those three columns, in that order. It is a
    copy, so modifying it does not affect `df`.
  """
  feature_names = ["OverallQual", "GrLivArea", "YearBuilt"]
  return df[feature_names].copy()

def preprocess_targets(df):
  """Build the regression target from `df`.

  Args:
    df: DataFrame that contains a `SalePrice` column.

  Returns:
    A new single-column DataFrame named `SalePrice` holding the natural
    logarithm of the input sale prices, indexed like `df`.
  """
  log_sale_price = np.log(df["SalePrice"])
  return pd.DataFrame({"SalePrice": log_sale_price})

In [None]:
# Use the first 70% of the rows for training and the remaining rows for
# validation. The split point is derived from the frame's length rather than
# hard-coded row counts (1022 / 438 of 1460), so the two subsets never overlap
# and never leave rows out when the data has a different number of rows.
TRAIN_FRACTION = 0.7
# round() before int(): a floating-point product such as 0.7 * 1460 can land
# just below the intended whole number and would otherwise be truncated.
num_training_rows = int(round(TRAIN_FRACTION * len(train_df)))

training_examples = preprocess_features(train_df.iloc[:num_training_rows])
training_targets = preprocess_targets(train_df.iloc[:num_training_rows])

validation_examples = preprocess_features(train_df.iloc[num_training_rows:])
validation_targets = preprocess_targets(train_df.iloc[num_training_rows:])

# Double-check that we've done the right thing: the subsets partition the data
# and every example has a matching target.
assert len(training_examples) + len(validation_examples) == len(train_df)
assert len(training_examples) == len(training_targets)
assert len(validation_examples) == len(validation_targets)

In [None]:
# Plain numeric inputs, one per selected feature.
OverallQual_numeric = tf.feature_column.numeric_column('OverallQual')
GrLivArea_numeric = tf.feature_column.numeric_column('GrLivArea')
YearBuilt_numeric = tf.feature_column.numeric_column('YearBuilt')

# Construction year bucketized by decade: boundaries 1880, 1890, ..., 2020.
YearBuilt_feature = tf.feature_column.bucketized_column(
    source_column=YearBuilt_numeric,
    boundaries=list(range(1880, 2021, 10)))

def construct_feature_columns():
    """Return the list of feature columns used by the model.

    Returns:
      A list with the module-level `OverallQual_numeric`, `GrLivArea_numeric`
      and `YearBuilt_feature` columns, in that order.
    """
    feature_columns = [
        OverallQual_numeric,
        GrLivArea_numeric,
        YearBuilt_feature,
    ]
    return feature_columns

In [None]:
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None,
                shuffle_buffer_size=None):
    """Build a one-shot input pipeline for an Estimator.

    Args:
      features: mapping-like object (e.g. a pandas DataFrame) of feature name
        to values.
      targets: sequence of target values aligned with `features`.
      batch_size: number of examples per batch.
      shuffle: whether to shuffle the examples.
      num_epochs: number of times to repeat the data; None repeats
        indefinitely.
      shuffle_buffer_size: size of the shuffle buffer. Defaults to the number
        of targets, so the whole dataset is shuffled.

    Returns:
      A `(features, labels)` tuple for the next batch.
    """
    # Convert the feature columns into a dict of numpy arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    ds = Dataset.from_tensor_slices((features, targets))

    # Shuffle individual examples *before* batching. Shuffling after
    # batch()/repeat() only permutes whole batches, so the same examples
    # would always end up grouped together.
    if shuffle:
        if shuffle_buffer_size is None:
            shuffle_buffer_size = max(len(targets), 1)
        ds = ds.shuffle(shuffle_buffer_size)

    ds = ds.batch(batch_size).repeat(num_epochs)

    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

In [None]:
def _prediction_rmse(regressor, input_fn, targets):
  """Return the RMSE of `regressor`'s predictions over `input_fn` vs. `targets`."""
  predictions = np.array(
      [item['predictions'][0] for item in regressor.predict(input_fn=input_fn)])
  # scikit-learn's argument order is (y_true, y_pred).
  return math.sqrt(metrics.mean_squared_error(targets, predictions))


def train_linear_regressor_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Train a linear regression model and plot its loss over time.

  Training is split into 10 periods; after each period the RMSE on the
  training and validation sets is computed, the training RMSE is printed, and
  at the end both curves are plotted.

  Args:
    learning_rate: learning rate for the gradient descent optimizer.
    steps: total number of training steps, spread evenly over the periods.
    batch_size: number of examples per training batch.
    training_examples: DataFrame of input features for training.
    training_targets: DataFrame with a `SalePrice` column of training targets.
    validation_examples: DataFrame of input features for validation.
    validation_targets: DataFrame with a `SalePrice` column of validation
      targets.

  Returns:
    The trained `LinearRegressor`.
  """
  periods = 10
  # Integer division so the estimator receives an int step count (true
  # division yields a float under Python 3); always train at least one step
  # per period.
  steps_per_period = max(1, int(steps // periods))

  # Create a linear regressor object with gradient clipping.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_regressor = tf.estimator.LinearRegressor(
      feature_columns=construct_feature_columns(),
      optimizer=my_optimizer
  )

  # Create input functions. The two prediction functions make a single,
  # unshuffled pass so predictions stay aligned with the targets.
  training_input_fn = lambda: my_input_fn(training_examples,
                                          training_targets["SalePrice"],
                                          batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                  training_targets["SalePrice"],
                                                  num_epochs=1,
                                                  shuffle=False)
  predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                    validation_targets["SalePrice"],
                                                    num_epochs=1,
                                                    shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(periods):
    # Train the model, starting from the prior state.
    linear_regressor.train(
        input_fn=training_input_fn,
        steps=steps_per_period,
    )
    # Take a break and compute training and validation loss.
    training_root_mean_squared_error = _prediction_rmse(
        linear_regressor, predict_training_input_fn, training_targets)
    validation_root_mean_squared_error = _prediction_rmse(
        linear_regressor, predict_validation_input_fn, validation_targets)
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")

  # Output a graph of loss metrics over periods. tight_layout() runs last so
  # it can account for the labels, title and legend that have been drawn.
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.legend()
  plt.tight_layout()

  return linear_regressor

In [None]:
# Train the linear model on the training split, tracking validation loss.
linear_regressor = train_linear_regressor_model(
    learning_rate=0.0001,
    steps=5000,
    batch_size=10,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)


def predict_validation_input_fn():
  """Feed the validation split once, unshuffled, for evaluation."""
  return my_input_fn(validation_examples,
                     validation_targets["SalePrice"],
                     num_epochs=1,
                     shuffle=False)


# Compute the estimator's built-in evaluation metrics on the validation split.
evaluation_metrics = linear_regressor.evaluate(input_fn=predict_validation_input_fn)

print(evaluation_metrics)