<a href="https://colab.research.google.com/github/allenxu-passion/Tensorflow_LS/blob/master/tf_study_official_firststeps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
"""
官方示例，step by step
https://colab.research.google.com/notebooks/mlcc/first_steps_with_tensor_flow.ipynb
"""


from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Global notebook settings: silence TensorFlow logging below ERROR and make
# pandas print compact frames (at most 10 rows, floats with one decimal).
tf.logging.set_verbosity(tf.logging.ERROR)
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", '{:.1f}'.format)


## Step one: load and prepare the data
# Load the California housing training set with pandas. The pandas display
# options set near the top of the file control how frames are printed
# (floats with one decimal place, at most 10 rows).
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.cn/mledu-datasets/california_housing_train.csv", sep=",")
# Shuffle the rows by reindexing with a random permutation of the index.
california_housing_dataframe = california_housing_dataframe.reindex(np.random.permutation(california_housing_dataframe.index))
# Rescale the target column by a factor of 1000 (value / 1000).
california_housing_dataframe["median_house_value"] /= 1000.0
# Show summary statistics. A bare expression is only echoed when it is the
# last statement of a notebook cell, so display it explicitly.
display.display(california_housing_dataframe.describe())


## Step two: describe the data for TensorFlow
# total_rooms is the single input feature and median_house_value is the target.
# TensorFlow needs to know the kind of each input feature (numeric,
# categorical, ...). Feature columns carry that description only; they do not
# hold the data itself.
output_target = california_housing_dataframe["median_house_value"]  # target (label)
input_feature = california_housing_dataframe[["total_rooms"]]  # input feature (one-column frame)
input_columns = [tf.feature_column.numeric_column("total_rooms")]  # numeric feature column


## Build the model
# Gradient descent optimizer (the argument is the learning rate / step size),
# wrapped with gradient clipping by norm so gradients cannot grow too large.
optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=0.0000001),
    5.0)
# Linear regression estimator over the feature columns declared above.
regressor = tf.estimator.LinearRegressor(
    feature_columns=input_columns,
    optimizer=optimizer)


## 数据传递层（抽象数据到模型之间）
# 数据和模型之间，定义一个中间层（数据传递），负责tf的数据预处理、批处理、随机处理、重复数据处理
def input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None,
             shuffle_buffer_size=10000):
  """Build the (features, labels) tensors that feed the estimator.

  Args:
    features: mapping-like object (e.g. a pandas DataFrame) from feature
      name to a column of values.
    targets: the target values, sliced in lockstep with `features`.
    batch_size: number of examples per batch.
    shuffle: whether to shuffle the examples.
    num_epochs: how many times to repeat the data; None repeats indefinitely.
    shuffle_buffer_size: buffer size passed to `Dataset.shuffle`.

  Returns:
    A `(features, labels)` tuple holding the next batch from a one-shot
    iterator over the dataset.
  """
  # Convert the pandas feature data into a dict of numpy arrays.
  feature_arrays = {key: np.array(value) for key, value in dict(features).items()}
  ds = Dataset.from_tensor_slices((feature_arrays, targets))
  # Shuffle individual examples *before* batching and repeating. Shuffling
  # afterwards would reorder whole batches and let examples from different
  # epochs mix inside the shuffle buffer.
  if shuffle:
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
  ds = ds.batch(batch_size).repeat(num_epochs)
  # Return the next batch of data.
  features, labels = ds.make_one_shot_iterator().get_next()
  return features, labels


## Train the model
# Run 100 training steps, feeding data through input_fn with its defaults.
_ = regressor.train(
    input_fn=lambda: input_fn(input_feature, output_target),
    steps=100)


## Analysis
# Evaluate how well the model fits the training data (spoiler: poorly).
house_values = california_housing_dataframe["median_house_value"]
min_house_value = house_values.min()
max_house_value = house_values.max()
min_max_difference = max_house_value - min_house_value
print("Min. Median House Value: %0.3f" % min_house_value)
print("Max. Median House Value: %0.3f" % max_house_value)
print("Difference between Min. and Max.: %0.3f" % min_max_difference)


# A single, unshuffled pass over the data for prediction.
def prediction_input_fn():
  return input_fn(input_feature, output_target, num_epochs=1, shuffle=False)


predictions = regressor.predict(input_fn=prediction_input_fn)
# Collect the first value of each 'predictions' entry into one numpy array.
predictions = np.array([row['predictions'][0] for row in predictions])
# NOTE: sklearn's signature is (y_true, y_pred); MSE is symmetric, so the
# argument order used here does not affect the result.
mean_squared_error = metrics.mean_squared_error(predictions, output_target)
root_mean_squared_error = math.sqrt(mean_squared_error)
min_max_predicted_difference = predictions.max() - predictions.min()
print("Min. Median House Predicted Value: %0.3f" % predictions.min())
print("Max. Median House Predicted Value: %0.3f" % predictions.max())
print("Difference between Predicted Min. and Max.: %0.3f" % min_max_predicted_difference)
print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error)

# Compare summary statistics of the predictions and the targets.
# output_target still carries the shuffled DataFrame index, whereas the
# predictions are in positional order. Assigning it as an index-aligned Series
# would pair each prediction with the wrong target, so both columns are built
# from plain positional arrays.
calibration_data = pd.DataFrame(
    {"predictions": predictions, "targets": np.asarray(output_target)},
    columns=["predictions", "targets"])
# Display explicitly: a bare expression is only echoed when it is the last
# statement of a notebook cell.
display.display(calibration_data.describe())

# Visualise the fit with a scatter plot of 300 randomly sampled rows.
sample = california_housing_dataframe.sample(n=300)
sample_rooms = sample["total_rooms"]
x_0, x_1 = sample_rooms.min(), sample_rooms.max()
# Read the trained weight and bias back from the estimator.
weight = regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]
bias = regressor.get_variable_value('linear/linear_model/bias_weights')
# The model is linear, so its two end points are enough to draw the line.
y_0, y_1 = (weight * x + bias for x in (x_0, x_1))
plt.xlabel("total_rooms")
plt.ylabel("median_house_value")
plt.plot([x_0, x_1], [y_0, y_1], c='r')
plt.scatter(sample_rooms, sample["median_house_value"])
plt.show()
