## 使用tf.data生成csv 

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions used to produce the outputs below.
print(tf.__version__, sys.version_info, sep='\n')
for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)

2.0.0-alpha0
sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)
matplotlib 3.1.0
numpy 1.14.5
pandas 0.24.2
sklearn 0.21.2
tensorflow 2.0.0-alpha0
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [2]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset through scikit-learn.
housing = fetch_california_housing()
# Print the dataset description, then the feature-matrix and target shapes,
# one item per line.
print(housing.DESCR, housing.data.shape, housing.target.shape, sep='\n')

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
from sklearn.model_selection import train_test_split

# No test_size is passed, so each call uses the default 3:1 split.
# First carve off the test set, then split the remainder into train/validation.
X_train_all, X_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_all, y_train_all, random_state=11)

# Sanity-check the (features, labels) shapes of every split.
for X_split, y_split in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test)):
    print(X_split.shape, y_split.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# scaler to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

### 将房价数据集保存到csv文件中

In [6]:
# Directory that receives the generated CSV shards.
output_dir = 'generate_csv'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also works if output_dir is later changed to a
# nested path.
os.makedirs(output_dir, exist_ok=True)

def _format_csv_value(value):
    """Render a single cell as CSV text.

    NumPy numeric/bool scalars are formatted with ``str``: starting with
    NumPy 2.0 their ``repr`` is of the form ``np.float64(1.5)``, which is not
    a parseable number. On NumPy >= 1.14 ``str`` of a float scalar is the same
    shortest round-trip text that ``repr`` used to produce. Any other value
    keeps using ``repr``.
    """
    if isinstance(value, (np.number, np.bool_)):
        return str(value)
    return repr(value)


def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Write ``data`` to ``n_parts`` CSV files and return their paths.

    Args:
        output_dir: Directory to write into; created if it does not exist.
        data: The full data set, one row per sample, containing both the
            features and the label. Must support ``len(data)`` and
            ``data[row_index]`` yielding an iterable of cell values.
        name_prefix: File-name prefix used to tell train/test/valid apart.
        header: Optional header line (without trailing newline) written at
            the top of every file; skipped when ``None``.
        n_parts: Number of CSV files the rows are distributed over.

    Returns:
        List of the written file paths, named
        ``<output_dir>/<name_prefix>_<NN>.csv`` in part order.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Template for each part's path, e.g. "<output_dir>/train_03.csv".
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    # Split the row indexes into n_parts contiguous groups (sizes differ by at
    # most one) and write the rows of each group to its own file.
    for file_idx, row_indexes in enumerate(
            np.array_split(np.arange(len(data)), n_parts)):
        part_csv_path = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv_path)
        with open(part_csv_path, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            for row_index in row_indexes:
                f.write(','.join(
                    _format_csv_value(col) for col in data[row_index]))
                f.write('\n')
    return filenames

# Assemble each split: append the label as the last column after the features.
train_data = np.column_stack((X_train_scaled, y_train))
valid_data = np.column_stack((X_valid_scaled, y_valid))
test_data = np.column_stack((X_test_scaled, y_test))

# Build the CSV header: the feature names followed by the label column name.
# NOTE(review): 'MidianHouseValue' looks like a typo for 'MedianHouseValue';
# it is left as-is because the string is written into every CSV file.
header_cols = housing.feature_names + ['MidianHouseValue']
# Join the column names into a single comma-separated header line.
header_str = ','.join(header_cols)

# Write each split to its own set of CSV shards and keep the file paths.
train_filenames = save_to_csv(
    output_dir, train_data, name_prefix='train', header=header_str, n_parts=20)
test_filenames = save_to_csv(
    output_dir, test_data, name_prefix='test', header=header_str, n_parts=10)
valid_filenames = save_to_csv(
    output_dir, valid_data, name_prefix='valid', header=header_str, n_parts=10)

In [7]:
import pprint

# Show the generated file paths for each split, one labelled list at a time.
for split_label, split_files in (
        ("train_filenames:", train_filenames),
        ("test_filenames:", test_filenames),
        ("valid_filenames:", valid_filenames)):
    print(split_label)
    pprint.pprint(split_files)

train_filenames:
['generate_csv/train_00.csv',
 'generate_csv/train_01.csv',
 'generate_csv/train_02.csv',
 'generate_csv/train_03.csv',
 'generate_csv/train_04.csv',
 'generate_csv/train_05.csv',
 'generate_csv/train_06.csv',
 'generate_csv/train_07.csv',
 'generate_csv/train_08.csv',
 'generate_csv/train_09.csv',
 'generate_csv/train_10.csv',
 'generate_csv/train_11.csv',
 'generate_csv/train_12.csv',
 'generate_csv/train_13.csv',
 'generate_csv/train_14.csv',
 'generate_csv/train_15.csv',
 'generate_csv/train_16.csv',
 'generate_csv/train_17.csv',
 'generate_csv/train_18.csv',
 'generate_csv/train_19.csv']
test_filenames:
['generate_csv/test_00.csv',
 'generate_csv/test_01.csv',
 'generate_csv/test_02.csv',
 'generate_csv/test_03.csv',
 'generate_csv/test_04.csv',
 'generate_csv/test_05.csv',
 'generate_csv/test_06.csv',
 'generate_csv/test_07.csv',
 'generate_csv/test_08.csv',
 'generate_csv/test_09.csv']
valid_filenames:
['generate_csv/valid_00.csv',
 'generate_csv/valid_01.csv',
