In [21]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

In [35]:
# create a dataset entirely in RAM
X= tf.range(10)
X

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>

In [46]:
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset
# from_tensor_slices(X) takes elements all from X and pass it to tf.data.Dataset
# to create a dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [37]:
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [38]:
repeated_dataset = dataset.repeat(3)
for item in repeated_dataset:
    print(item)
dataset = tf.data.Dataset.from_tensor_slices(X)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype

In [39]:
batch = dataset.batch(7)
for item in batch:
    print(item)
dataset = tf.data.Dataset.from_tensor_slices(X)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9], shape=(3,), dtype=int32)


In [40]:
batch = dataset.batch(7, drop_remainder=True)
for item in batch:
    print(item)
dataset = tf.data.Dataset.from_tensor_slices(X)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)


In [34]:
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)
# repeat(n) method returns a new dataset that will repeat the items of the original
# dataset n times

#batch(n) method creates a new dataset group the items of the previous dataset
# in batches of n items

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [42]:
# transform the items by calling the map() method
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset = dataset.map(lambda x:x*2)
for item in dataset:
    print(item)
# this function could be used for preprocessing

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)


In [44]:
# num_parallel_calls help spawn multiple threads
dataset = tf.data.Dataset.from_tensor_slices(X)
def preprocess(x):
    return x * 2
processed_dataset = dataset.map(preprocess, 
                                num_parallel_calls=tf.data.experimental.AUTOTUNE)
# tf.data.experimental.AUTOTUNE を使用すると自動的に最適なスレッド数を選択する
for item in processed_dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)


In [51]:
# applt() method applies a transformation to the dataset as a whole
Xn= tf.Variable([[1.,2.,3.],[1.,2.,3.]])
dataset = tf.data.Dataset.from_tensor_slices(Xn)
dataset = dataset.apply(tf.data.experimental.unbatch())
for item in dataset:
    print(item)
dataset = tf.data.Dataset.from_tensor_slices(X)

tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(2.0, shape=(), dtype=float32)
tf.Tensor(3.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(2.0, shape=(), dtype=float32)
tf.Tensor(3.0, shape=(), dtype=float32)


In [53]:
# with a filter(lambda x: ...) method, it is also possible to filter the dataset
dataset = dataset.filter(lambda x:x<5)
for item in dataset:
    print(item)
dataset = tf.data.Dataset.from_tensor_slices(X)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [54]:
for item in dataset.take(3):
    print(item)
# take limits the indices to be shown 

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)


In [59]:
#Shuffling the Data
# buffer_size hyperparameter determines the size of data to be taken out of the dataset
# at one time, setting it larger is preferable but may encounter RAM exceeding
dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=5, seed=42, reshuffle_each_iteration=False).batch(7)
for item in dataset:
    print(item)
# reshuffle_each_iteration = False makes sure that a shuffled dataset keeps its order
# when repeat() method is called on it

tf.Tensor([2 5 1 7 0 8 9], shape=(7,), dtype=int64)
tf.Tensor([0 4 3 1 5 6 4], shape=(7,), dtype=int64)
tf.Tensor([6 7 2 9 0 3 3], shape=(7,), dtype=int64)
tf.Tensor([2 8 4 6 8 9 7], shape=(7,), dtype=int64)
tf.Tensor([5 1], shape=(2,), dtype=int64)


In [63]:
# interleaving lines from multiple files
# P 536-537
import pandas as pd

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [71]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data,
                                                              housing.target.reshape(-1,1),
                                                              random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full,
                                                      y_train_full,
                                                      random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_
# split into multiple files (especially for big datasets)

In [84]:
len(train_data)

11610

In [74]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing")  # ファイルを保存するディレクトリのパスを指定します
    os.makedirs(housing_dir, exist_ok=True)  # housing_dirで指定されたディレクトリを作成します（既に存在している場合はスキップします）
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")  # ファイルのパスのフォーマットを指定します

    filepaths = []  # 保存された各ファイルのパスを格納するリストを初期化します
    m = len(data)  # データの総数を取得します
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        # データをn_partsで指定された数に分割し、各パートごとに処理を行います
        part_csv = path_format.format(name_prefix, file_idx)  # パートごとのCSVファイルのパスを生成します
        filepaths.append(part_csv)  # パートごとのCSVファイルのパスをリストに追加します
        with open(part_csv, "wt", encoding="utf-8") as f:
            # パートごとのCSVファイルを書き込みモードで開きます
            if header is not None:
                f.write(header)  # ヘッダー行を書き込みます
                f.write("\n")
            for row_idx in row_indices:
                # 各パートの行のインデックスに対応するデータをCSVファイルに書き込みます
                f.write(",".join([repr(col) for col in data[row_idx]]))
                #repr()関数は、オブジェクトを表す文字列を返します。
                # joinでリスト内の要素をカンマで結合して一つの文字列にする
                # example [1,2,3,4]→"1,2,3,4"
                f.write("\n") #改行文字
    return filepaths  # 保存された各ファイルのパスのリストを返します
# open の "wt"は
#"w": 書き込みモードを指定します。ファイルが存在する場合は、既存の内容を削除して
# 新しい内容を書き込みます。存在しない場合は、新しいファイルを作成して書き込みます。

#"t": テキストモードを指定します。テキストモードでは、ファイルをテキストとして
#扱います。これにより、テキストデータをエンコード・デコードする機能が有効になります。

In [75]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [77]:
tewt = pd.read_csv(os.path.join("datasets","housing","my_test_00.csv"))
tewt.head(3)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01,0.477
1,2.5313,30.0,5.039384,1.193493,1565.0,2.679795,35.14,-119.46,0.458
2,3.4801,52.0,3.977155,1.185877,1310.0,1.360332,37.8,-122.44,5.00001


In [78]:
with open(train_filepaths[0]) as f:
    for i in range(5):
        print(f.readline(), end="")
#書き込んだファイルの様子
# , で区切ってexcelで見れる

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442
5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687
3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621
7.1736,12.0,6.289002557544757,0.9974424552429667,1054.0,2.6956521739130435,33.55,-117.7,2.621


In [86]:
train_filepaths[:3]

['datasets\\housing\\my_train_00.csv',
 'datasets\\housing\\my_train_01.csv',
 'datasets\\housing\\my_train_02.csv']

In [87]:
(np.array_split(np.arange(100), 20))[:3] 
# np.arange(100) を 20の len5 のarrayに分割

[array([0, 1, 2, 3, 4]), array([5, 6, 7, 8, 9]), array([10, 11, 12, 13, 14])]

In [88]:
# interleave steps
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)
# list_files returns a dataset that shuffles the file paths
# set shuffle = False ,stops this

# interleave the lines with five files at a time
n_readers = 5
dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
                                      cycle_length = n_readers)
#.skip(1) skips the first line because those are titles

# .interleave method create a dataset that will pull five file paths from the
# filepath_dataset

for line in dataset.take(5):
    print(line.numpy())
# these are the first rows( hearder ignored) of five CSV files, chosen randomly
# still have to be scaled and parsed

b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782'
b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'
b'3.6875,44.0,4.524475524475524,0.993006993006993,457.0,3.195804195804196,34.04,-118.15,1.625'
b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526'
b'3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442'


In [97]:
n_inputs = 8 # X_train.shape[-1]

def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1]) # stack these tensors into a 1D array without the target
    y = tf.stack(fields[-1:]) # stach the target
    return (x - X_mean) / X_std, y
# tf.io.decode_csv(a , b)
# a is the line to parse
# b is an array containing the default value for each column in the CSV file
# b tells tf.io.decode_csv tells the number of columns, types and default values

In [98]:
choke = [0.] * 8 + [tf.constant([], dtype=tf.float32)]
choke
# +[tf.constant([], dtype=ft.float32)] turns [0.] * 8 to a tensor

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 <tf.Tensor: shape=(0,), dtype=float32, numpy=array([], dtype=float32)>]

In [99]:
choke = [0.] * 8 
choke

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [105]:
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')
# bはバイト文字列を意味する　データ形を宣言している

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579159,  1.216324  , -0.05204564, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

In [108]:
# full preprocessing steps
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5 , batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
                                           cycle_length = n_readers, num_parallel_calls = n_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls = n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)
# interpretation of this code P 541
# about prefetch , multithreaded loading and preprocessing
# P542

In [139]:
test_tt = tf.data.Dataset.list_files(test_filepaths)
test_tt = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
                                           cycle_length = n_readers, num_parallel_calls = n_read_threads)
for item in test_tt:
    print(item)

tf.Tensor(b'datasets\\housing\\my_test_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_06.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_04.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_03.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_08.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_test_09.csv', shape=(), dtype=string)


In [110]:
# use the dataset with tf.keras
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [112]:
X_train.shape

(11610, 8)

In [114]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)
model = keras.models.Sequential([
    keras.layers.Dense(300, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1)
    
])

In [115]:
model.compile(optimizer=keras.optimizers.SGD(learning_rate=1e-3), loss="mse")
batch_size=32
model.fit(train_set, steps_per_epoch = len(X_train)// batch_size, epochs=10,
          validation_data = valid_set)
#steps_per_epochは一ステップの計算数

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22448fd3a00>

In [116]:
model.evaluate(test_set, steps=len(X_test)//batch_size)



0.45471519231796265

In [136]:
new_set = test_set.take(3).map(lambda X,y: X) #pretend we have new 3 instances
# lambda関数は test_setのlabel部分を取り除くのに使う
model.predict(new_set).shape
# batch_size 32 * 3 = 96



(96, 1)

In [130]:
new_set = test_set.take(3)
# kerasは自動的にlabelを無視する
model.predict(new_set)[:3]



array([[1.9144926],
       [2.387704 ],
       [1.0466435]], dtype=float32)