# training-data-analyst/courses/machine_learning/deepdive2/structured

1a_explore_data_babyweight
1b_prepare_data_babyweight
4b_keras_dnn_babyweight
4b_keras_dnn_babyweight

### Creating BigQuery dataset titled: babyweight

In [None]:
%%bash
# Name of the BigQuery dataset to create.
DATASET_NAME="babyweight"

# Create the dataset in the US location. PROJECT is read from the environment.
# The expansion is quoted so an empty or unusual value cannot split into extra arguments.
bq --location=US mk --dataset \
        --description "description" \
        "${PROJECT}:${DATASET_NAME}"

### Create GCS bucket

In [None]:
%%bash
# Make a bucket in the requested region. REGION and BUCKET are read from the environment.
gsutil mb -l "${REGION}" "gs://${BUCKET}"

### Retrieve GCS bucket files

In [None]:
%%bash
# With no URL, `gsutil ls` lists the buckets of the default project.
# To list the objects inside one bucket, pass its URL: gsutil ls "gs://${BUCKET}"
gsutil ls

### Query to get all column names within table schema

In [None]:
%%bigquery
-- List every column name of the natality table from the dataset's INFORMATION_SCHEMA.
SELECT column_name
FROM publicdata.samples.INFORMATION_SCHEMA.COLUMNS
WHERE table_name = 'natality'

### Create dataset table

In [None]:
-- Template: materialize the selected columns into a new table,
-- replacing any table that already has that name.
CREATE OR REPLACE TABLE {NEW_TABLE_NAME} AS
SELECT
    {COLUMNS}
FROM
    ...
WHERE
    ...

### Create hash column using year and month in BigQuery

In [None]:
-- Hash the concatenated year and month so that every row of the same
-- year/month gets the same hashmonth value.
SELECT
    FARM_FINGERPRINT(
        CONCAT(
            CAST(year AS STRING),
            CAST(month AS STRING)
        )
    ) AS hashmonth
FROM ...

### Augment dataset to simulate missing data
예측 시점에 값을 알 수도 있고 모를 수도 있는 feature는, 값을 아는 경우와 모르는(unknown) 경우를 모두 학습 데이터에 포함하도록 data augmentation을 할 수 있다.

*예: 초음파 검사를 하는 경우 알 수 있는 쌍둥이 여부, 성별을 unknown 데이터로 변경*

### Split augmented dataset into train and eval sets

In [None]:
%%bigquery
-- Training split: hash buckets 0, 1 and 2 (roughly 3/4 of the hash values).
-- FARM_FINGERPRINT returns a signed INT64 and MOD keeps the sign of the
-- dividend, so ABS folds negative remainders into the buckets 0..3.
CREATE OR REPLACE TABLE
    ...data_train AS
SELECT
    ...
FROM
    ...
WHERE
    ABS(MOD(hashmonth, 4)) < 3;

-- Evaluation split: hash bucket 3 (roughly 1/4 of the hash values).
CREATE OR REPLACE TABLE
    ...data_eval AS
SELECT
    ...
FROM
    ...
WHERE
    ABS(MOD(hashmonth, 4)) = 3;

    

### Convert BigQuery result to DataFrame

In [None]:
# Run the query in BigQuery and load the result into a pandas DataFrame.
query_job = bigquery.Client().query(query)
df = query_job.to_dataframe()

# Peek at the first rows.
df.head()

# Sort the DataFrame by the chosen column name.
df.sort_values(column_name)

### data plotting을 통해서 유효한 data를 확인할 필요가 있다

In [None]:
# Bar chart of num_babies against the chosen column.
df.plot(
    kind="bar",
    x=column_name,
    y="num_babies",
    figsize=(12, 5),
)

### Export from BigQuery to CSVs in GCS

In [None]:
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# Name of the dataset created in BigQuery.
dataset_name = 'babyweight'

# Build the dataset reference with its constructor; Client.dataset() is
# deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference(client.project, dataset_name)

# table_name and destination_uri are expected to be defined by the caller.
table_ref = dataset_ref.table(table_name)
extract_job = client.extract_table(
    table_ref,
    # Google Cloud Storage URI, e.g.
    # os.path.join("gs://", BUCKET, dataset_name, "data", "{}*.csv".format(step))
    destination_uri,
    # Location must match that of the source table.
    location="US",
)  # API request

extract_job.result()  # Waits for job to complete.


### Verify CSV creation

In [None]:
%%bash
# List the exported CSV files. The URL is quoted so the wildcard is passed
# to gsutil as written instead of being subject to local shell globbing.
gsutil ls "gs://${BUCKET}/babyweight/data/*.csv"

#### load_dataset

In [None]:
def features_and_labels(row_data):
    """Separate the label from the remaining features of one CSV batch.

    The label entry is removed from ``row_data`` in place, so the mapping
    that is returned as the features is the same object that was passed in.

    Args:
        row_data: Dictionary keyed by CSV column name, including the
            column named by ``LABEL_COLUMN``.
    Returns:
        A ``(features, label)`` pair: the dictionary without the label
        entry, and the value that was stored under ``LABEL_COLUMN``.
    """
    target = row_data.pop(LABEL_COLUMN)
    features = row_data
    return features, target


# `tf` is only an alias, so `from tf.data.experimental import ...` cannot be
# imported; import the package and use the fully qualified name instead.
import tensorflow as tf

# Make a CSV dataset.
# https://www.tensorflow.org/api_docs/python/tf/data/experimental/make_csv_dataset
dataset = tf.data.experimental.make_csv_dataset(
    file_pattern=pattern,  # path (or glob) of the csv files
    batch_size=batch_size,  # the number of records to combine in a single batch
    column_names=CSV_COLUMNS,  # the keys of the features dict of each dataset element
    column_defaults=DEFAULTS,  # default values for the CSV fields
)

# Map dataset to features and label
dataset = dataset.map(features_and_labels)  # features, label

# Shuffle and repeat for training
# shuffle: buffer_size = the number of elements from this dataset
# repeat: if count is -1 or None then the dataset is repeated indefinitely
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset
if mode == 'train':
    dataset = dataset.shuffle(buffer_size=1000).repeat()

# Prefetch uses a background thread and an internal buffer to fetch elements
# from the input dataset ahead of the time they are requested.
# buffer_size: the maximum number of elements that will be buffered.
# tf.data.AUTOTUNE (value -1, not 1) lets tf.data tune the buffer size
# dynamically; a literal 1 would buffer exactly one element.
# https://www.tensorflow.org/guide/data_performance
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)


### encoding one-hot vector

In [None]:
# encoding one-hot vector
# https://www.tensorflow.org/tutorials/structured_data/feature_columns?hl=ko

# Strings cannot be fed to a model directly; they must first be mapped to
# numeric values. A categorical column can represent strings as one-hot vectors.
# Use this when the set of possible strings is small.
def get_categorical(name, voc_list):
    """Build a one-hot (indicator) feature column for a string feature.

    Args:
        name: Feature key the column reads from.
        voc_list: Vocabulary passed to the categorical column.
    Returns:
        The indicator column wrapping the vocabulary-list categorical column.
    """
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(name, voc_list)
    )

# Use this when a category has thousands of possible values or more,
# rather than just a handful.
# dimension = dimension of the embedding
def get_embedding_column(name, voc_list):
    """Build an embedding feature column for a string feature.

    Args:
        name: Feature key the column reads from.
        voc_list: Vocabulary passed to the categorical column.
    Returns:
        The embedding column wrapping the vocabulary-list categorical column.
    """
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        name, voc_list
    )
    # NOTE(review): the embedding dimension equals the vocabulary size, so the
    # embedding is no smaller than the one-hot encoding; confirm this is intended.
    embedding_dim = len(voc_list)
    return tf.feature_column.embedding_column(vocab_column, dimension=embedding_dim)

### tensorflow.feature_column

In [None]:
# https://www.tensorflow.org/tutorials/structured_data/feature_columns
from tensorflow import feature_column

feature_columns = []

# Numeric columns: pass the DataFrame column values through unchanged.
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# Bucketized column: split a numeric value into ranges and one-hot encode
# the range it falls into.
# bucketized_column takes a numeric column object, so define it first.
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# Categorical column: strings are first mapped to numeric values.
# A categorical column lets the strings be represented as one-hot vectors.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding column: instead of a high-dimensional one-hot vector, represent
# the data in a lower dimension. The result is a dense vector whose elements
# can hold any number, not just 0 or 1.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed feature column: combine several features into a single feature.
# Numeric columns cannot be crossed.
# All keys must be either string, or categorical column except HashedCategoricalColumn
# The model learns a weight for each combination of the features.
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

In [6]:
### Compute RMSE
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Written as a named function rather than a lambda so the callable carries
    the name ``rmse`` instead of ``<lambda>``.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values.
    Returns:
        The square root of the mean of the squared differences.
    """
    return tf.sqrt(tf.reduce_mean((y_true - y_pred) ** 2))

### Steps for building model
1. Create input layers
2. Create feature columns
3. Build model and compile it all together
  - 최종 dense layer의 activation은 linear, 그 외 dense layer는 relu

### feature_columns to input layer

In [None]:
# A layer that produces a dense Tensor
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/DenseFeatures
tf.keras.layers.DenseFeatures(
    # numeric_column, embedding_column, bucketized_column, indicator_column.
    # If you have categorical features, you can wrap them with an
    # embedding_column or indicator_column.
    feature_columns,
    # Must be passed by keyword: the second positional parameter of
    # DenseFeatures is `trainable`, not `name`.
    name=name,
)(inputs)

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit
# batch_size is deliberately not passed: it must not be specified when the
# input is a dataset, because the dataset already yields batches. (It only
# defaults to 32 for array-like inputs.)
model.fit(
    trainds,  # tuple(features, labels)
    validation_data=evalds,  # tuple(features, labels)
    epochs=NUM_EVALS,  # validation runs at the end of each epoch, so epochs equals the number of evals
    steps_per_epoch=steps_per_epoch,  # total number of steps (batches of samples) before one epoch is declared finished and the next starts
    callbacks=[tensorboard_callback])