# Classification with Tensorflow Decision Forests

# Contents
## 1. Ensemble Tree 모델 리뷰
## 2. Census-Income 데이터를 이용하여 income-level 예측
## 3. Experiments (Keras)
###   3.1 Decision Forests with raw features
###   3.2 Decision Forests with target encoding
###   3.3 Decision Forests with trained embeddings

### Imports

In [1]:
import tensorflow as tf
import math
import urllib
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_decision_forests as tfdf

# GPU version 에서 오류가 있어서 cpu 에서 실행함.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

tfdf.__version__

2022-10-24 12:40:41.426295: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-24 12:40:42.182730: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-24 12:40:44.810716: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/dhlee/anaconda3/envs/p38/lib:{LD_LIBRARY_PATH}
2022-10-24 12:40:44.811120: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: l

'1.0.1'

### Prepare the data

In [2]:
'''
BASE_PATH 에서 header와 data(train.gz, test.gz) 를 다운로드함.
header 에 income_level 이라는 target_label 추가
'''

BASE_PATH = 'https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income'
url_name = f'{BASE_PATH}.names'
url_train_data = f'{BASE_PATH}.data.gz'
url_test_data = f'{BASE_PATH}.test.gz'

CSV_HEADER = [l.decode('utf-8').split(':')[0].replace(' ', '_') for l in urllib.request.urlopen(url_name) if not l.startswith(b'|')][2:]
print(len(CSV_HEADER), CSV_HEADER)

# target column 에 대한 헤더 추가
CSV_HEADER.append('income_level')

test_data = pd.read_csv(url_test_data, header=None, names=CSV_HEADER)
train_data = pd.read_csv(url_train_data, header=None, names=CSV_HEADER)

41 ['age', 'class_of_worker', 'detailed_industry_recode', 'detailed_occupation_recode', 'education', 'wage_per_hour', 'enroll_in_edu_inst_last_wk', 'marital_stat', 'major_industry_code', 'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_a_labor_union', 'reason_for_unemployment', 'full_or_part_time_employment_stat', 'capital_gains', 'capital_losses', 'dividends_from_stocks', 'tax_filer_stat', 'region_of_previous_residence', 'state_of_previous_residence', 'detailed_household_and_family_stat', 'detailed_household_summary_in_household', 'instance_weight', 'migration_code-change_in_msa', 'migration_code-change_in_reg', 'migration_code-move_within_reg', 'live_in_this_house_1_year_ago', 'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer', 'family_members_under_18', 'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self', 'citizenship', 'own_business_or_self_employed', "fill_inc_questionnaire_for_veteran's_admin", 'veterans_benefits', 'w

In [3]:
train_data.describe()

Unnamed: 0,age,detailed_industry_recode,detailed_occupation_recode,wage_per_hour,capital_gains,capital_losses,dividends_from_stocks,instance_weight,num_persons_worked_for_employer,own_business_or_self_employed,veterans_benefits,weeks_worked_in_year,year
count,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0,199523.0
mean,34.494199,15.35232,11.306556,55.426908,434.71899,37.313788,197.529533,1740.380269,1.95618,0.175438,1.514833,23.174897,94.499672
std,22.310895,18.067129,14.454204,274.896454,4697.53128,271.896428,1984.163658,993.768156,2.365126,0.553694,0.851473,24.411488,0.500001
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.87,0.0,0.0,0.0,0.0,94.0
25%,15.0,0.0,0.0,0.0,0.0,0.0,0.0,1061.615,0.0,0.0,2.0,0.0,94.0
50%,33.0,0.0,0.0,0.0,0.0,0.0,0.0,1618.31,1.0,0.0,2.0,8.0,94.0
75%,50.0,33.0,26.0,0.0,0.0,0.0,0.0,2188.61,4.0,0.0,2.0,52.0,95.0
max,90.0,51.0,46.0,9999.0,99999.0,4608.0,99999.0,18656.3,6.0,2.0,2.0,52.0,95.0


### Define dataset metadata

In [4]:
'''
Target 및 weight 열 지정
Numeric 과 Categorical feature 구분
'''

# Target column name.
TARGET_COLUMN_NAME = "income_level"
# The labels of the target columns.
TARGET_LABELS = [" - 50000.", " 50000+."]
# Weight column name.
WEIGHT_COLUMN_NAME = "instance_weight"
# Numeric feature names.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]
# Categorical features and their vocabulary lists.
CATEGORICAL_FEATURE_NAMES = [
    "class_of_worker",
    "detailed_industry_recode",
    "detailed_occupation_recode",
    "education",
    "enroll_in_edu_inst_last_wk",
    "marital_stat",
    "major_industry_code",
    "major_occupation_code",
    "race",
    "hispanic_origin",
    "sex",
    "member_of_a_labor_union",
    "reason_for_unemployment",
    "full_or_part_time_employment_stat",
    "tax_filer_stat",
    "region_of_previous_residence",
    "state_of_previous_residence",
    "detailed_household_and_family_stat",
    "detailed_household_summary_in_household",
    "migration_code-change_in_msa",
    "migration_code-change_in_reg",
    "migration_code-move_within_reg",
    "live_in_this_house_1_year_ago",
    "migration_prev_res_in_sunbelt",
    "family_members_under_18",
    "country_of_birth_father",
    "country_of_birth_mother",
    "country_of_birth_self",
    "citizenship",
    "own_business_or_self_employed",
    "fill_inc_questionnaire_for_veteran's_admin",
    "veterans_benefits",
    "year",
]

In [5]:
'''
target label 을 [" - 50000.", " 50000+."] 에서 [0, 1] 로 변경함
Categorical 항목은 str 타입으로 변경: 1 -> '1'
'''
def prepare_dataframe(dataframe):
    # convert the target labels from string to integer
    try:
        dataframe[TARGET_COLUMN_NAME] = dataframe[TARGET_COLUMN_NAME].map(TARGET_LABELS.index)
    except:
        pass 
    
    # Cast the categorical features to string
    for feature_name in CATEGORICAL_FEATURE_NAMES:
        dataframe[feature_name] = dataframe[feature_name].astype(str)

# 변경 전
print(test_data[TARGET_COLUMN_NAME])

prepare_dataframe(train_data)
prepare_dataframe(test_data)

# 변경 후
print(test_data[TARGET_COLUMN_NAME])
print(test_data["detailed_industry_recode"])

0         - 50000.
1         - 50000.
2         - 50000.
3         - 50000.
4         - 50000.
           ...    
99757     - 50000.
99758     - 50000.
99759     - 50000.
99760     - 50000.
99761     - 50000.
Name: income_level, Length: 99762, dtype: object
0        0
1        0
2        0
3        0
4        0
        ..
99757    0
99758    0
99759    0
99760    0
99761    0
Name: income_level, Length: 99762, dtype: int64
0         6
1        37
2         0
3        29
4         4
         ..
99757     0
99758     8
99759     1
99760    45
99761     0
Name: detailed_industry_recode, Length: 99762, dtype: object


In [6]:
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(train_data.head().T)

Train data shape: (199523, 42)
Test data shape: (99762, 42)
                                                                                    0  \
age                                                                                73   
class_of_worker                                                       Not in universe   
detailed_industry_recode                                                            0   
detailed_occupation_recode                                                          0   
education                                                        High school graduate   
wage_per_hour                                                                       0   
enroll_in_edu_inst_last_wk                                            Not in universe   
marital_stat                                                                  Widowed   
major_industry_code                                       Not in universe or children   
major_occupation_code                             

### Configure hyperparameters

In [7]:
'''
GBM 의 주요 하이퍼파라미터들
'''
# Maximum number of decision trees. The effective number of trained trees can be smaller if early stopping is enabled.
NUM_TREES = 250
# Minimum number of examples in a node.
MIN_EXAMPLES = 6
# Maximum depth of the tree. max_depth=1 means that all trees will be roots.
MAX_DEPTH = 5
# Ratio of the dataset (sampling without replacement) used to train individual trees for the random sampling method.
SUBSAMPLE = 0.65
# Control the sampling of the datasets used to train individual trees.
SAMPLING_METHOD = "RANDOM"
# Ratio of the training dataset used to monitor the training. Require to be >0 if early stopping is enabled.
VALIDATION_RATIO = 0.1

### Implement a training and evaluation procedure

In [8]:
def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):
    '''
    모델을 훈련시키고 평가, 3가지 실험에서 모두 사용됨.
    pandas dataframe 에서 tf.data.Dataset 생성
    모델 훈련(model.fit)
    테스트셋으로 모델 평가
    '''
    # (inputs, targets, sample_weights)
    train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        train_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
    )
    test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        test_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
    )

    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

# Experiment 1: Decision Forests with raw features

###  Specify model input feature usages

In [9]:
def specify_feature_usages():
    '''
    feature 의 이름과 특성(numerical or categorical) 지정 -> tfdf.keras.FeatureUsage
    GBM 모델 생성시 인자로 전달
    '''
    feature_usages = []

    for feature_name in NUMERIC_FEATURE_NAMES:
        feature_usage = tfdf.keras.FeatureUsage(
            name=feature_name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL
        )
        feature_usages.append(feature_usage)

    for feature_name in CATEGORICAL_FEATURE_NAMES:
        feature_usage = tfdf.keras.FeatureUsage(
            name=feature_name, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL
        )
        feature_usages.append(feature_usage)

    return feature_usages

### Create a Gradient Boosted Trees Model

In [10]:
def create_gbt_model():
    # See all the model parameters in https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/GradientBoostedTreesModel
    gbt_model = tfdf.keras.GradientBoostedTreesModel(
        features=specify_feature_usages(), # 입력 피쳐 지정
        exclude_non_specified_features=True, 
        num_trees=NUM_TREES, # 트리 개수
        max_depth=MAX_DEPTH, # 트리 최대 깊이
        min_examples=MIN_EXAMPLES, # 노드에 들어갈 최소 샘플 개수, 그 이하일때는 분기하지 않음
        subsample=SUBSAMPLE, # 개별 트리를 훈련하는데 사용하는 데이터셋의 비율
        validation_ratio=VALIDATION_RATIO, # 훈련셋중에 모델 검증을 위해 사용할 데이터 비율, 
        task=tfdf.keras.Task.CLASSIFICATION,
    )

    gbt_model.compile(weighted_metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    return gbt_model

### Train and evaluate the model

In [11]:
# 모델 생성
gbt_model = create_gbt_model()

# 훈련 및 평가 -> 95.8%
run_experiment(gbt_model, train_data, test_data)

Use /tmp/tmpmndwy_mm as temporary training directory


2022-10-24 12:41:04.874891: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-10-24 12:41:04.874934: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: BigBoy
2022-10-24 12:41:04.874942: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: BigBoy
2022-10-24 12:41:04.875070: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.73.5
2022-10-24 12:41:04.875099: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.73.5
2022-10-24 12:41:04.875106: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 510.73.5
2022-10-24 12:41:04.876573: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in perf





Reading training dataset...
Training dataset read in 0:00:08.103885. Found 199523 examples.
Training model...
Model trained in 0:00:31.063814
Compiling model...


[INFO kernel.cc:1176] Loading model from path /tmp/tmpmndwy_mm/model/ with prefix 9159fb4463da486c
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
Test accuracy: 95.8%


### Inspect the model

In [12]:
print(gbt_model.summary())

Model: "gradient_boosted_trees_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (40):
	age
	capital_gains
	capital_losses
	citizenship
	class_of_worker
	country_of_birth_father
	country_of_birth_mother
	country_of_birth_self
	detailed_household_and_family_stat
	detailed_household_summary_in_household
	detailed_industry_recode
	detailed_occupation_recode
	dividends_from_stocks
	education
	enroll_in_edu_inst_last_wk
	family_members_under_18
	fill_inc_questionnaire_for_veteran's_admin
	full_or_part_time_employment_stat
	hispanic_origin
	live_in_this_house_1_year_ago
	major_industry_code
	major_occupation_code
	marital_stat
	member_of_a_labor_union
	migration_code-change_in_msa
	migration_code-

# Experiment 2: Decision Forests with target encoding

### Implement Binary Target Encoder

In [13]:
'''
Target 이 아니라 Categorical feature 를 인코딩함
feature value 에 대한 (positive_frequency, negative_frequency, positive_probability) 로 인코딩
positive_probability = pos_freq / (pos_freq + neg_freq + correction)
'''

class BinaryTargetEncoding(layers.Layer):
    def __init__(self, vocabulary_size=None, correction=1.0, **kwargs):
        super().__init__(**kwargs)
        self.vocabulary_size = vocabulary_size
        self.correction = correction
        
    def adapt(self, data):
        '''
        data is expected to be an integer numpy array to a Tensor shape [num_examples, 2].
        This contains feature falues for a given feature in the dataset, and target values.
        '''
        # convert the data to a tensor
        data = tf.convert_to_tensor(data)
        # Separate the feature values and target values
        feature_values = tf.cast(data[:,0], tf.dtypes.int32)
        target_values = tf.cast(data[:,1], tf.dtypes.bool)
        
        # Compute the vocabulary_size of not specified
        if self.vocabulary_size is None:
            self.vocabulary_size = tf.unique(feature_values).y.shape[0]
            
        # Filter the data where the target label is positive.
        positive_indices = tf.where(condition=target_values)
        positive_feature_values = tf.gather_nd(params=feature_values, indices=positive_indices)
        
        # Compute how many times each feature value occurred with a positive target label.
        positive_frequency = tf.math.unsorted_segment_sum(
            data = tf.ones(shape=(positive_feature_values.shape[0], 1)),
            segment_ids=positive_feature_values,
            num_segments=self.vocabulary_size,
        )
                         
        # Filter the data where the target label is negative.
        negative_indices = tf.where(condition=tf.math.logical_not(target_values))
        negative_feature_values = tf.gather_nd(
            params=feature_values, indices=negative_indices
        )
        # Compute how many times each feature value occurred with a negative target label.
        negative_frequency = tf.math.unsorted_segment_sum(
            data=tf.ones(
                shape=(negative_feature_values.shape[0], 1)),
            segment_ids=negative_feature_values,
            num_segments=self.vocabulary_size,
        )
        # Compute positive probability for the input feature values.
        positive_probability = positive_frequency / (
            positive_frequency + negative_frequency + self.correction
        )
        # Concatenate the computed statistics for traget_encoding.
        target_encoding_statistics = tf.cast(
            tf.concat(
                [positive_frequency, negative_frequency, positive_probability], axis=1
            ),
            dtype=tf.dtypes.float32,
        )
        self.target_encoding_statistics = tf.constant(target_encoding_statistics)
        print('** target_encoding_statics **\n', self.target_encoding_statistics)
        
    def call(self, inputs):
        '''
        inputs is expected to be an integer numpy array to a Tensor shape [num_examples, 1].
        This includes the feature values for a given feature in the dataset.
        '''
        if self.target_encoding_statistics == None:
            raise ValueError(f'You need to call the adapt method to compute target encoding statistics')
            
        # Convert the inputs to a tensor
        inputs = tf.convert_to_tensor(inputs)
        # Cast the inputs int64 a tensor
        inputs = tf.cast(inputs, tf.dtypes.int64)
        # Lookup target encoding statistics for the input feature values.
        target_encoding_statistics = tf.cast(
            tf.gather_nd(self.target_encoding_statistics, inputs), 
            dtype=tf.dtypes.float32)
        return target_encoding_statistics

In [14]:
data = tf.constant(
    [
        [0, 1],
        [2, 0],
        [0, 1],
        [1, 1],
        [1, 1],
        [2, 0],
        [1, 0],
        [0, 1],
        [2, 1],
        [1, 0],
        [0, 1],
        [2, 0],
        [0, 1],
        [1, 1],
        [1, 1],
        [2, 0],
        [1, 0],
        [0, 1],
        [2, 0],
    ]
)

binary_target_encoder = BinaryTargetEncoding()
binary_target_encoder.adapt(data)
print(binary_target_encoder([[0], [1], [2]]))

** target_encoding_statics **
 tf.Tensor(
[[6.         0.         0.85714287]
 [4.         3.         0.5       ]
 [1.         5.         0.14285715]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[6.         0.         0.85714287]
 [4.         3.         0.5       ]
 [1.         5.         0.14285715]], shape=(3, 3), dtype=float32)


### Create model inputs

In [15]:
'''
target_encoder 에 대한 keras Input layer
'''
def create_model_inputs():
    inputs = {}    
    for feature_name in NUMERIC_FEATURE_NAMES:
        inputs[feature_name] = layers.Input(name=feature_name, shape=(), dtype=tf.float32)
    for feature_name in CATEGORICAL_FEATURE_NAMES:
        inputs[feature_name] = layers.Input(name=feature_name, shape=(), dtype=tf.string)
    return inputs

In [16]:
'''
BinaryTargetEncodeing 을 이용해서 feature encoding 하는 keras Model 생성
'''

def create_target_encoder():
    inputs = create_model_inputs()
    target_values = train_data[[TARGET_COLUMN_NAME]].to_numpy()
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            # Get the vocabulary of the categorical feature. Sorted list of unique values
            vocabulary = sorted(
                [str(value) for value in list(train_data[feature_name].unique())]
            )
            print(feature_name, 'vocabulary', vocabulary)
            # Create a lookup to convert string values to an integer indices.
            # Since we are not using a mask token nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and  num_oov_indices to 0.
            lookup = layers.StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
            )
            print('lookup', lookup)
            # Convert the string input values into integer indices.
            value_indices = lookup(inputs[feature_name])
            # Prepare the data to adapt the target encoding.
            print("### Adapting target encoding for:", feature_name)
            feature_values = train_data[[feature_name]].to_numpy().astype(str)
            feature_value_indices = lookup(feature_values)
            data = tf.concat([feature_value_indices, target_values], axis=1)
            feature_encoder = BinaryTargetEncoding()
            feature_encoder.adapt(data)
            # Convert the feature value indices to target encoding representations.
            encoded_feature = feature_encoder(tf.expand_dims(value_indices, -1))
        else:
            # Expand the dimensions of the numerical input feature and use it as-is.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)
        # Add the encoded feature to the list.
        encoded_features.append(encoded_feature)
    # Concatenate all the encoded features.
    encoded_features = tf.concat(encoded_features, axis=1)
    # Create and return a Keras model with encoded features as outputs.
    return keras.Model(inputs=inputs, outputs=encoded_features)

In [17]:
def create_gbt_with_preprocessor(preprocessor):

    gbt_model = tfdf.keras.GradientBoostedTreesModel(
        preprocessing=preprocessor,  # Categorical feature 는 BinaryTargetEncoding 된 입력을 사용함
        num_trees=NUM_TREES,
        max_depth=MAX_DEPTH,
        min_examples=MIN_EXAMPLES,
        subsample=SUBSAMPLE,
        validation_ratio=VALIDATION_RATIO,
        task=tfdf.keras.Task.CLASSIFICATION,
    )

    gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])

    return gbt_model

In [18]:
gbt_model = create_gbt_with_preprocessor(create_target_encoder())
run_experiment(gbt_model, train_data, test_data)

class_of_worker vocabulary [' Federal government', ' Local government', ' Never worked', ' Not in universe', ' Private', ' Self-employed-incorporated', ' Self-employed-not incorporated', ' State government', ' Without pay']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2edc1bec70>
### Adapting target encoding for: class_of_worker
** target_encoding_statics **
 tf.Tensor(
[[5.9700000e+02 2.3280000e+03 2.0403281e-01]
 [8.4700000e+02 6.9370000e+03 1.0879897e-01]
 [2.0000000e+00 4.3700000e+02 4.5454544e-03]
 [9.0400000e+02 9.9341000e+04 9.0178158e-03]
 [7.3220000e+03 6.4706000e+04 1.0165350e-01]
 [1.1340000e+03 2.1310000e+03 3.4721372e-01]
 [1.0900000e+03 7.3550000e+03 1.2905517e-01]
 [4.8500000e+02 3.7420000e+03 1.1471145e-01]
 [1.0000000e+00 1.6400000e+02 6.0240962e-03]], shape=(9, 3), dtype=float32)
detailed_industry_recode vocabulary ['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', 

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2edc155850>
### Adapting target encoding for: detailed_industry_recode
** target_encoding_statics **
 tf.Tensor(
[[9.0600000e+02 9.9778000e+04 8.9983614e-03]
 [4.7000000e+01 7.8000000e+02 5.6763284e-02]
 [0.0000000e+00 4.0000000e+00 0.0000000e+00]
 [3.2400000e+02 1.4400000e+03 1.8356940e-01]
 [2.8100000e+02 1.0690000e+03 2.0799407e-01]
 [2.1200000e+02 6.8700000e+02 2.3555556e-01]
 [6.9000000e+01 2.2600000e+02 2.3310810e-01]
 [1.1500000e+02 3.3700000e+02 2.5386313e-01]
 [1.2100000e+02 4.1800000e+02 2.2407408e-01]
 [1.2000000e+01 1.4500000e+02 7.5949363e-02]
 [3.7000000e+01 4.4600000e+02 7.6446280e-02]
 [9.3000000e+01 1.2530000e+03 6.9042318e-02]
 [1.3100000e+02 2.0650000e+03 5.9626766e-02]
 [1.5000000e+01 1.7000000e+01 4.5454547e-01]
 [2.7000000e+01 5.3200000e+02 4.8214287e-02]
 [3.9000000e+01 9.1300000e+02 4.0923398e-02]
 [9.2000000e+01 4.3300000e+02 1.7490494e-01]
 [1.8000000e+02 1.3230000e+03 1.1968085e-01]
 

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


** target_encoding_statics **
 tf.Tensor(
[[6.20000000e+01 7.49500000e+03 8.20322800e-03]
 [7.00000000e+01 6.80600000e+03 1.01788575e-02]
 [3.40000000e+01 2.09200000e+03 1.59849562e-02]
 [1.30000000e+01 1.78600000e+03 7.22222216e-03]
 [2.20000000e+01 3.25500000e+03 6.71140943e-03]
 [7.20000000e+01 7.93500000e+03 8.99100862e-03]
 [3.80000000e+01 6.19200000e+03 6.09853957e-03]
 [4.12000000e+02 3.95100000e+03 9.44088027e-02]
 [4.13000000e+02 4.94500000e+03 7.70666152e-02]
 [3.91500000e+03 1.59500000e+04 1.97070375e-01]
 [0.00000000e+00 4.74220000e+04 0.00000000e+00]
 [6.57000000e+02 6.06000000e+02 5.19778490e-01]
 [1.87900000e+03 4.65280000e+04 3.88158970e-02]
 [1.00000000e+00 8.18000000e+02 1.21951220e-03]
 [2.03800000e+03 4.50300000e+03 3.11525524e-01]
 [9.69000000e+02 8.24000000e+02 5.40133774e-01]
 [1.78700000e+03 2.60330000e+04 6.42320514e-02]], shape=(17, 3), dtype=float32)
enroll_in_edu_inst_last_wk vocabulary [' College or university', ' High school', ' Not in universe']
lookup <k

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


major_industry_code vocabulary [' Agriculture', ' Armed Forces', ' Business and repair services', ' Communications', ' Construction', ' Education', ' Entertainment', ' Finance insurance and real estate', ' Forestry and fisheries', ' Hospital services', ' Manufacturing-durable goods', ' Manufacturing-nondurable goods', ' Medical except hospital', ' Mining', ' Not in universe or children', ' Other professional services', ' Personal services except private HH', ' Private household services', ' Public administration', ' Retail trade', ' Social services', ' Transportation', ' Utilities and sanitary services', ' Wholesale trade']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2edc0f1340>
### Adapting target encoding for: major_industry_code
** target_encoding_statics **
 tf.Tensor(
[[1.7800000e+02 2.8450000e+03 5.8862433e-02]
 [8.0000000e+00 2.8000000e+01 2.1621622e-01]
 [6.1900000e+02 5.0320000e+03 1.0951875e-01]
 [2.7000000e+02 9.1100000e+02 2.2842640e-01]
 [5.

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


** target_encoding_statics **
 tf.Tensor(
[[4.4600000e+02 1.4391000e+04 3.0057959e-02]
 [8.0000000e+00 2.8000000e+01 2.1621622e-01]
 [3.5930000e+03 8.9020000e+03 2.8753200e-01]
 [1.6000000e+02 2.9860000e+03 5.0842073e-02]
 [7.9000000e+01 4.0480000e+03 1.9137597e-02]
 [2.3500000e+02 6.1440000e+03 3.6833856e-02]
 [9.0600000e+02 9.9778000e+04 8.9983614e-03]
 [1.2000000e+02 1.1979000e+04 9.9173551e-03]
 [9.5500000e+02 9.5630000e+03 9.0788096e-02]
 [2.0000000e+00 7.7800000e+02 2.5608195e-03]
 [3.4750000e+03 1.0465000e+04 2.4926476e-01]
 [2.5900000e+02 1.4020000e+03 1.5583634e-01]
 [1.5240000e+03 1.0259000e+04 1.2932791e-01]
 [3.6100000e+02 2.6570000e+03 1.1957602e-01]
 [2.5900000e+02 3.7610000e+03 6.4411841e-02]], shape=(15, 3), dtype=float32)
race vocabulary [' Amer Indian Aleut or Eskimo', ' Asian or Pacific Islander', ' Black', ' Other', ' White']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2ecc59c280>
### Adapting target encoding for: race
** target_encod

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


** target_encoding_statics **
 tf.Tensor(
[[2.4000000e+01 5.7400000e+02 4.0066779e-02]
 [2.2000000e+01 9.5400000e+02 2.2517912e-02]
 [2.0000000e+00 4.3700000e+02 4.5454544e-03]
 [1.2212000e+04 1.8124100e+05 6.3126117e-02]
 [9.9000000e+01 1.9390000e+03 4.8553213e-02]
 [2.3000000e+01 1.9960000e+03 1.1386138e-02]], shape=(6, 3), dtype=float32)
full_or_part_time_employment_stat vocabulary [' Children or Armed Forces', ' Full-time schedules', ' Not in labor force', ' PT for econ reasons usually FT', ' PT for econ reasons usually PT', ' PT for non-econ reasons usually FT', ' Unemployed full-time', ' Unemployed part- time']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2ecc5e7df0>
### Adapting target encoding for: full_or_part_time_employment_stat
** target_encoding_statics **
 tf.Tensor(
[[5.8740000e+03 1.1789500e+05 4.7458995e-02]
 [5.3660000e+03 3.5370000e+04 1.3172300e-01]
 [4.6200000e+02 2.6346000e+04 1.7233018e-02]
 [3.1000000e+01 4.9400000e+02 5.8935363e-0

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


region_of_previous_residence vocabulary [' Abroad', ' Midwest', ' Northeast', ' Not in universe', ' South', ' West']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2edc229eb0>
### Adapting target encoding for: region_of_previous_residence
** target_encoding_statics **
 tf.Tensor(
[[1.8000000e+01 5.1200000e+02 3.3898305e-02]
 [1.0900000e+02 3.4660000e+03 3.0480985e-02]
 [1.4700000e+02 2.5580000e+03 5.4323725e-02]
 [1.1764000e+04 1.7198600e+05 6.4021423e-02]
 [1.8400000e+02 4.7050000e+03 3.7627812e-02]
 [1.6000000e+02 3.9140000e+03 3.9263804e-02]], shape=(6, 3), dtype=float32)
state_of_previous_residence vocabulary [' ?', ' Abroad', ' Alabama', ' Alaska', ' Arizona', ' Arkansas', ' California', ' Colorado', ' Connecticut', ' Delaware', ' District of Columbia', ' Florida', ' Georgia', ' Idaho', ' Illinois', ' Indiana', ' Iowa', ' Kansas', ' Kentucky', ' Louisiana', ' Maine', ' Maryland', ' Massachusetts', ' Michigan', ' Minnesota', ' Mississippi', ' Missouri',

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


** target_encoding_statics **
 tf.Tensor(
[[2.3000000e+01 9.9000000e+02 2.2682445e-02]
 [1.2000000e+01 6.5900000e+02 1.7857144e-02]
 [8.7000000e+01 1.1943000e+04 7.2313193e-03]
 [2.0000000e+00 5.8700000e+02 3.3898305e-03]
 [2.0000000e+00 1.2400000e+02 1.5748031e-02]
 [0.0000000e+00 9.0000000e+00 0.0000000e+00]
 [0.0000000e+00 3.6000000e+01 0.0000000e+00]
 [0.0000000e+00 8.0000000e+01 0.0000000e+00]
 [2.0000000e+00 5.0324000e+04 3.9740098e-05]
 [0.0000000e+00 2.0000000e+00 0.0000000e+00]
 [0.0000000e+00 7.3200000e+02 0.0000000e+00]
 [0.0000000e+00 9.0000000e+00 0.0000000e+00]
 [1.0000000e+00 3.3000000e+01 2.8571429e-02]
 [0.0000000e+00 6.0000000e+00 0.0000000e+00]
 [2.0000000e+00 3.7300000e+02 5.3191488e-03]
 [0.0000000e+00 1.0000000e+01 0.0000000e+00]
 [0.0000000e+00 2.0000000e+00 0.0000000e+00]
 [0.0000000e+00 2.0000000e+00 0.0000000e+00]
 [0.0000000e+00 1.8680000e+03 0.0000000e+00]
 [0.0000000e+00 1.0660000e+03 0.0000000e+00]
 [7.8500000e+03 4.5398000e+04 1.4742061e-01]
 [2.0000000e+

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


migration_code-change_in_reg vocabulary [' ?', ' Abroad', ' Different county same state', ' Different division same region', ' Different region', ' Different state same division', ' Nonmover', ' Not in universe', ' Same county']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2ecc71f700>
### Adapting target encoding for: migration_code-change_in_reg
** target_encoding_statics **
 tf.Tensor(
[[6.5430000e+03 9.3153000e+04 6.5628856e-02]
 [1.8000000e+01 5.1200000e+02 3.3898305e-02]
 [1.2700000e+02 2.6700000e+03 4.5389563e-02]
 [2.5000000e+01 4.4000000e+02 5.3648070e-02]
 [7.5000000e+01 1.1030000e+03 6.3613228e-02]
 [4.5000000e+01 9.4600000e+02 4.5362905e-02]
 [5.2210000e+03 7.7317000e+04 6.3254945e-02]
 [0.0000000e+00 1.5160000e+03 0.0000000e+00]
 [3.2800000e+02 9.4840000e+03 3.3425048e-02]], shape=(9, 3), dtype=float32)
migration_code-move_within_reg vocabulary [' ?', ' Abroad', ' Different county same state', ' Different state in Midwest', ' Different state i

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


### Adapting target encoding for: live_in_this_house_1_year_ago
** target_encoding_statics **
 tf.Tensor(
[[6.1800000e+02 1.5155000e+04 3.9178394e-02]
 [6.5430000e+03 9.4669000e+04 6.4645849e-02]
 [5.2210000e+03 7.7317000e+04 6.3254945e-02]], shape=(3, 3), dtype=float32)
migration_prev_res_in_sunbelt vocabulary [' ?', ' No', ' Not in universe', ' Yes']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2edc245340>
### Adapting target encoding for: migration_prev_res_in_sunbelt
** target_encoding_statics **
 tf.Tensor(
[[6.5430000e+03 9.3153000e+04 6.5628856e-02]
 [4.2400000e+02 9.5630000e+03 4.2450942e-02]
 [5.2210000e+03 7.8833000e+04 6.2114093e-02]
 [1.9400000e+02 5.5920000e+03 3.3523414e-02]], shape=(4, 3), dtype=float32)
family_members_under_18 vocabulary [' Both parents present', ' Father only present', ' Mother only present', ' Neither parent present', ' Not in universe']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2ecc451a

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2edc0af400>
### Adapting target encoding for: country_of_birth_self
** target_encoding_statics **
 tf.Tensor(
[[2.9700000e+02 3.0960000e+03 8.7507367e-02]
 [1.0000000e+00 9.4000000e+01 1.0416667e-02]
 [6.9000000e+01 6.3100000e+02 9.8430812e-02]
 [3.5000000e+01 4.4300000e+02 7.3068894e-02]
 [9.0000000e+00 4.2500000e+02 2.0689655e-02]
 [6.0000000e+01 7.7700000e+02 7.1599044e-02]
 [1.1000000e+01 6.7900000e+02 1.5918959e-02]
 [9.0000000e+00 2.4900000e+02 3.4749035e-02]
 [1.3000000e+01 6.7600000e+02 1.8840579e-02]
 [5.8000000e+01 3.9900000e+02 1.2663755e-01]
 [1.5000000e+01 1.0600000e+02 1.2295082e-01]
 [7.2000000e+01 7.7900000e+02 8.4507041e-02]
 [1.5000000e+01 1.3200000e+02 1.0135135e-01]
 [3.0000000e+00 3.4100000e+02 8.6956518e-03]
 [8.0000000e+00 2.2000000e+02 3.4934498e-02]
 [5.0000000e+00 1.8000000e+01 2.0833333e-01]
 [2.0000000e+00 1.4200000e+02 1.3793103e-02]
 [1.0000000e+01 9.0000000e+01 9.9009901e-02]
 [8.

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())


own_business_or_self_employed vocabulary ['0', '1', '2']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2edc15b520>
### Adapting target encoding for: own_business_or_self_employed
** target_encoding_statics **
 tf.Tensor(
[[1.0452000e+04 1.7022000e+05 5.7850372e-02]
 [6.0900000e+02 2.0890000e+03 2.2563912e-01]
 [1.3210000e+03 1.4832000e+04 8.1775412e-02]], shape=(3, 3), dtype=float32)
fill_inc_questionnaire_for_veteran's_admin vocabulary [' No', ' Not in universe', ' Yes']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f2ecc48f6a0>
### Adapting target encoding for: fill_inc_questionnaire_for_veteran's_admin
** target_encoding_statics **
 tf.Tensor(
[[2.1500000e+02 1.3780000e+03 1.3488080e-01]
 [1.2151000e+04 1.8538800e+05 6.1511591e-02]
 [1.6000000e+01 3.7500000e+02 4.0816326e-02]], shape=(3, 3), dtype=float32)
veterans_benefits vocabulary ['0', '1', '2']
lookup <keras.layers.preprocessing.string_lookup.StringLookup object at 0x7

  return bool(asarray(a1 == a2).all())
  return bool(asarray(a1 == a2).all())
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)






Reading training dataset...
Training dataset read in 0:00:06.694637. Found 199523 examples.
Training model...
Model trained in 0:02:59.516634
Compiling model...


[INFO kernel.cc:1176] Loading model from path /tmp/tmpcse795sf/model/ with prefix d6785a542e5b49ad
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine


Model compiled.




Test accuracy: 95.81%


# Experiment 3: Decision Forests with trained embeddings

In [19]:
'''
Categorical feature 를 임베딩
feature(str) --> value_index --> embedding 
'''

def create_embedding_encoder(size=None, hidden_layers=1):
    inputs = create_model_inputs()
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            # Get the vocabulary of the categorical feature.
            vocabulary = sorted(
                [str(value) for value in list(train_data[feature_name].unique())]
            )
            # Create a lookup to convert string values to an integer indices.
            # Since we are not using a mask token nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and  num_oov_indices to 0.
            lookup = layers.StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
            )
            # Convert the string input values into integer indices.
            value_index = lookup(inputs[feature_name])
            # Create an embedding layer with the specified dimensions
            vocabulary_size = len(vocabulary)
            embedding_size = int(math.sqrt(vocabulary_size))
            feature_encoder = layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_size
            )
            # Convert the index values to embedding representations.
            encoded_feature = feature_encoder(value_index)
        else:
            # Expand the dimensions of the numerical input feature and use it as-is.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)
        # Add the encoded feature to the list.
        encoded_features.append(encoded_feature)
    # Concatenate all the encoded features.
    encoded_features = layers.concatenate(encoded_features, axis=1)
    for _ in range(hidden_layers):
        # Apply dropout.
        encoded_features = layers.Dropout(rate=0.25)(encoded_features)
        # Perform non-linearity projection.
        encoded_features = layers.Dense(
            units=size if size else encoded_features.shape[-1], activation="gelu"
        )(encoded_features)
    # Create and return a Keras model with encoded features as outputs.
    return keras.Model(inputs=inputs, outputs=encoded_features)

In [20]:
'''
embedding 을 훈련시키기 위한 간단한 Neural Net Classifier
'''

def create_nn_model(encoder):
    inputs = create_model_inputs()
    embeddings = encoder(inputs)
    output = layers.Dense(units=1, activation="sigmoid")(embeddings)

    nn_model = keras.Model(inputs=inputs, outputs=output)
    nn_model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy("accuracy")],
    )
    return nn_model


embedding_encoder = create_embedding_encoder(size=64, hidden_layers=1)
run_experiment(
    create_nn_model(embedding_encoder),
    train_data,
    test_data,
    num_epochs=5,
    batch_size=256,
)

  return bool(asarray(a1 == a2).all())
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Test accuracy: 94.94%


In [21]:
gbt_model = create_gbt_with_preprocessor(embedding_encoder)
run_experiment(gbt_model, train_data, test_data)

Use /tmp/tmpp4uiw7ix as temporary training directory


  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)






Reading training dataset...
Training dataset read in 0:00:06.206115. Found 199523 examples.
Training model...
Model trained in 0:02:52.849531
Compiling model...


[INFO kernel.cc:1176] Loading model from path /tmp/tmpp4uiw7ix/model/ with prefix 94e578a55c5a4584
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine


Model compiled.




Test accuracy: 95.32%


## Better Neural Net works?

In [22]:
'''
hidden layer 추가
'''
embedding_encoder = create_embedding_encoder(size=64, hidden_layers=2)
run_experiment(
    create_nn_model(embedding_encoder),
    train_data,
    test_data,
    num_epochs=5,
    batch_size=256,
)

  return bool(asarray(a1 == a2).all())
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Test accuracy: 94.29%


In [23]:
gbt_model = create_gbt_with_preprocessor(embedding_encoder)
run_experiment(gbt_model, train_data, test_data)

Use /tmp/tmpfm95cg00 as temporary training directory


  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)
  features_dataframe = dataframe.drop(label, 1)
  features_dataframe = features_dataframe.drop(weight, 1)






Reading training dataset...
Training dataset read in 0:00:05.927508. Found 199523 examples.
Training model...
Model trained in 0:02:30.253590
Compiling model...


[INFO kernel.cc:1176] Loading model from path /tmp/tmpfm95cg00/model/ with prefix a9cbca79e8ce43c3
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine


Model compiled.




Test accuracy: 95.28%
