# Data sampling and feature engineering

In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import PolynomialFeatures

In [26]:
# Enable eager execution so the tensors and datasets built below can be
# evaluated and printed immediately.
tf.enable_eager_execution()

In [27]:
# Source dataframe: three synthetic features, 10 uniform samples in [0, 1) each.
# The global NumPy RNG is seeded first so that a fresh Restart & Run All
# reproduces the same values; later np.random draws in the notebook continue
# from this seeded state.
np.random.seed(42)

df = pd.DataFrame(
    data={
        'feature1': np.random.rand(10),
        'feature2': np.random.rand(10),
        'feature3': np.random.rand(10),
    }
)

df

Unnamed: 0,feature1,feature2,feature3
0,0.827965,0.231482,0.266384
1,0.788115,0.019544,0.950652
2,0.632887,0.999653,0.958807
3,0.248602,0.791476,0.380484
4,0.035784,0.571207,0.721888
5,0.694578,0.264295,0.688943
6,0.357552,0.381085,0.85805
7,0.560881,0.572887,0.017695
8,0.855202,0.770541,0.256315
9,0.620974,0.48149,0.096609


In [28]:
# Moving average of feature1 over a 2-row window; min_periods=1 lets the
# first row (which has no predecessor) keep its own value instead of NaN.
feature1_window = df['feature1'].rolling(window=2, min_periods=1)
df['feature1_rolling'] = feature1_window.mean()
df

Unnamed: 0,feature1,feature2,feature3,feature1_rolling
0,0.827965,0.231482,0.266384,0.827965
1,0.788115,0.019544,0.950652,0.80804
2,0.632887,0.999653,0.958807,0.710501
3,0.248602,0.791476,0.380484,0.440744
4,0.035784,0.571207,0.721888,0.142193
5,0.694578,0.264295,0.688943,0.365181
6,0.357552,0.381085,0.85805,0.526065
7,0.560881,0.572887,0.017695,0.459216
8,0.855202,0.770541,0.256315,0.708042
9,0.620974,0.48149,0.096609,0.738088


In [29]:
# Target variable: a random integer label drawn from {0, 1, 2} for each of the 10 rows
df['target'] = np.random.randint(low=0, high=3, size=10)
df

Unnamed: 0,feature1,feature2,feature3,feature1_rolling,target
0,0.827965,0.231482,0.266384,0.827965,0
1,0.788115,0.019544,0.950652,0.80804,1
2,0.632887,0.999653,0.958807,0.710501,0
3,0.248602,0.791476,0.380484,0.440744,2
4,0.035784,0.571207,0.721888,0.142193,0
5,0.694578,0.264295,0.688943,0.365181,0
6,0.357552,0.381085,0.85805,0.526065,1
7,0.560881,0.572887,0.017695,0.459216,2
8,0.855202,0.770541,0.256315,0.708042,2
9,0.620974,0.48149,0.096609,0.738088,0


In [30]:
# Convert to TF dataset: one (feature vector, label) pair per dataframe row
features = ['feature1', 'feature2', 'feature3', 'feature1_rolling']

feature_matrix = tf.cast(df[features].values, tf.float32)
label_vector = tf.cast(df['target'].values, tf.int32)
training_dataset = tf.data.Dataset.from_tensor_slices((feature_matrix, label_vector))

training_dataset

<DatasetV1Adapter shapes: ((4,), ()), types: (tf.float32, tf.int32)>

In [31]:
# Output: walk the dataset once and print every (features, label) pair
iterator = training_dataset.make_one_shot_iterator()
for x, y in iterator:
    print(x, y)

tf.Tensor([0.8279646  0.23148249 0.26638433 0.8279646 ], shape=(4,), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor([0.7881145  0.01954356 0.95065194 0.80803955], shape=(4,), dtype=float32) tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([0.6328872  0.99965334 0.95880693 0.71050084], shape=(4,), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor([0.24860156 0.7914758  0.38048422 0.44074437], shape=(4,), dtype=float32) tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor([0.03578405 0.5712067  0.7218877  0.14219281], shape=(4,), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor([0.69457835 0.26429522 0.6889429  0.3651812 ], shape=(4,), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor([0.35755193 0.3810849  0.8580497  0.5260651 ], shape=(4,), dtype=float32) tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([0.560881   0.5728865  0.01769461 0.45921648], shape=(4,), dtype=float32) tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor([0.8552021  0.77054065 0.25631

In [32]:
# Simple feature column definition: a plain numeric column keyed on 'feature1',
# fed from a one-entry feature dict that the following cells reuse.
simple_feature_column = tf.feature_column.numeric_column(key='feature1')
feature1 = dict(feature1=tf.convert_to_tensor(df['feature1']))
net = tf.feature_column.input_layer(
    features=feature1,
    feature_columns=simple_feature_column,
)
print(net)

tf.Tensor(
[[0.8279646 ]
 [0.7881145 ]
 [0.6328872 ]
 [0.24860156]
 [0.03578405]
 [0.69457835]
 [0.35755193]
 [0.560881  ]
 [0.8552021 ]
 [0.6209743 ]], shape=(10, 1), dtype=float32)


In [33]:
# Bucketing: one-hot encode feature1 into the 5 ranges delimited by 4 boundaries
bucket_boundaries = [0.2, 0.4, 0.6, 0.8]
bucketized_feature_column = tf.feature_column.bucketized_column(
    source_column=simple_feature_column,
    boundaries=bucket_boundaries,
)
bucketized = tf.feature_column.input_layer(feature1, bucketized_feature_column)
print(bucketized)

tf.Tensor(
[[0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]], shape=(10, 5), dtype=float32)


In [34]:
# Scaling: standardize feature1 with its own mean and standard deviation
val_mean = df['feature1'].mean()
val_std = df['feature1'].std()


def scaler(x):
    """Return x cast to float32, shifted by val_mean and divided by val_std."""
    centered = tf.cast(x, dtype=tf.float32) - val_mean
    return centered / val_std


scaled_feature_column = tf.feature_column.numeric_column(key='feature1', normalizer_fn=scaler)
net = tf.feature_column.input_layer(feature1, scaled_feature_column)
print(net)

tf.Tensor(
[[ 0.988087  ]
 [ 0.83989805]
 [ 0.26266083]
 [-1.166366  ]
 [-1.9577614 ]
 [ 0.49206915]
 [-0.76121676]
 [-0.00510547]
 [ 1.0893738 ]
 [ 0.2183609 ]], shape=(10, 1), dtype=float32)


In [35]:
# Log transformation (as normalization)
def transformer(x):
    """Return the natural logarithm of x as a float32 tensor."""
    log_values = tf.log(x)
    return tf.cast(log_values, dtype=tf.float32)


log_feature_column = tf.feature_column.numeric_column(key='feature1', normalizer_fn=transformer)
net = tf.feature_column.input_layer(feature1, log_feature_column)
print(net)

tf.Tensor(
[[-0.1887849 ]
 [-0.23811188]
 [-0.4574631 ]
 [-1.3919039 ]
 [-3.330253  ]
 [-0.3644503 ]
 [-1.0284747 ]
 [-0.5782464 ]
 [-0.15641752]
 [-0.4764656 ]], shape=(10, 1), dtype=float32)


In [36]:
# Polynomial features: expand feature1 into its degree-0, 1 and 2 terms.
# Note that the keys 'feature0'..'feature2' below name the polynomial degree,
# not the original dataframe columns.
poly = PolynomialFeatures(2)
poly_features = poly.fit_transform(df['feature1'].values.reshape(-1, 1))

poly_feature_column0 = tf.feature_column.numeric_column('feature0')
poly_feature_column1 = tf.feature_column.numeric_column('feature1')
poly_feature_column2 = tf.feature_column.numeric_column('feature2')
poly_columns = [poly_feature_column0, poly_feature_column1, poly_feature_column2]

# One tensor per polynomial degree, taken from the matching output column.
poly_feature = {
    f'feature{degree}': tf.convert_to_tensor(poly_features[:, degree])
    for degree in range(3)
}
net = tf.feature_column.input_layer(poly_feature, poly_columns)
print(net)

tf.Tensor(
[[1.         0.8279646  0.68552536]
 [1.         0.7881145  0.6211245 ]
 [1.         0.6328872  0.4005462 ]
 [1.         0.24860156 0.06180274]
 [1.         0.03578405 0.0012805 ]
 [1.         0.69457835 0.48243907]
 [1.         0.35755193 0.12784338]
 [1.         0.560881   0.31458756]
 [1.         0.8552021  0.73137057]
 [1.         0.6209743  0.38560906]], shape=(10, 3), dtype=float32)


In [37]:
# Add the base-10 logarithm of feature1 as a plain dataframe column
# (the `transformer` feature-column example uses tf.log instead).
df = df.assign(feature1_log = np.log10(df['feature1']))
df

Unnamed: 0,feature1,feature2,feature3,feature1_rolling,target,feature1_log
0,0.827965,0.231482,0.266384,0.827965,0,-0.081988
1,0.788115,0.019544,0.950652,0.80804,1,-0.103411
2,0.632887,0.999653,0.958807,0.710501,0,-0.198674
3,0.248602,0.791476,0.380484,0.440744,2,-0.604496
4,0.035784,0.571207,0.721888,0.142193,0,-1.44631
5,0.694578,0.264295,0.688943,0.365181,0,-0.158279
6,0.357552,0.381085,0.85805,0.526065,1,-0.446661
7,0.560881,0.572887,0.017695,0.459216,2,-0.251129
8,0.855202,0.770541,0.256315,0.708042,2,-0.067931
9,0.620974,0.48149,0.096609,0.738088,0,-0.206926


In [38]:
# Wrap the current `net` tensor (the polynomial features at this point)
# in a dataset and print it one row at a time.
dataset = tf.data.Dataset.from_tensor_slices(net)
iterator = dataset.make_one_shot_iterator()
for x in iterator:
    print(x)

tf.Tensor([1.         0.8279646  0.68552536], shape=(3,), dtype=float32)
tf.Tensor([1.        0.7881145 0.6211245], shape=(3,), dtype=float32)
tf.Tensor([1.        0.6328872 0.4005462], shape=(3,), dtype=float32)
tf.Tensor([1.         0.24860156 0.06180274], shape=(3,), dtype=float32)
tf.Tensor([1.         0.03578405 0.0012805 ], shape=(3,), dtype=float32)
tf.Tensor([1.         0.69457835 0.48243907], shape=(3,), dtype=float32)
tf.Tensor([1.         0.35755193 0.12784338], shape=(3,), dtype=float32)
tf.Tensor([1.         0.560881   0.31458756], shape=(3,), dtype=float32)
tf.Tensor([1.         0.8552021  0.73137057], shape=(3,), dtype=float32)
tf.Tensor([1.         0.6209743  0.38560906], shape=(3,), dtype=float32)


In [39]:
# Feature crosses example
def make_crossed_example(latitude, longitude, labels):
    """Build a dataset of (features, label) pairs from flattened inputs.

    Each argument must provide a ``flatten()`` method. The features element
    is a dict with the keys 'latitude' and 'longitude'.
    """
    flat_features = dict(latitude=latitude.flatten(), longitude=longitude.flatten())
    flat_labels = labels.flatten()
    return tf.data.Dataset.from_tensor_slices((flat_features, flat_labels))

In [40]:
# Data input for estimator
def get_input_fn(dataset, num_epochs=None, n_batch=128, shuffle=True):
    """Return a ``pandas_input_fn`` built from ``dataset``.

    The feature columns and the label column are looked up through the
    module-level ``FEATURES`` and ``LABEL`` names at call time.

    Args:
        dataset: dataframe-like object indexable by column name.
        num_epochs: passed through to ``pandas_input_fn``.
        n_batch: batch size passed through as ``batch_size``.
        shuffle: passed through to ``pandas_input_fn``.
    """
    feature_frame = pd.DataFrame({name: dataset[name].values for name in FEATURES})
    label_series = pd.Series(dataset[LABEL].values)
    return tf.estimator.inputs.pandas_input_fn(
        x=feature_frame,
        y=label_series,
        batch_size=n_batch,
        num_epochs=num_epochs,
        shuffle=shuffle,
    )

In [41]:
def linear_regressor(df=df):
    """Train and evaluate a LinearRegressor on ``df`` and print the evaluation loss.

    One numeric feature column is created per entry of the module-level
    ``FEATURES`` list; inputs are produced by ``get_input_fn``. Checkpoints
    are written to the ``train`` directory, which is cleared first when it
    already exists.

    Args:
        df: dataframe to train and evaluate on. The default is the ``df``
            object that existed when this function was defined, not the
            value of ``df`` at call time.
    """
    model_dir = "train"
    feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]

    estimator = tf.estimator.LinearRegressor(feature_columns=feature_cols, model_dir=model_dir)

    if tf.gfile.Exists(model_dir):
        try:
            tf.gfile.DeleteRecursively(model_dir)
        except Exception as err:
            # Cleanup stays best-effort, but the failure is reported instead of
            # being hidden, and KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f'Warning: could not delete {model_dir!r}: {err}')

    # Train on small, unshuffled batches for a fixed number of steps ...
    estimator.train(input_fn=get_input_fn(df, num_epochs=None, n_batch=2, shuffle=False), steps=1000)
    # ... then evaluate with a single pass over the same data.
    ev = estimator.evaluate(input_fn=get_input_fn(df, num_epochs=1, n_batch=128, shuffle=False))

    print(f'Loss: {ev["loss"]}')

In [42]:
# Linear regression with one source feature.
# FEATURES and LABEL are module-level names read by get_input_fn and
# linear_regressor when they are called.
FEATURES = ['feature1']
LABEL = 'target'

linear_regressor()

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'train', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a31a78dd8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.


In [43]:
# Linear regression with 3 source features.
# Rebinding FEATURES changes which columns the next linear_regressor() call uses.
FEATURES = ['feature1', 'feature2', 'feature3']
LABEL = 'target'

linear_regressor()

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'train', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a31675f28>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.


In [44]:
# Linear regression with all 3 source features, the rolling-mean feature,
# and the log10 feature.
FEATURES = ['feature1', 'feature2', 'feature3', 'feature1_rolling', 'feature1_log']
LABEL = 'target'

linear_regressor()

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'train', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2ff938d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.


In [45]:
# Retry with polynomial features: degree-2 expansion of the three source features
poly_source_columns = ['feature1', 'feature2', 'feature3']
p = PolynomialFeatures(2).fit(df[poly_source_columns])
df_poly = pd.DataFrame(
    p.transform(df[poly_source_columns]),
    # Name the outputs after the columns that were actually fitted, instead of
    # relying on them happening to be the first three columns of df.
    columns=p.get_feature_names(poly_source_columns),
)
df_poly['target'] = df['target']
# Turn names such as 'feature1 feature2' and 'feature1^2' into underscore form.
# regex=False makes '^' a literal character rather than a regex
# start-of-string anchor, independent of the pandas default for `regex`.
df_poly.columns = (
    df_poly.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('^', '_', regex=False)
)
df_poly

Unnamed: 0,1,feature1,feature2,feature3,feature1_2,feature1_feature2,feature1_feature3,feature2_2,feature2_feature3,feature3_2,target
0,1.0,0.827965,0.231482,0.266384,0.685525,0.191659,0.220557,0.053584,0.061663,0.070961,0
1,1.0,0.788115,0.019544,0.950652,0.621124,0.015403,0.749223,0.000382,0.018579,0.903739,1
2,1.0,0.632887,0.999653,0.958807,0.400546,0.632668,0.606817,0.999307,0.958475,0.919311,0
3,1.0,0.248602,0.791476,0.380484,0.061803,0.196762,0.094589,0.626434,0.301144,0.144768,2
4,1.0,0.035784,0.571207,0.721888,0.00128,0.02044,0.025832,0.326277,0.412347,0.521122,0
5,1.0,0.694578,0.264295,0.688943,0.482439,0.183574,0.478525,0.069852,0.182084,0.474642,0
6,1.0,0.357552,0.381085,0.85805,0.127843,0.136258,0.306797,0.145226,0.32699,0.736249,1
7,1.0,0.560881,0.572887,0.017695,0.314588,0.321321,0.009925,0.328199,0.010137,0.000313,2
8,1.0,0.855202,0.770541,0.256315,0.731371,0.658968,0.219201,0.593733,0.197501,0.065697,2
9,1.0,0.620974,0.48149,0.096609,0.385609,0.298993,0.059992,0.231833,0.046516,0.009333,0


In [46]:
# Use every column of df_poly except the last one as a model input
FEATURES = df_poly.columns[:-1].tolist()

linear_regressor(df_poly)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'train', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2fd588d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
