In [1]:
import matplotlib as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
data_folder = 'data/'
dataset= pd.read_csv(data_folder+'data.csv')

# I. Helping Steps to get through the implementation
### - Reading the dataset
### - Visualizing Uncleaned Data from the dataset.
### - Acknowledging the Labels
### - Acknowledging the Features

### - Cleaning the data
### - Building the Pipeline to clean the data from the dataset.
### - Visualizing cleaned data from the dataset.

## Features/Labels
### - labels | 1/0
### - 5 numerical features
### - 8 categorical features

In [3]:
dataset.head(50)

Unnamed: 0,age,workSector,education,educationNum,statusMarriage,career,relationship,race,sex,gainedCapital,lostCapital,hoursPerWeek,country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


# II. Data Preprocessing

In [4]:
# Creating the DataFrame for the preprocessed cleaned Dataset
cleaned_dataset = pd.DataFrame(columns=dataset.columns)

In [5]:
# Categorizing Features 
numerical_features = ['age','educationNum','gainedCapital','lostCapital','hoursPerWeek']
categorical_features = ['workSector','education','statusMarriage','career','relationship','race','sex','country']
label = ['income']
label_encoders = {}

In [6]:
def updating_series(dataset,feature,updated_feature):
    dataset[feature] = updated_feature

# Cleaning Categorical Features
[updating_series(dataset, feature, dataset[feature].apply(str.strip)) for feature in categorical_features]
dataset.head()

Unnamed: 0,age,workSector,education,educationNum,statusMarriage,career,relationship,race,sex,gainedCapital,lostCapital,hoursPerWeek,country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Repopulating cleaned dataset
for feature in dataset.columns:
    if feature in numerical_features:
        cleaned_dataset[feature] = dataset[feature].astype(float)
        
# Transforming categorical features to one hot encoded
for feature in dataset.columns:
    if feature in categorical_features+label:
        label_encoders[feature] = LabelEncoder().fit(dataset[feature].astype(str))

# Repopulating cleaned dataset with the one hot encoded features
for feature in label_encoders.keys():
    cleaned_dataset[feature] = label_encoders[feature].transform(dataset[feature].astype(str))

In [8]:
# Visualizing Dataset
cleaned_dataset.head(50)

Unnamed: 0,age,workSector,education,educationNum,statusMarriage,career,relationship,race,sex,gainedCapital,lostCapital,hoursPerWeek,country,income
0,39.0,5,9,13.0,4,0,1,4,1,2174.0,0.0,40.0,38,0
1,50.0,4,9,13.0,2,3,0,4,1,0.0,0.0,13.0,38,0
2,38.0,2,11,9.0,0,5,1,4,1,0.0,0.0,40.0,38,0
3,53.0,2,1,7.0,2,5,0,2,1,0.0,0.0,40.0,38,0
4,28.0,2,9,13.0,2,9,5,2,0,0.0,0.0,40.0,4,0
5,37.0,2,12,14.0,2,3,5,4,0,0.0,0.0,40.0,38,0
6,49.0,2,6,5.0,3,7,1,2,0,0.0,0.0,16.0,22,0
7,52.0,4,11,9.0,2,3,0,4,1,0.0,0.0,45.0,38,1
8,31.0,2,12,14.0,4,9,1,4,0,14084.0,0.0,50.0,38,1
9,42.0,2,9,13.0,2,3,0,4,1,5178.0,0.0,40.0,38,1


In [9]:
# Removing outliers to enhance the dataset's features correlation importance during the training 
# part of the model's creation pipeline
    
print('Removing Outliers')

clf = IsolationForest(max_samples = 100, random_state = 0)
clf.fit(cleaned_dataset)
y_noano = clf.predict(cleaned_dataset)
y_noano = pd.DataFrame(y_noano, columns = ['Top'])
y_noano[y_noano['Top'] == 1].index.values

cleaned_dataset = cleaned_dataset.iloc[y_noano[y_noano['Top'] == 1].index.values]
cleaned_dataset.reset_index(drop = True, inplace = True)
print("Number of Outliers: {}".format(y_noano[y_noano['Top'] == -1].shape[0]))
print("Number of rows without outliers: {}".format(cleaned_dataset.shape[0]))

Removing Outliers
Number of Outliers: 8474
Number of rows without outliers: 36748


In [10]:
# Creating Features and Label Datasets
Y = cleaned_dataset.pop(label[0])
X = cleaned_dataset

In [11]:
# Creating Train, Val, Test Datasets
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(X, Y, test_size=0.30)

X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.30)

In [12]:
# Saving data due to outliers removal from the datasets

print('Saving Train/Val/Test Datasets')

X_train.to_csv(data_folder+'X_train.csv',index=False,header=1)
X_val.to_csv(data_folder+'X_val.csv',index=False,header=1)
X_test.to_csv(data_folder+'X_test.csv',index=False,header=1)

Y_train.to_csv(data_folder+'Y_train.csv',index=False,header=1)
Y_val.to_csv(data_folder+'Y_val.csv',index=False,header=1)
Y_test.to_csv(data_folder+'Y_test.csv',index=False,header=1)

Saving Train/Val/Test Datasets


# III. Implementing the classification model 
## Tensorflow implementation due to feature layer transformations 

In [13]:
#Creating Feature Columns with Tensorflow Objects
def create_feature_columns(encoders):

    numeric_columns = {feature: tf.feature_column.numeric_column(feature) for feature in numerical_features}

    categorical_columns_with_vocabulary_list = {item: tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(key=item, vocabulary_list=encoders[item].classes_))
        for item in categorical_features}

    feature_columns = {}
    feature_columns.update(numeric_columns)
    feature_columns.update(categorical_columns_with_vocabulary_list)


    return feature_columns


# Creating the input function for the training of the model with Tensorflow
def build_input_fn_train():

    data_train = pd.read_csv(data_folder+'X_train.csv')
    data_train_label = pd.read_csv(data_folder+'Y_train.csv')
    
    [updating_series(data_train,feature,data_train[feature].astype(str)) for feature in categorical_features]
    
    return tf.estimator.inputs.pandas_input_fn(
        x= data_train,
        y= data_train_label.income,
        batch_size=128,
        num_epochs=50,
        shuffle=True,
        num_threads=8) 


# Creating the input function for the evaluation of the model with Tensorflow
def build_input_fn_val():
    
    data_val = pd.read_csv(data_folder+'X_val.csv')
    data_val_label = pd.read_csv(data_folder+'Y_val.csv')

    [updating_series(data_val,feature,data_val[feature].astype(str)) for feature in categorical_features]

    return tf.estimator.inputs.pandas_input_fn(
        x= data_val,
        y= data_val_label.income,
        shuffle=False)


# Creating the input function for the predictions of the model with Tensorflow
def build_input_fn_test():
    
    data_test = pd.read_csv(data_folder+'X_test.csv')
    data_test_label = pd.read_csv(data_folder+'Y_test.csv')

    [updating_series(data_test,feature,data_test[feature].astype(str)) for feature in categorical_features]

    return tf.estimator.inputs.pandas_input_fn(
        x= data_test,
        shuffle=False)

#Creating input receiver function for model export
def create_input_receiver_serving():
    def input_receiver():

        inputs = {}

        for column_name in numerical_features+categorical_features:
            if column_name in categorical_features:
                inputs[column_name] = tf.placeholder(shape=[None], dtype=tf.string)
            else:
                inputs[column_name] = tf.placeholder(shape=[None], dtype=tf.float32)

        features_dict = {key: tf.expand_dims(tensor, -1)for key, tensor in inputs.items()}

        from pprint import pprint
        pprint("****************************************")
        pprint("serving_feature_columns")
        pprint(features_dict)
        pprint("****************************************")

        return tf.estimator.export.ServingInputReceiver(
            features=features_dict,
            receiver_tensors= inputs
        )
        
    return input_receiver

In [14]:
# Creating Optimizer
def create_optimizer():
    return lambda: tf.train.AdamOptimizer(tf.train.exponential_decay(learning_rate=0.0001,global_step=tf.train.get_global_step(),decay_steps=8888,decay_rate=0.98))

In [15]:
# Creating classifier model
def create_regressor(encoders):

    estimator = tf.estimator.DNNLinearCombinedClassifier(
        model_dir='/classifier_model/',
        n_classes=2,
        dnn_feature_columns=create_feature_columns(label_encoders).values(),
        dnn_hidden_units=[512,512],
        dnn_optimizer=create_optimizer(),
        dnn_activation_fn=tf.nn.relu,
    )
    
    return estimator

In [16]:
estimator = create_regressor(label_encoders)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/classifier_model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002BAFD97D828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [17]:
estimator.train(input_fn=build_input_fn_train())

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /classifier_mo

INFO:tensorflow:global_step/sec: 129.518
INFO:tensorflow:loss = 40.471035, step = 3600 (0.788 sec)
INFO:tensorflow:global_step/sec: 134.29
INFO:tensorflow:loss = 122.66836, step = 3700 (0.729 sec)
INFO:tensorflow:global_step/sec: 133.841
INFO:tensorflow:loss = 56.460136, step = 3800 (0.747 sec)
INFO:tensorflow:global_step/sec: 126.827
INFO:tensorflow:loss = 81.80437, step = 3900 (0.788 sec)
INFO:tensorflow:global_step/sec: 131.861
INFO:tensorflow:loss = 68.657166, step = 4000 (0.758 sec)
INFO:tensorflow:global_step/sec: 130.697
INFO:tensorflow:loss = 56.672775, step = 4100 (0.767 sec)
INFO:tensorflow:global_step/sec: 127.251
INFO:tensorflow:loss = 133.19765, step = 4200 (0.786 sec)
INFO:tensorflow:global_step/sec: 126.846
INFO:tensorflow:loss = 112.670265, step = 4300 (0.786 sec)
INFO:tensorflow:global_step/sec: 130.158
INFO:tensorflow:loss = 131.258, step = 4400 (0.768 sec)
INFO:tensorflow:global_step/sec: 128.122
INFO:tensorflow:loss = 282.70734, step = 4500 (0.781 sec)
INFO:tensorfl

INFO:tensorflow:loss = 83.20961, step = 11900 (0.752 sec)
INFO:tensorflow:global_step/sec: 124.469
INFO:tensorflow:loss = 49.61675, step = 12000 (0.805 sec)
INFO:tensorflow:global_step/sec: 129.534
INFO:tensorflow:loss = 91.11148, step = 12100 (0.772 sec)
INFO:tensorflow:global_step/sec: 128.747
INFO:tensorflow:loss = 56.42181, step = 12200 (0.775 sec)
INFO:tensorflow:global_step/sec: 131.802
INFO:tensorflow:loss = 57.044144, step = 12300 (0.759 sec)
INFO:tensorflow:global_step/sec: 130.553
INFO:tensorflow:loss = 68.85719, step = 12400 (0.766 sec)
INFO:tensorflow:global_step/sec: 137.491
INFO:tensorflow:loss = 47.98735, step = 12500 (0.743 sec)
INFO:tensorflow:global_step/sec: 128.553
INFO:tensorflow:loss = 59.005074, step = 12600 (0.762 sec)
INFO:tensorflow:global_step/sec: 136.543
INFO:tensorflow:loss = 91.89854, step = 12700 (0.748 sec)
INFO:tensorflow:global_step/sec: 124.137
INFO:tensorflow:loss = 66.460754, step = 12800 (0.790 sec)
INFO:tensorflow:global_step/sec: 132.938
INFO:te

INFO:tensorflow:global_step/sec: 133.613
INFO:tensorflow:loss = 57.69274, step = 20200 (0.748 sec)
INFO:tensorflow:global_step/sec: 131.459
INFO:tensorflow:loss = 81.205475, step = 20300 (0.762 sec)
INFO:tensorflow:global_step/sec: 132.656
INFO:tensorflow:loss = 94.825676, step = 20400 (0.753 sec)
INFO:tensorflow:global_step/sec: 135.722
INFO:tensorflow:loss = 55.82306, step = 20500 (0.737 sec)
INFO:tensorflow:global_step/sec: 132.911
INFO:tensorflow:loss = 69.7955, step = 20600 (0.752 sec)
INFO:tensorflow:global_step/sec: 131.708
INFO:tensorflow:loss = 49.738167, step = 20700 (0.759 sec)
INFO:tensorflow:global_step/sec: 136.177
INFO:tensorflow:loss = 51.494736, step = 20800 (0.734 sec)
INFO:tensorflow:global_step/sec: 131.546
INFO:tensorflow:loss = 52.761414, step = 20900 (0.760 sec)
INFO:tensorflow:global_step/sec: 134.067
INFO:tensorflow:loss = 60.729553, step = 21000 (0.746 sec)
INFO:tensorflow:global_step/sec: 134.663
INFO:tensorflow:loss = 47.537407, step = 21100 (0.743 sec)
INFO

INFO:tensorflow:loss = 41.68866, step = 28400 (0.737 sec)
INFO:tensorflow:global_step/sec: 134.023
INFO:tensorflow:loss = 53.41827, step = 28500 (0.731 sec)
INFO:tensorflow:global_step/sec: 133.532
INFO:tensorflow:loss = 47.19448, step = 28600 (0.751 sec)
INFO:tensorflow:global_step/sec: 135.332
INFO:tensorflow:loss = 48.22196, step = 28700 (0.737 sec)
INFO:tensorflow:global_step/sec: 133.889
INFO:tensorflow:loss = 59.69918, step = 28800 (0.747 sec)
INFO:tensorflow:global_step/sec: 132.084
INFO:tensorflow:loss = 45.14501, step = 28900 (0.757 sec)
INFO:tensorflow:global_step/sec: 131.848
INFO:tensorflow:loss = 49.848717, step = 29000 (0.758 sec)
INFO:tensorflow:global_step/sec: 136.171
INFO:tensorflow:loss = 69.30787, step = 29100 (0.734 sec)
INFO:tensorflow:global_step/sec: 132.304
INFO:tensorflow:loss = 59.095467, step = 29200 (0.756 sec)
INFO:tensorflow:global_step/sec: 130.519
INFO:tensorflow:loss = 55.938263, step = 29300 (0.766 sec)
INFO:tensorflow:global_step/sec: 135.122
INFO:te

INFO:tensorflow:global_step/sec: 125.553
INFO:tensorflow:loss = 40.957813, step = 36700 (0.796 sec)
INFO:tensorflow:global_step/sec: 125.628
INFO:tensorflow:loss = 75.23714, step = 36800 (0.796 sec)
INFO:tensorflow:global_step/sec: 125.592
INFO:tensorflow:loss = 55.118748, step = 36900 (0.797 sec)
INFO:tensorflow:global_step/sec: 125.471
INFO:tensorflow:loss = 80.390976, step = 37000 (0.796 sec)
INFO:tensorflow:global_step/sec: 128.176
INFO:tensorflow:loss = 57.51267, step = 37100 (0.780 sec)
INFO:tensorflow:global_step/sec: 127.115
INFO:tensorflow:loss = 53.427246, step = 37200 (0.787 sec)
INFO:tensorflow:global_step/sec: 128.7
INFO:tensorflow:loss = 59.78607, step = 37300 (0.777 sec)
INFO:tensorflow:global_step/sec: 130.18
INFO:tensorflow:loss = 43.6457, step = 37400 (0.768 sec)
INFO:tensorflow:global_step/sec: 131.03
INFO:tensorflow:loss = 50.769196, step = 37500 (0.763 sec)
INFO:tensorflow:global_step/sec: 129.061
INFO:tensorflow:loss = 44.286438, step = 37600 (0.775 sec)
INFO:tens

INFO:tensorflow:global_step/sec: 133.022
INFO:tensorflow:loss = 45.260376, step = 45000 (0.752 sec)
INFO:tensorflow:global_step/sec: 131.548
INFO:tensorflow:loss = 45.623604, step = 45100 (0.760 sec)
INFO:tensorflow:global_step/sec: 130.358
INFO:tensorflow:loss = 49.10374, step = 45200 (0.767 sec)
INFO:tensorflow:global_step/sec: 133.334
INFO:tensorflow:loss = 43.430084, step = 45300 (0.750 sec)
INFO:tensorflow:global_step/sec: 134.154
INFO:tensorflow:loss = 52.953938, step = 45400 (0.745 sec)
INFO:tensorflow:global_step/sec: 136.114
INFO:tensorflow:loss = 44.712563, step = 45500 (0.735 sec)
INFO:tensorflow:global_step/sec: 133.852
INFO:tensorflow:loss = 45.975536, step = 45600 (0.747 sec)
INFO:tensorflow:global_step/sec: 136.17
INFO:tensorflow:loss = 63.822598, step = 45700 (0.750 sec)
INFO:tensorflow:global_step/sec: 132.578
INFO:tensorflow:loss = 53.475937, step = 45800 (0.739 sec)
INFO:tensorflow:global_step/sec: 134.641
INFO:tensorflow:loss = 61.8088, step = 45900 (0.743 sec)
INFO

INFO:tensorflow:loss = 45.80928, step = 53200 (0.769 sec)
INFO:tensorflow:global_step/sec: 129.046
INFO:tensorflow:loss = 59.83551, step = 53300 (0.775 sec)
INFO:tensorflow:global_step/sec: 127.38
INFO:tensorflow:loss = 50.801056, step = 53400 (0.786 sec)
INFO:tensorflow:global_step/sec: 127.564
INFO:tensorflow:loss = 48.62789, step = 53500 (0.784 sec)
INFO:tensorflow:global_step/sec: 130.238
INFO:tensorflow:loss = 44.65753, step = 53600 (0.768 sec)
INFO:tensorflow:global_step/sec: 131.056
INFO:tensorflow:loss = 32.854805, step = 53700 (0.763 sec)
INFO:tensorflow:global_step/sec: 131.597
INFO:tensorflow:loss = 52.02242, step = 53800 (0.758 sec)
INFO:tensorflow:global_step/sec: 127.832
INFO:tensorflow:loss = 45.18107, step = 53900 (0.783 sec)
INFO:tensorflow:global_step/sec: 133.038
INFO:tensorflow:loss = 57.773365, step = 54000 (0.753 sec)
INFO:tensorflow:global_step/sec: 134.026
INFO:tensorflow:loss = 38.867477, step = 54100 (0.744 sec)
INFO:tensorflow:global_step/sec: 130.225
INFO:te

INFO:tensorflow:global_step/sec: 128.534
INFO:tensorflow:loss = 61.259422, step = 61500 (0.778 sec)
INFO:tensorflow:global_step/sec: 129.309
INFO:tensorflow:loss = 70.32269, step = 61600 (0.773 sec)
INFO:tensorflow:global_step/sec: 129.701
INFO:tensorflow:loss = 60.73856, step = 61700 (0.771 sec)
INFO:tensorflow:global_step/sec: 130.969
INFO:tensorflow:loss = 61.54658, step = 61800 (0.764 sec)
INFO:tensorflow:global_step/sec: 128.369
INFO:tensorflow:loss = 53.217255, step = 61900 (0.779 sec)
INFO:tensorflow:global_step/sec: 133.159
INFO:tensorflow:loss = 80.91215, step = 62000 (0.749 sec)
INFO:tensorflow:global_step/sec: 129.718
INFO:tensorflow:loss = 48.27944, step = 62100 (0.774 sec)
INFO:tensorflow:global_step/sec: 129.334
INFO:tensorflow:loss = 46.20661, step = 62200 (0.772 sec)
INFO:tensorflow:global_step/sec: 131.819
INFO:tensorflow:loss = 59.738396, step = 62300 (0.759 sec)
INFO:tensorflow:global_step/sec: 136.145
INFO:tensorflow:loss = 52.90062, step = 62400 (0.733 sec)
INFO:te

INFO:tensorflow:loss = 48.731926, step = 69700 (0.751 sec)
INFO:tensorflow:global_step/sec: 136.455
INFO:tensorflow:loss = 45.428066, step = 69800 (0.748 sec)
INFO:tensorflow:global_step/sec: 128.032
INFO:tensorflow:loss = 48.84662, step = 69900 (0.767 sec)
INFO:tensorflow:global_step/sec: 129.666
INFO:tensorflow:loss = 56.54732, step = 70000 (0.771 sec)
INFO:tensorflow:global_step/sec: 128.595
INFO:tensorflow:loss = 47.36698, step = 70100 (0.778 sec)
INFO:tensorflow:global_step/sec: 133.476
INFO:tensorflow:loss = 46.57224, step = 70200 (0.747 sec)
INFO:tensorflow:global_step/sec: 130.612
INFO:tensorflow:loss = 48.841187, step = 70300 (0.766 sec)
INFO:tensorflow:global_step/sec: 135.138
INFO:tensorflow:loss = 45.255222, step = 70400 (0.742 sec)
INFO:tensorflow:global_step/sec: 131.125
INFO:tensorflow:loss = 35.3712, step = 70500 (0.762 sec)
INFO:tensorflow:global_step/sec: 131.579
INFO:tensorflow:loss = 46.146503, step = 70600 (0.761 sec)
INFO:tensorflow:global_step/sec: 136.723
INFO:t

INFO:tensorflow:Saving checkpoints for 77966 into /classifier_model/model.ckpt.
INFO:tensorflow:global_step/sec: 110.62
INFO:tensorflow:loss = 34.748604, step = 78000 (0.906 sec)
INFO:tensorflow:global_step/sec: 128.533
INFO:tensorflow:loss = 50.715088, step = 78100 (0.774 sec)
INFO:tensorflow:global_step/sec: 123.862
INFO:tensorflow:loss = 50.153725, step = 78200 (0.809 sec)
INFO:tensorflow:global_step/sec: 130.719
INFO:tensorflow:loss = 48.385155, step = 78300 (0.765 sec)
INFO:tensorflow:global_step/sec: 129.702
INFO:tensorflow:loss = 49.95652, step = 78400 (0.770 sec)
INFO:tensorflow:global_step/sec: 131.752
INFO:tensorflow:loss = 47.50709, step = 78500 (0.759 sec)
INFO:tensorflow:global_step/sec: 129.87
INFO:tensorflow:loss = 47.92882, step = 78600 (0.771 sec)
INFO:tensorflow:global_step/sec: 130.3
INFO:tensorflow:loss = 51.316338, step = 78700 (0.781 sec)
INFO:tensorflow:global_step/sec: 126.238
INFO:tensorflow:loss = 43.402718, step = 78800 (0.779 sec)
INFO:tensorflow:global_step

<tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier at 0x2bafd97dfd0>

In [18]:
estimator.evaluate(input_fn=build_input_fn_val())

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-01T15:33:03Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /classifier_model/model.ckpt-80385
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-07-01-15:33:04
INFO:tensorflow:Saving dict for global step 80385: accuracy = 0.8302449, accuracy_baseline = 0.785668, auc = 0.8324698, auc_precision_recall = 0.6172307, average_loss = 0.3882287, global_step = 80385, label/mean = 0.214332, loss = 49.114113, precision = 0.7171717, prediction/mean = 0.21057028, recall = 0.34340993
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 80385: /classifier_model/model.ckpt-80385


{'accuracy': 0.8302449,
 'accuracy_baseline': 0.785668,
 'auc': 0.8324698,
 'auc_precision_recall': 0.6172307,
 'average_loss': 0.3882287,
 'label/mean': 0.214332,
 'loss': 49.114113,
 'precision': 0.7171717,
 'prediction/mean': 0.21057028,
 'recall': 0.34340993,
 'global_step': 80385}

In [19]:
# Verifying accuracy of predictions on the test dataset
predictions = estimator.predict(input_fn=build_input_fn_test())
preds = [item['classes'] for item in predictions]

results = pd.read_csv(data_folder+'Y_test.csv')
results['predictions'] = [int(item[0]) for item in preds]
results['valid_predictions'] = results.apply(lambda x: 1 if x.income == x.predictions else 0,axis=1)
results_count = results.valid_predictions.value_counts()
percentage = (results_count[1] * 100)/(results_count[0] + results_count[1] )

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /classifier_model/model.ckpt-80385
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [21]:
percentage

83.25272067714631

In [23]:
estimator.export_savedmodel('saved_model', serving_input_receiver_fn=create_input_receiver_serving())

Instructions for updating:
This function has been renamed, use `export_saved_model` instead.
'****************************************'
'serving_feature_columns'
{'age': <tf.Tensor 'ExpandDims:0' shape=(?, 1) dtype=float32>,
 'career': <tf.Tensor 'ExpandDims_8:0' shape=(?, 1) dtype=string>,
 'country': <tf.Tensor 'ExpandDims_12:0' shape=(?, 1) dtype=string>,
 'education': <tf.Tensor 'ExpandDims_6:0' shape=(?, 1) dtype=string>,
 'educationNum': <tf.Tensor 'ExpandDims_1:0' shape=(?, 1) dtype=float32>,
 'gainedCapital': <tf.Tensor 'ExpandDims_2:0' shape=(?, 1) dtype=float32>,
 'hoursPerWeek': <tf.Tensor 'ExpandDims_4:0' shape=(?, 1) dtype=float32>,
 'lostCapital': <tf.Tensor 'ExpandDims_3:0' shape=(?, 1) dtype=float32>,
 'race': <tf.Tensor 'ExpandDims_10:0' shape=(?, 1) dtype=string>,
 'relationship': <tf.Tensor 'ExpandDims_9:0' shape=(?, 1) dtype=string>,
 'sex': <tf.Tensor 'ExpandDims_11:0' shape=(?, 1) dtype=string>,
 'statusMarriage': <tf.Tensor 'ExpandDims_7:0' shape=(?, 1) dtype=str

b'saved_model\\1593607395'