In [30]:
import tensorflow as tf
import pandas as pd

tf.logging.set_verbosity(tf.logging.INFO)

In [31]:
# reading the engineered datasets
df_train = pd.read_csv('../input/train.csv') # training set after train/test split
df_valid = pd.read_csv('../input/valid.csv') # testing set after train/test split
df_infer = pd.read_csv('../input/infer.csv') # inference set provided in Kaggle

In [32]:
df_train['AMT_GOODS_PRICE'].value_counts()

6.0    35677
1.0    33822
4.0    29235
9.0    26049
8.0    25451
0.0    23411
2.0    22399
5.0    19620
3.0    17604
7.0    12736
Name: AMT_GOODS_PRICE, dtype: int64

In [33]:
# list with the numerical features
NUMERIC_FEATURES = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

# dictionary with the categorical features
CATEGORICAL_FEATURES = [
    tf.feature_column.categorical_column_with_vocabulary_list(key='CODE_GENDER', 
                                                              vocabulary_list=('M','F')),
    tf.feature_column.categorical_column_with_vocabulary_list(key='NAME_INCOME_TYPE', 
                                                              vocabulary_list=(
                                                                  'Working',
                                                                  'Commercial associate',
                                                                  'Pensioner',
                                                                  'State servant',
                                                                  'OTHER'
                                                              )),
    tf.feature_column.categorical_column_with_vocabulary_list(key='NAME_FAMILY_STATUS', 
                                                              vocabulary_list=(
                                                                  'Married',
                                                                  'Single / not married',
                                                                  'Civil marriage',
                                                                  'Separated',
                                                                  'Widow'
                                                              )),
    tf.feature_column.categorical_column_with_vocabulary_list(key='NAME_CONTRACT_TYPE', 
                                                              vocabulary_list=(
                                                                  'Cash loans',
                                                                  'Revolving loans'
                                                              )),
    tf.feature_column.categorical_column_with_vocabulary_list(key='FLAG_OWN_CAR', 
                                                              vocabulary_list=(
                                                                  'Y',
                                                                  'N'
                                                              )),
    tf.feature_column.categorical_column_with_vocabulary_list(key='FLAG_OWN_REALTY', 
                                                              vocabulary_list=(
                                                                  'Y',
                                                                  'N'
                                                              )),
    tf.feature_column.categorical_column_with_identity(key='CNT_CHILDREN', num_buckets=5),
    tf.feature_column.categorical_column_with_identity(key='CNT_FAM_MEMBERS', num_buckets=7),
    #tf.feature_column.categorical_column_with_identity(key='AMT_REQ_CREDIT_BUREAU_HOUR', num_buckets=3),
    #tf.feature_column.categorical_column_with_identity(key='AMT_REQ_CREDIT_BUREAU_DAY', num_buckets=3),
    tf.feature_column.categorical_column_with_identity(key='AMT_REQ_CREDIT_BUREAU_WEEK', num_buckets=3),
    tf.feature_column.categorical_column_with_identity(key='AMT_REQ_CREDIT_BUREAU_MON', num_buckets=11),
    tf.feature_column.categorical_column_with_identity(key='AMT_REQ_CREDIT_BUREAU_QRT', num_buckets=7),
    tf.feature_column.categorical_column_with_identity(key='AMT_REQ_CREDIT_BUREAU_YEAR', num_buckets=7),
    tf.feature_column.categorical_column_with_identity(key='OBS_30_CNT_SOCIAL_CIRCLE', num_buckets=18),
    tf.feature_column.categorical_column_with_identity(key='OBS_60_CNT_SOCIAL_CIRCLE', num_buckets=18),
    tf.feature_column.categorical_column_with_identity(key='DEF_30_CNT_SOCIAL_CIRCLE', num_buckets=6),
    tf.feature_column.categorical_column_with_identity(key='DEF_60_CNT_SOCIAL_CIRCLE', num_buckets=5),
    tf.feature_column.categorical_column_with_identity(key='AMT_INCOME_TOTAL', num_buckets=10),
    tf.feature_column.categorical_column_with_identity(key='AMT_CREDIT', num_buckets=10),
    #tf.feature_column.categorical_column_with_identity(key='AMT_ANNUITY', num_buckets=10),
    #tf.feature_column.categorical_column_with_identity(key='AMT_GOODS_PRICE', num_buckets=10),
    tf.feature_column.categorical_column_with_identity(key='DAYS_BIRTH', num_buckets=10),
    tf.feature_column.categorical_column_with_identity(key='DAYS_ID_PUBLISH', num_buckets=10),
    tf.feature_column.categorical_column_with_identity(key='DAYS_LAST_PHONE_CHANGE', num_buckets=8),
    tf.feature_column.categorical_column_with_identity(key='DAYS_REGISTRATION', num_buckets=10),
    tf.feature_column.categorical_column_with_identity(key='OWN_CAR_AGE', num_buckets=10),
    tf.feature_column.categorical_column_with_identity(key='REGION_POPULATION_RELATIVE', num_buckets=10)
]

# initializing the LABEL and INPUT_COLUMNS variables
LABEL = 'TARGET'
INPUT_COLUMNS = []

# adding the numeric features to the tensorflow graph
for feature in NUMERIC_FEATURES:
    INPUT_COLUMNS.append(tf.feature_column.numeric_column(feature))
    
# adding the categorical features to the tensorflow graph
for feature in CATEGORICAL_FEATURES:
    INPUT_COLUMNS.append(tf.feature_column.indicator_column(feature))

In [34]:
# defining an train input function which feeds the training pandas dataframe
def input_fn_train(df, num_epochs):
    '''
    inputs:
    df - training set after train/test split
    
    output:
    minibatches of x,y
    '''
    return tf.estimator.inputs.pandas_input_fn(
    x = df,
    y = df[LABEL],
    batch_size = 400,
    num_epochs = num_epochs,
    shuffle = True,
    queue_capacity = 1000,
    num_threads = 1
  )

In [35]:
# defining an prediction input function which feeds the inference pandas dataframe
def input_fn_eval(df, num_epochs):
    '''
    inputs:
    df - testing set after train/test split
    
    output:
    minibatches of x,y
    '''
    return tf.estimator.inputs.pandas_input_fn(
    x = df,
    y = df[LABEL],
    batch_size = 128,
    num_epochs = num_epochs,
    shuffle = True,
    queue_capacity = 1000,
    num_threads = 1
  )

In [36]:
# defining an prediction input function which feeds the inference pandas dataframe
def input_fn_infer(df, num_epochs):
    '''
    inputs:
    df - inference dataset provided in Kaggle
    
    output:
    minibatches of x,None
    '''
    return tf.estimator.inputs.pandas_input_fn(
    x = df,
    y = None,
    batch_size = 128,
    num_epochs = num_epochs,
    shuffle = True,
    queue_capacity = 1000,
    num_threads = 1
  )

In [37]:
OUTDIR = 'C:\\Log'

file_writer = tf.summary.FileWriter(OUTDIR)

estimator=tf.estimator.DNNClassifier(
                        activation_fn=tf.nn.relu,
                        hidden_units=[6,6,6,6], 
                        feature_columns=INPUT_COLUMNS, 
                        model_dir=OUTDIR,
                        n_classes=2,
                        optimizer=tf.train.AdamOptimizer(learning_rate=0.00001),
                        dropout=0.2,
                        loss_reduction='weighted_sum')

train_spec=tf.estimator.TrainSpec(
                       input_fn = input_fn_train(df_train, 5),
                       max_steps = 1000000)

eval_spec=tf.estimator.EvalSpec(
                       input_fn = input_fn_eval(df_valid, 5),
                       steps = None,
                       start_delay_secs = 1, # start evaluating after N seconds
                       throttle_secs = 10)  # evaluate every N seconds

tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Log', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001FD417AD2B0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 10 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.


ValueError: Invalid input, not integer. key: AMT_REQ_CREDIT_BUREAU_HOUR dtype: <dtype: 'float64'>