<a href="https://colab.research.google.com/github/trinade96/Fortiate_Project/blob/master/Numeric_vs_Image_Encoding(What_If_Tool).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### In this notebook we check whether an image encoding works better than a numeric encoding for our data. We will learn from the data with neural networks. We will try some classic pretrained neural networks, some of which are better suited to images, but to be fair we will apply the same architectures to the numeric encoding too, to see if we can gain an advantage. We will also try to record computation times and step counts where possible.

##### **Packages Used**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, mean_absolute_error
from scipy.stats import chi2_contingency as chi_test
from sklearn.datasets import load_boston
from pandas.api.types import is_float_dtype
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Conv2D, Flatten, Dropout, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD,Adam
from keras.utils import np_utils
from tensorflow.keras import optimizers
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input as preprocess_inputV3

  import pandas.util.testing as tm
Using TensorFlow backend.


In [None]:
# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

2.2.0


##### **Reading the data**

In [None]:
# Mount Google Drive inside the Colab VM at /gdrive (asks for an authorization code).
from google.colab import drive
#drive.flush_and_unmount()
drive.mount('/gdrive')
# IPython magic: make the mount point the current working directory.
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [None]:
# Load the workbook from the mounted Drive into `data` and preview the first rows.
# NOTE(review): the path is hard-coded to the author's Drive layout.
data = pd.read_excel('/gdrive/My Drive/Random-Forest-Rules-Final-Rearranged-mindmap.xlsx')
data.head()

Unnamed: 0,TRANSACTION_CURRENCY,PD4,PD11,PROC_CODE_12,PD8,MCC,POS_ENTRY_MODE,SERVICE_CODE,BILLING_AMOUNT,TRANSACTION_LOCAL_DATE,TRANSMISSION_DATE_TIME,RESPONSE_DATE_TIME,PAN,VALID_FROM,EXPIRY_DATE,PROC_CODE_34,PROC_CODE_56,PD1,PD2,PD3,PD5,PD6,PD7,PD9,PD10,PD12,TRANSACTION_AMOUNT,BILLING_CURRENCY,CONVERSION_RATE_DATE,SETTLEMENT_AMOUNT,SETTLEMENT_CURRENCY,SETTLEMENT_DATE,ACQUIRING_COUNTRY_CODE,CARD_ACCEPTOR_TERM_ID,CARD_ACCEPTOR_ID,CARD_ACC_NAME_ADDRESS,AUTHORIZATION_CODE,DECISION,TARGET,C_100,C_40,C_20,ECOM_INFO,E_COM_INDICATOR
0,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,147.84,21-MAR-18 11.33.24,21-MAR-18 11.33.24,21-MAR-18 11.33.24,A1586941252148110,21-AUG-17 17.10.11,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,147.84,CURR356,0,145.22,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,800204,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05
1,CURR356,ATC9,TOC1,TC00,TVI0,MCC4814,NNN,UNK,399.0,21-MAR-18 11.34.15,21-MAR-18 11.34.15,21-MAR-18 11.34.15,A1586941241334010,06-APR-16 20.19.15,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,399.0,CURR356,0,391.94,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9822047692,PAYTM RETAIL -PG-ONLINE \\NOIDA\201301\,300255,RESULT000,0,AMI,ACS,T0797,AXASK05AEVV,ECI05
2,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,247.2,21-MAR-18 11.36.47,21-MAR-18 11.36.47,21-MAR-18 11.36.48,A1586941049519010,03-APR-18 14.19.03,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,247.2,CURR356,0,242.82,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700408,RESULT000,0,AMI,ACS,T0537,AXASK05AEVV,ECI05
3,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,359.0,21-MAR-18 11.39.44,21-MAR-18 11.39.44,21-MAR-18 11.39.44,A1586941050671000,20-APR-18 19.43.48,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,359.0,CURR356,0,352.65,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700585,RESULT000,0,AMI,ACS,T0796,AXASK05AEVV,ECI05
4,CURR356,ATC9,TOC1,TC00,TVI0,MCC5999,NNN,UNK,188.0,21-MAR-18 11.43.08,21-MAR-18 11.43.08,21-MAR-18 11.43.08,A1586941257961010,30-APR-16 20.49.02,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,188.0,CURR356,0,183.81,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9820524155,PAYTM.COM WALLET PG \\NOIDA\201310\\,400788,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05


In [None]:
# (rows, columns) of the freshly loaded frame.
data.shape

(105271, 44)

##### We will treat the data a little differently here: instead of removing the NaNs we keep them as a separate category that can be passed to the classifier, so we don't lose too much data.

In [None]:
# Replace missing values with a per-column sentinel ("<column label>NA") so that
# "missing" survives as its own category instead of the row being dropped.
for col in data.keys():
  # The f-string formats the label first, so non-string column labels
  # (e.g. integers) no longer raise TypeError on concatenation.
  data[col] = data[col].fillna(f'{col}NA')
  # Remaining null count for this column; expected to be 0 after the fill.
  print(data[col].isnull().sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [None]:
# dropna returns a new frame rather than modifying `data`; assign the result so
# the statement actually takes effect. Every column was sentinel-filled in the
# previous cell, so no rows are expected to be removed here.
data = data.dropna(axis=0)
data.shape

(105271, 44)

##### Now we will move on to removing the index-like columns and then encoding the dataset to be passed on to the classifier.

##### **Function to remove columns that consist entirely of distinct values, e.g. the index**

In [None]:
def remove_index_like(df):
  """Return a copy of ``df`` without its index-like columns.

  A column counts as index-like when it holds as many distinct values as it
  has rows, i.e. no value is repeated.

  Args:
    df: DataFrame to filter; it is not modified.

  Returns:
    A new DataFrame with the index-like columns removed.
  """
  index_like = [col for col in df.columns if len(df[col]) == len(set(df[col]))]
  # Keyword form on purpose: passing ``axis`` positionally to ``drop`` was
  # deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
  return df.drop(columns=index_like)

In [None]:
# Drop every column whose values are all distinct (index-like columns).
data_dt = remove_index_like(data)

In [None]:
# Compare with the shape of `data`: an unchanged column count means nothing was dropped.
data_dt.shape

(105271, 44)

In [None]:
# Preview the frame after the index-like filter.
data_dt.head()

Unnamed: 0,TRANSACTION_CURRENCY,PD4,PD11,PROC_CODE_12,PD8,MCC,POS_ENTRY_MODE,SERVICE_CODE,BILLING_AMOUNT,TRANSACTION_LOCAL_DATE,TRANSMISSION_DATE_TIME,RESPONSE_DATE_TIME,PAN,VALID_FROM,EXPIRY_DATE,PROC_CODE_34,PROC_CODE_56,PD1,PD2,PD3,PD5,PD6,PD7,PD9,PD10,PD12,TRANSACTION_AMOUNT,BILLING_CURRENCY,CONVERSION_RATE_DATE,SETTLEMENT_AMOUNT,SETTLEMENT_CURRENCY,SETTLEMENT_DATE,ACQUIRING_COUNTRY_CODE,CARD_ACCEPTOR_TERM_ID,CARD_ACCEPTOR_ID,CARD_ACC_NAME_ADDRESS,AUTHORIZATION_CODE,DECISION,TARGET,C_100,C_40,C_20,ECOM_INFO,E_COM_INDICATOR
0,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,147.84,21-MAR-18 11.33.24,21-MAR-18 11.33.24,21-MAR-18 11.33.24,A1586941252148110,21-AUG-17 17.10.11,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,147.84,CURR356,0,145.22,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,800204,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05
1,CURR356,ATC9,TOC1,TC00,TVI0,MCC4814,NNN,UNK,399.0,21-MAR-18 11.34.15,21-MAR-18 11.34.15,21-MAR-18 11.34.15,A1586941241334010,06-APR-16 20.19.15,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,399.0,CURR356,0,391.94,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9822047692,PAYTM RETAIL -PG-ONLINE \\NOIDA\201301\,300255,RESULT000,0,AMI,ACS,T0797,AXASK05AEVV,ECI05
2,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,247.2,21-MAR-18 11.36.47,21-MAR-18 11.36.47,21-MAR-18 11.36.48,A1586941049519010,03-APR-18 14.19.03,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,247.2,CURR356,0,242.82,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700408,RESULT000,0,AMI,ACS,T0537,AXASK05AEVV,ECI05
3,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,359.0,21-MAR-18 11.39.44,21-MAR-18 11.39.44,21-MAR-18 11.39.44,A1586941050671000,20-APR-18 19.43.48,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,359.0,CURR356,0,352.65,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700585,RESULT000,0,AMI,ACS,T0796,AXASK05AEVV,ECI05
4,CURR356,ATC9,TOC1,TC00,TVI0,MCC5999,NNN,UNK,188.0,21-MAR-18 11.43.08,21-MAR-18 11.43.08,21-MAR-18 11.43.08,A1586941257961010,30-APR-16 20.49.02,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,188.0,CURR356,0,183.81,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9820524155,PAYTM.COM WALLET PG \\NOIDA\201310\\,400788,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05


##### So there are no index-like columns here, and we can simply convert all of them via LabelEncoder to numeric encodings to be passed on to the decision tree; however, we will first find the float-type columns and leave them as they are.


##### **Function to remove constant columns**


In [None]:
def remove_constants(df):
  """Return a copy of ``df`` without its constant columns.

  A column counts as constant when it contains exactly one distinct value.

  Args:
    df: DataFrame to filter; it is not modified.

  Returns:
    A new DataFrame with the constant columns removed.
  """
  constant_cols = [col for col in df.columns if len(set(df[col])) == 1]
  # Keyword form on purpose: passing ``axis`` positionally to ``drop`` was
  # deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
  return df.drop(columns=constant_cols)

In [None]:
# Drop single-valued columns, then check how many columns are left.
data_dt = remove_constants(data_dt)
data_dt.shape

(105271, 44)

##### Therefore there are no constant columns in the dataset

In [None]:
# List the column labels that survived both filters.
data_dt.columns

Index(['TRANSACTION_CURRENCY', 'PD4', 'PD11', 'PROC_CODE_12', 'PD8', 'MCC',
       'POS_ENTRY_MODE', 'SERVICE_CODE', 'BILLING_AMOUNT',
       'TRANSACTION_LOCAL_DATE', 'TRANSMISSION_DATE_TIME',
       'RESPONSE_DATE_TIME', 'PAN', 'VALID_FROM', 'EXPIRY_DATE',
       'PROC_CODE_34', 'PROC_CODE_56', 'PD1', 'PD2', 'PD3', 'PD5', 'PD6',
       'PD7', 'PD9', 'PD10', 'PD12', 'TRANSACTION_AMOUNT', 'BILLING_CURRENCY',
       'CONVERSION_RATE_DATE', 'SETTLEMENT_AMOUNT', 'SETTLEMENT_CURRENCY',
       'SETTLEMENT_DATE', 'ACQUIRING_COUNTRY_CODE', 'CARD_ACCEPTOR_TERM_ID',
       'CARD_ACCEPTOR_ID', 'CARD_ACC_NAME_ADDRESS', 'AUTHORIZATION_CODE',
       'DECISION', 'TARGET', 'C_100', 'C_40', 'C_20', 'ECOM_INFO',
       'E_COM_INDICATOR'],
      dtype='object')

##### We will be using Inception V3, a pretrained network, and we will train it on our data starting from the previously learned weights; we will use the same network on the image-encoded data. Once again we will use 'E_COM_INDICATOR' as the label column.

##### **Trial on What-If Tool (with dummy dataset)**

In [None]:
#@title Install the What-If Tool widget if running in colab {display-mode: "form"}

try:
  import google.colab
  !pip install --upgrade witwidget
except:
  pass

Collecting witwidget
[?25l  Downloading https://files.pythonhosted.org/packages/a9/12/d61b3104cde5181e3340dbd7620885d4127f62e0ef4f738786e226683127/witwidget-1.7.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 2.0MB/s 
Installing collected packages: witwidget
Successfully installed witwidget-1.7.0


In [None]:


import pandas as pd
import numpy as np
import tensorflow as tf
import functools

# Creates a tf feature spec from the dataframe and columns specified.
def create_feature_spec(df, columns=None):
    """Build a tf.Example parsing spec for the given dataframe columns.

    int64 columns become scalar tf.int64 features, float64 columns become
    scalar tf.float32 features, and every other dtype becomes a scalar
    tf.string feature.

    Args:
        df: DataFrame whose column dtypes decide the feature types.
        columns: Column labels to include. Defaults to all columns of ``df``.

    Returns:
        Dict mapping each column label to a ``tf.io.FixedLenFeature``.
    """
    # Identity check: ``columns == None`` is evaluated element-wise for
    # array-like inputs (numpy arrays, pandas Index) and cannot be used in ``if``.
    if columns is None:
        columns = df.columns.values.tolist()
    int64_dtype = np.dtype(np.int64)
    float64_dtype = np.dtype(np.float64)
    feature_spec = {}
    for f in columns:
        col_dtype = df[f].dtype
        # Compare dtypes by equality instead of relying on object identity.
        if col_dtype == int64_dtype:
            feature_spec[f] = tf.io.FixedLenFeature(shape=(), dtype=tf.int64)
        elif col_dtype == float64_dtype:
            feature_spec[f] = tf.io.FixedLenFeature(shape=(), dtype=tf.float32)
        else:
            feature_spec[f] = tf.io.FixedLenFeature(shape=(), dtype=tf.string)
    return feature_spec

# Creates simple numeric and categorical feature columns from a feature spec and a
# list of columns from that spec to use.
#
# NOTE: Models might perform better with some feature engineering such as bucketed
# numeric columns and hash-bucket/embedding columns for categorical features.
def create_feature_columns(columns, feature_spec, source_df=None):
    """Create TF feature columns for the selected entries of a feature spec.

    Entries whose spec dtype is tf.int64 or tf.float32 become numeric columns.
    All other entries become indicator (one-hot) columns over a vocabulary made
    of the distinct values found in the dataframe column of the same name.

    Args:
        columns: Column labels to create feature columns for.
        feature_spec: Dict of column label -> feature spec entry with a
            ``dtype`` attribute.
        source_df: DataFrame used to derive categorical vocabularies. When
            omitted, the notebook-level ``df`` is used, so existing
            two-argument calls keep working.

    Returns:
        List of feature columns, in the order of ``columns``.
    """
    ret = []
    for col in columns:
        spec_dtype = feature_spec[col].dtype
        if spec_dtype in (tf.int64, tf.float32):
            ret.append(tf.feature_column.numeric_column(col))
        else:
            # Resolve the frame lazily: the global ``df`` is only needed when a
            # categorical column is present and no frame was passed in.
            vocab_source = df if source_df is None else source_df
            vocabulary = list(vocab_source[col].unique())
            ret.append(tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_vocabulary_list(col, vocabulary)))
    return ret

# An input function for providing input to a model from tf.Examples
def tfexamples_input_fn(examples, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=None,
                       batch_size=64):
    """Build a tf.data pipeline that feeds tf.Example protos to an estimator.

    Args:
        examples: Sequence of tf.Example protos.
        feature_spec: Parsing spec handed to ``parse_tf_example``.
        label: Name of the feature that is split off as the target.
        mode: Estimator mode; the records are shuffled only for TRAIN.
        num_epochs: Passed to ``Dataset.repeat``.
        batch_size: Number of serialized examples per batch.

    Returns:
        A dataset of ``(features, target)`` batches.
    """
    def serialized_examples():
        # Serialize lazily, one proto at a time.
        for proto in examples:
            yield proto.SerializeToString()

    records = tf.data.Dataset.from_generator(
        serialized_examples, tf.dtypes.string, tf.TensorShape([]))
    if mode == tf.estimator.ModeKeys.TRAIN:
        records = records.shuffle(buffer_size=2 * batch_size + 1)
    # Batch first, then parse each batch of serialized protos in one call.
    batches = records.batch(batch_size)
    parsed = batches.map(
        lambda serialized: parse_tf_example(serialized, label, feature_spec))
    return parsed.repeat(num_epochs)

# Parses Tf.Example protos into features for the input function.
def parse_tf_example(example_proto, label, feature_spec):
    """Parse serialized tf.Example data and split off the label feature.

    Args:
        example_proto: Serialized example(s) to parse.
        label: Key of the feature to remove from the parsed dict and return
            separately.
        feature_spec: Parsing spec passed to ``tf.io.parse_example``.

    Returns:
        Tuple ``(features, target)`` where ``features`` no longer contains
        ``label``.
    """
    features = tf.io.parse_example(serialized=example_proto, features=feature_spec)
    labels = features.pop(label)
    return features, labels

# Converts a dataframe into a list of tf.Example protos.
def df_to_examples(df, columns=None):
    """Convert each dataframe row into a ``tf.train.Example`` proto.

    int64 columns are written to ``int64_list``, float64 columns to
    ``float_list`` and all other columns to ``bytes_list`` as UTF-8 text.
    Missing values (None or NaN) in the non-numeric columns are left out of
    the example.

    Args:
        df: DataFrame to convert.
        columns: Column labels to serialize. Defaults to all columns of ``df``.

    Returns:
        List with one ``tf.train.Example`` per row, in row order.
    """
    # Identity check: ``columns == None`` is evaluated element-wise for
    # array-like inputs and cannot be used in ``if``.
    if columns is None:
        columns = df.columns.values.tolist()

    # A column's dtype is the same for every row, so classify each column once
    # instead of looking it up again for every row.
    int64_dtype = np.dtype(np.int64)
    float64_dtype = np.dtype(np.float64)
    column_kind = {}
    for col in columns:
        col_dtype = df[col].dtype
        if col_dtype == int64_dtype:
            column_kind[col] = 'int'
        elif col_dtype == float64_dtype:
            column_kind[col] = 'float'
        else:
            column_kind[col] = 'bytes'

    examples = []
    for _, row in df.iterrows():
        example = tf.train.Example()
        features = example.features.feature
        for col in columns:
            value = row[col]
            kind = column_kind[col]
            if kind == 'int':
                features[col].int64_list.value.append(int(value))
            elif kind == 'float':
                features[col].float_list.value.append(value)
            elif value is not None and value == value:  # NaN != NaN, so NaN is skipped
                # Object columns may mix strings with other scalars (e.g. ints
                # next to sentinel strings); coerce to text before encoding.
                text = value if isinstance(value, str) else str(value)
                features[col].bytes_list.value.append(text.encode('utf-8'))
        examples.append(example)
    return examples

# Converts a dataframe column into a column of 0's and 1's based on the provided test.
# Used to force label columns to be numeric for binary classification using a TF estimator.
def make_label_column_numeric(df, label_column, test):
  """Overwrite ``df[label_column]`` in place with 1 where ``test`` holds, else 0.

  Args:
    df: DataFrame to modify in place.
    label_column: Name of the column to convert.
    test: Callable applied to the whole column; it must return a boolean mask.
  """
  is_positive = test(df[label_column])
  df[label_column] = np.where(is_positive, 1, 0)

In [None]:


import pandas as pd

# Set the path to the CSV containing the dataset to train on.
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Set the column names for the columns in the CSV. If the CSV's first line is a header line containing
# the column names, then set this to None.
csv_columns = [
  "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital-Status",
  "Occupation", "Relationship", "Race", "Sex", "Capital-Gain", "Capital-Loss",
  "Hours-per-week", "Country", "Over-50K"]

# Read the dataset from the provided CSV and print out information about it.
# skipinitialspace=True drops the space that follows each delimiter.
# NOTE: `df` is also the module-level frame that create_feature_columns falls
# back on for categorical vocabularies, so this cell must run before that
# function is called.
df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)

# Display the frame as the cell output.
df

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Over-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [None]:
import numpy as np

# Set the column in the dataset you wish for the model to predict
label_column = 'Over-50K'

# Make the label column numeric (0 and 1), for use in our model.
# In this case, examples with a target value of '>50K' are considered to be in
# the '1' (positive) class and all other examples are considered to be in the
# '0' (negative) class.
# NOTE: this overwrites df[label_column] in place, so the cell is not idempotent:
# re-running it compares the already-numeric labels with '>50K' and sets all of them to 0.
make_label_column_numeric(df, label_column, lambda val: val == '>50K') #Designed for label column to have two types of categories only

# Set list of all columns from the dataset we will use for model input.
input_features = [
  'Age', 'Workclass', 'Education', 'Marital-Status', 'Occupation',
  'Relationship', 'Race', 'Sex', 'Capital-Gain', 'Capital-Loss',
  'Hours-per-week', 'Country']

# Create a list containing all input features and the label column
features_and_labels = input_features + [label_column]

In [None]:

# Serialize every row of df (all columns, since none are specified) into tf.Example protos.
examples = df_to_examples(df)

In [None]:
# Summarize column dtypes and non-null counts; the label column is int64 after conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   Education       32561 non-null  object
 4   Education-Num   32561 non-null  int64 
 5   Marital-Status  32561 non-null  object
 6   Occupation      32561 non-null  object
 7   Relationship    32561 non-null  object
 8   Race            32561 non-null  object
 9   Sex             32561 non-null  object
 10  Capital-Gain    32561 non-null  int64 
 11  Capital-Loss    32561 non-null  int64 
 12  Hours-per-week  32561 non-null  int64 
 13  Country         32561 non-null  object
 14  Over-50K        32561 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [None]:

# Number of training steps (Colab form field).
num_steps = 2000  #@param {type: "number"}

# Create a feature spec for the classifier
feature_spec = create_feature_spec(df, features_and_labels)

# Define and train the classifier
# NOTE(review): train_inpf does not pass `mode`, so tfexamples_input_fn uses its
# default (EVAL) and the shuffle step is skipped during training - confirm this is intended.
train_inpf = functools.partial(tfexamples_input_fn, examples, feature_spec, label_column)
classifier = tf.estimator.LinearClassifier(
    feature_columns=create_feature_columns(input_features, feature_spec))
classifier.train(train_inpf, steps=num_steps)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp4fpjqsyt', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Varia

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f105b1f8550>

In [None]:
# Number of training steps for the second model (Colab form field).
num_steps_2 = 2000  #@param {type: "number"}

# Second estimator: a DNN with three hidden layers (128, 64 and 32 units),
# trained with the same input function and feature columns as `classifier`.
classifier2 = tf.estimator.DNNClassifier(
    feature_columns=create_feature_columns(input_features, feature_spec),
    hidden_units=[128, 64, 32])
classifier2.train(train_inpf, steps=num_steps_2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpelmaz9ds', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.


<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f105b1f84a8>

##### **What-if tool on dummy dataset**
Uncomment and run the section at the bottom of the cell to launch the tool, and refer to the exploration ideas below to find out more about the data.

In [None]:

# Number of test rows to load into the tool and the widget height (Colab form fields).
num_datapoints = 2000  #@param {type: "number"}
tool_height_in_px = 1000  #@param {type: "number"}

from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget

# Load up the test dataset
# NOTE(review): skiprows=1 and the trailing '.' in '>50K.' presumably match the
# layout of adult.test (an extra first line, labels ending in a period) - confirm
# against the file.
test_csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
test_df = pd.read_csv(test_csv_path, names=csv_columns, skipinitialspace=True,
  skiprows=1)
make_label_column_numeric(test_df, label_column, lambda val: val == '>50K.')
# Only the first num_datapoints rows are serialized.
test_examples = df_to_examples(test_df[0:num_datapoints])

# Setup the tool with the test examples and the trained classifier

# config_builder = WitConfigBuilder(test_examples[0:num_datapoints]).set_estimator_and_feature_spec(
#     classifier, feature_spec).set_compare_estimator_and_feature_spec(
#     classifier2, feature_spec).set_label_vocab(['Under 50K', 'Over 50K'])
# a = WitWidget(config_builder, height=tool_height_in_px)

#### Exploration ideas

- Organize datapoints by setting X-axis scatter to "inference score 1" and Y-axis scatter to "inference score 2" to see how each datapoint differs in score between the linear model (1) and DNN model (2). Points off the diagonal have differences in results between the two models.
  - Are there patterns of which datapoints don't agree between the two models?
  - If you set the ground truth feature dropdown in the "Performance + Fairness" tab to "Over-50K", then you can color or bin the datapoints by "inference correct 1" or "inference correct 2". Are there patterns of which datapoints are incorrect for model 1? For model 2?

- Explore performance of the two models through the confusion matrices in the "Performance + Fairness" tab. Which model is best? Train either model for longer and see if you can change this. Are the rates of errors (false positives and false negatives) that the two models make different?
  - Click the "optimize threshold" button to set the optimal positive classification threshold for each model based on the current cost ratio of 1. How do those thresholds and the resulting confusion matrices differ?
    - Change the cost ratio and optimize the threshold again. How does the threshold and performance change on the two models?
  - Slice the dataset by features, such as "sex" or "race". Does either model have more-equal performance between slices?
    - Use the threshold optimization buttons to set optimal thresholds based on the different fairness constraints. How does performance between slices differ between the two models. Does one require larger differences in threshold values per slice to achieve the desired constraint?

- Looking at the create_feature_columns function in the "Define helper methods" cell, categorical features use one-hot encodings in the model. Perhaps change a many-valued categorical feature, such as education to use an embedding layer. Does anything change in the model behavior (can look through partial dependence plots as one way to investigate).

##### **Next, we will try to run the same on our dataset and fit a NN to it via TensorFlow, but first we will try the models given here**

In [None]:
# Distinct integer category codes of E_COM_INDICATOR, i.e. how many classes the label column has.
set(data_dt.E_COM_INDICATOR.astype('category').cat.codes)

{0, 1, 2, 3, 4, 5, 6}

In [None]:
# Work on a copy so that data_dt itself stays unchanged by the steps below.
data_dt2 = data_dt.copy()
data_dt2

Unnamed: 0,TRANSACTION_CURRENCY,PD4,PD11,PROC_CODE_12,PD8,MCC,POS_ENTRY_MODE,SERVICE_CODE,BILLING_AMOUNT,TRANSACTION_LOCAL_DATE,TRANSMISSION_DATE_TIME,RESPONSE_DATE_TIME,PAN,VALID_FROM,EXPIRY_DATE,PROC_CODE_34,PROC_CODE_56,PD1,PD2,PD3,PD5,PD6,PD7,PD9,PD10,PD12,TRANSACTION_AMOUNT,BILLING_CURRENCY,CONVERSION_RATE_DATE,SETTLEMENT_AMOUNT,SETTLEMENT_CURRENCY,SETTLEMENT_DATE,ACQUIRING_COUNTRY_CODE,CARD_ACCEPTOR_TERM_ID,CARD_ACCEPTOR_ID,CARD_ACC_NAME_ADDRESS,AUTHORIZATION_CODE,DECISION,TARGET,C_100,C_40,C_20,ECOM_INFO,E_COM_INDICATOR
0,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,147.84,21-MAR-18 11.33.24,21-MAR-18 11.33.24,21-MAR-18 11.33.24,A1586941252148110,21-AUG-17 17.10.11,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,147.84,CURR356,000,145.22,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,800204,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05
1,CURR356,ATC9,TOC1,TC00,TVI0,MCC4814,NNN,UNK,399.00,21-MAR-18 11.34.15,21-MAR-18 11.34.15,21-MAR-18 11.34.15,A1586941241334010,06-APR-16 20.19.15,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,399.00,CURR356,000,391.94,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9822047692,PAYTM RETAIL -PG-ONLINE \\NOIDA\201301\,300255,RESULT000,0,AMI,ACS,T0797,AXASK05AEVV,ECI05
2,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,247.20,21-MAR-18 11.36.47,21-MAR-18 11.36.47,21-MAR-18 11.36.48,A1586941049519010,03-APR-18 14.19.03,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,247.20,CURR356,000,242.82,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700408,RESULT000,0,AMI,ACS,T0537,AXASK05AEVV,ECI05
3,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,359.00,21-MAR-18 11.39.44,21-MAR-18 11.39.44,21-MAR-18 11.39.44,A1586941050671000,20-APR-18 19.43.48,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,359.00,CURR356,000,352.65,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700585,RESULT000,0,AMI,ACS,T0796,AXASK05AEVV,ECI05
4,CURR356,ATC9,TOC1,TC00,TVI0,MCC5999,NNN,UNK,188.00,21-MAR-18 11.43.08,21-MAR-18 11.43.08,21-MAR-18 11.43.08,A1586941257961010,30-APR-16 20.49.02,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,188.00,CURR356,000,183.81,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9820524155,PAYTM.COM WALLET PG \\NOIDA\201310\\,400788,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105266,CURR356,ATCX,TOCX,TC26,TVIX,MCC6012,NNN,UNK,10.00,22-MAR-18 20.33.27,22-MAR-18 20.33.27,22-MAR-18 15.03.28,A5756981000264010,01-DEC-49 00.00.00,01-DEC-50 00.00.00,FR00,TO00,CRCX,CVCX,CCCX,PPIX,CPIX,TCRX,TCVX,TWCX,PEIX,10.00,CURR356,000,10.00,CURR356,SETTLEMENT_DATENA,CON356,RVMTTERM,100008,mVisa merchant mVisa paymentIN,062910,RESULT000,0,MOB,ITB,T0011,ECOM_INFONA,ECINA
105267,CURR356,ATC0,TOC0,TC00,TVI0,MCC5399,NNN,UNK,18000.00,22-MAR-18 20.37.09,22-MAR-18 20.37.09,22-MAR-18 20.37.09,A5357314011150000,27-JUN-17 19.40.21,01-JUN-22 00.00.00,FR00,TO98,CRC0,CVC0,CCC0,PPI0,CPI0,TCR1,TCV0,TWC0,PEI0,18000.00,CURR356,000,18000.00,CURR356,22-MAR-18 00.00.00,CON356,RVMTTERM,100008,FARHAT JHAHAN DURRANI INDORE IN,374229,RESULT000,0,MOB,ITB,T0011,ECOM_INFONA,ECINA
105268,CURR356,ATCX,TOCX,TC26,TVIX,MCC6012,NNN,UNK,18000.00,22-MAR-18 20.37.09,22-MAR-18 20.37.09,22-MAR-18 15.07.10,A5756981000264010,01-DEC-49 00.00.00,01-DEC-50 00.00.00,FR00,TO00,CRCX,CVCX,CCCX,PPIX,CPIX,TCRX,TCVX,TWCX,PEIX,18000.00,CURR356,000,18000.00,CURR356,SETTLEMENT_DATENA,CON356,RVMTTERM,100008,mVisa merchant mVisa paymentIN,062926,RESULT000,0,MOB,ITB,T0011,ECOM_INFONA,ECINA
105269,CURR356,ATC0,TOC0,TC00,TVI0,MCC4812,NNN,UNK,18100.00,22-MAR-18 20.39.09,22-MAR-18 20.39.09,22-MAR-18 20.39.09,A5357314011150000,27-JUN-17 19.40.21,01-JUN-22 00.00.00,FR00,TO98,CRC0,CVC0,CCC0,PPI0,CPI0,TCR1,TCV0,TWC0,PEI0,18100.00,CURR356,000,18100.00,CURR356,22-MAR-18 00.00.00,CON356,RVMTTERM,100008,RABBANI TELECOM INDORE IN,274349,RESULT000,0,MOB,ITB,T0011,ECOM_INFONA,ECINA


In [None]:
# Identify the float64 columns, report them, and remove them from data_dt2 so
# that only the non-float columns remain for the numeric encoding step.
float_cols = [col for col in data_dt2.columns if data_dt2[col].dtype == 'float64']
print(float_cols)

data_dt2 = data_dt2.drop(columns=float_cols)
data_dt2.head()
#data_dt_custom = data_dt[[x for x in data_dt.columns if x not in ['CONVERSION_RATE_DATE','CARD_ACCEPTOR_ID','AUTHORIZATION_CODE','TARGET']]]
#data_dt_custom.head()

['BILLING_AMOUNT', 'TRANSACTION_AMOUNT', 'SETTLEMENT_AMOUNT']


Unnamed: 0,TRANSACTION_CURRENCY,PD4,PD11,PROC_CODE_12,PD8,MCC,POS_ENTRY_MODE,SERVICE_CODE,TRANSACTION_LOCAL_DATE,TRANSMISSION_DATE_TIME,RESPONSE_DATE_TIME,PAN,VALID_FROM,EXPIRY_DATE,PROC_CODE_34,PROC_CODE_56,PD1,PD2,PD3,PD5,PD6,PD7,PD9,PD10,PD12,BILLING_CURRENCY,CONVERSION_RATE_DATE,SETTLEMENT_CURRENCY,SETTLEMENT_DATE,ACQUIRING_COUNTRY_CODE,CARD_ACCEPTOR_TERM_ID,CARD_ACCEPTOR_ID,CARD_ACC_NAME_ADDRESS,AUTHORIZATION_CODE,DECISION,TARGET,C_100,C_40,C_20,ECOM_INFO,E_COM_INDICATOR
0,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,21-MAR-18 11.33.24,21-MAR-18 11.33.24,21-MAR-18 11.33.24,A1586941252148110,21-AUG-17 17.10.11,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,800204,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05
1,CURR356,ATC9,TOC1,TC00,TVI0,MCC4814,NNN,UNK,21-MAR-18 11.34.15,21-MAR-18 11.34.15,21-MAR-18 11.34.15,A1586941241334010,06-APR-16 20.19.15,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9822047692,PAYTM RETAIL -PG-ONLINE \\NOIDA\201301\,300255,RESULT000,0,AMI,ACS,T0797,AXASK05AEVV,ECI05
2,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,21-MAR-18 11.36.47,21-MAR-18 11.36.47,21-MAR-18 11.36.48,A1586941049519010,03-APR-18 14.19.03,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700408,RESULT000,0,AMI,ACS,T0537,AXASK05AEVV,ECI05
3,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,21-MAR-18 11.39.44,21-MAR-18 11.39.44,21-MAR-18 11.39.44,A1586941050671000,20-APR-18 19.43.48,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700585,RESULT000,0,AMI,ACS,T0796,AXASK05AEVV,ECI05
4,CURR356,ATC9,TOC1,TC00,TVI0,MCC5999,NNN,UNK,21-MAR-18 11.43.08,21-MAR-18 11.43.08,21-MAR-18 11.43.08,A1586941257961010,30-APR-16 20.49.02,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9820524155,PAYTM.COM WALLET PG \\NOIDA\201310\\,400788,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05


In [None]:
# Label series (still the raw string codes at this point)
y = data_dt2['E_COM_INDICATOR']

In [None]:
# Categorical columns: every remaining column of data_dt2 except the label
cat_columns = data_dt2.columns[data_dt2.columns != 'E_COM_INDICATOR'].tolist()
cat_columns

['TRANSACTION_CURRENCY',
 'PD4',
 'PD11',
 'PROC_CODE_12',
 'PD8',
 'MCC',
 'POS_ENTRY_MODE',
 'SERVICE_CODE',
 'TRANSACTION_LOCAL_DATE',
 'TRANSMISSION_DATE_TIME',
 'RESPONSE_DATE_TIME',
 'PAN',
 'VALID_FROM',
 'EXPIRY_DATE',
 'PROC_CODE_34',
 'PROC_CODE_56',
 'PD1',
 'PD2',
 'PD3',
 'PD5',
 'PD6',
 'PD7',
 'PD9',
 'PD10',
 'PD12',
 'BILLING_CURRENCY',
 'CONVERSION_RATE_DATE',
 'SETTLEMENT_CURRENCY',
 'SETTLEMENT_DATE',
 'ACQUIRING_COUNTRY_CODE',
 'CARD_ACCEPTOR_TERM_ID',
 'CARD_ACCEPTOR_ID',
 'CARD_ACC_NAME_ADDRESS',
 'AUTHORIZATION_CODE',
 'DECISION',
 'TARGET',
 'C_100',
 'C_40',
 'C_20',
 'ECOM_INFO']

In [None]:
# Disabled alternative: converting the rest to strings and one-hot encoding them
# (kept for reference only; nothing in this cell executes).
# data_dt_custom = pd.DataFrame()
# encoder = OneHotEncoder(sparse=False)
# for col in data_dt2:
#   if col != 'E_COM_INDICATOR':
#     data_temp = data_dt2[[col]]
#     encoder.fit(data_temp.astype('str'))
#     temp = encoder.transform(data_temp.astype('str'))
#     temp = pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data_dt2[col].value_counts().index])
#     temp = temp.set_index(data_dt2.index.values)
#     data_dt_custom = pd.concat([data_dt_custom,temp],axis=1)

# data_dt_custom = pd.concat([data_dt_custom,data_dt[float_cols]],axis=1)
# data_dt_custom.head()

In [None]:
# Copy the categorical frame and re-attach the float columns, taking their
# values from data_dt.
data_dt_custom = data_dt2.assign(**{col: data_dt[col] for col in float_cols})
# Every column other than the label is a model input.
input_columns = data_dt_custom.columns[data_dt_custom.columns != 'E_COM_INDICATOR'].tolist()
input_columns

['TRANSACTION_CURRENCY',
 'PD4',
 'PD11',
 'PROC_CODE_12',
 'PD8',
 'MCC',
 'POS_ENTRY_MODE',
 'SERVICE_CODE',
 'TRANSACTION_LOCAL_DATE',
 'TRANSMISSION_DATE_TIME',
 'RESPONSE_DATE_TIME',
 'PAN',
 'VALID_FROM',
 'EXPIRY_DATE',
 'PROC_CODE_34',
 'PROC_CODE_56',
 'PD1',
 'PD2',
 'PD3',
 'PD5',
 'PD6',
 'PD7',
 'PD9',
 'PD10',
 'PD12',
 'BILLING_CURRENCY',
 'CONVERSION_RATE_DATE',
 'SETTLEMENT_CURRENCY',
 'SETTLEMENT_DATE',
 'ACQUIRING_COUNTRY_CODE',
 'CARD_ACCEPTOR_TERM_ID',
 'CARD_ACCEPTOR_ID',
 'CARD_ACC_NAME_ADDRESS',
 'AUTHORIZATION_CODE',
 'DECISION',
 'TARGET',
 'C_100',
 'C_40',
 'C_20',
 'ECOM_INFO',
 'BILLING_AMOUNT',
 'TRANSACTION_AMOUNT',
 'SETTLEMENT_AMOUNT']

In [None]:
# Number of model input columns (categorical columns plus the float columns)
len(input_columns)

43

In [None]:
# Peek at the label column before it is label-encoded
data_dt_custom['E_COM_INDICATOR'].head()

0    ECI05
1    ECI05
2    ECI05
3    ECI05
4    ECI05
Name: E_COM_INDICATOR, dtype: object

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the string labels, then map every label to its integer code.
labelencoder = LabelEncoder()
labelencoder.fit(data_dt_custom['E_COM_INDICATOR'])
num_target = labelencoder.transform(data_dt_custom['E_COM_INDICATOR'])


In [None]:
# Overwrite the string label column in place with its integer class codes
data_dt_custom['E_COM_INDICATOR'] = num_target

In [None]:
# Confirm the label column now holds integer codes
data_dt_custom['E_COM_INDICATOR'].head()

0    3
1    3
2    3
3    3
4    3
Name: E_COM_INDICATOR, dtype: int64

In [None]:
# From here on the prepared frame is referred to as `df`.
label_column = 'E_COM_INDICATOR'
df = data_dt_custom

# The label column already holds the integer class codes assigned from the
# LabelEncoder output, so no further label conversion is applied here.

# Model inputs: every column of `df` other than the label.
input_features = [name for name in df.columns if name != label_column]

# Inputs followed by the label column.
features_and_labels = [*input_features, label_column]

In [None]:

# Takes some time, please be patient
# df_to_examples is defined outside this part of the notebook; per the type check
# further down, each element of `examples` is a tf.train.Example proto.
examples = df_to_examples(data_dt_custom)

In [None]:
num_steps = 2000  #@param {type: "number"}

# Create a feature spec for the classifier
feature_spec = create_feature_spec(data_dt_custom, features_and_labels)

# Bind the examples, feature spec and label column into the input function used for training (nothing is trained in this cell)
train_inpf = functools.partial(tfexamples_input_fn, examples, feature_spec, label_column)


In [None]:
# Baseline model: a linear classifier over the feature columns.
# Takes some time, please be patient
classifier = tf.estimator.LinearClassifier(
    n_classes=7,
    feature_columns=create_feature_columns(input_features, feature_spec),
)
# Input function that feeds `examples` to the estimator, then train
train_inpf = functools.partial(
    tfexamples_input_fn, examples, feature_spec, label_column)
classifier.train(train_inpf, steps=num_steps)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp7ubpxdqb', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Varia

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f699aa8f4e0>

In [None]:
num_steps_2 = 2000  #@param {type: "number"}

# Feed-forward network with three hidden layers (128 -> 64 -> 32 units).
# Please be patient, takes some time
classifier2 = tf.estimator.DNNClassifier(
    hidden_units=[128, 64, 32],
    n_classes=7,
    feature_columns=create_feature_columns(input_features, feature_spec),
)
# Input function that feeds `examples` to the estimator, then train
train_inpf = functools.partial(
    tfexamples_input_fn, examples, feature_spec, label_column)
classifier2.train(train_inpf, steps=num_steps_2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmptprafu7_', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.


<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f698d8b2668>

###### **Putting it through a customized neural network and comparing**
`DNNClassifier` is also a feed-forward neural network; `hidden_units` lists the number of nodes in each hidden layer, in order.

In [None]:
# Customized variant of the DNN classifier: a single 512-unit hidden layer
# with ReLU activation, trained with the Adam optimizer.
# Takes time (around one hour), please be patient; to reduce time reduce num_steps_3
num_steps_3 = 2000
classifier3 = tf.estimator.DNNClassifier(
    hidden_units=[512],
    feature_columns=create_feature_columns(input_features, feature_spec),
    n_classes=7,
    optimizer='Adam',
    activation_fn=tf.nn.relu,
)
train_inpf = functools.partial(
    tfexamples_input_fn, examples, feature_spec, label_column)
classifier3.train(train_inpf, steps=num_steps_3)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp494v85sk', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.


<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f698d7be160>

##### Run the commented-out portion of the two cells below to compare, in the What-If Tool, classifiers 1 and 3 and classifiers 1 and 2, respectively.

In [None]:
# Widget height in pixels and number of examples to load into the What-If Tool.
tool_height_in_px = 1000
num_datapoints = 2000

from witwidget.notebook.visualization import WitConfigBuilder, WitWidget

# Uncomment to compare `classifier` against `classifier3` on the first
# num_datapoints examples:

# config_builder = WitConfigBuilder(examples[0:num_datapoints]).set_estimator_and_feature_spec(
#     classifier, feature_spec).set_compare_estimator_and_feature_spec(
#     classifier3, feature_spec)
# a = WitWidget(config_builder, height=tool_height_in_px)

In [None]:
# Widget height in pixels and number of examples to load into the What-If Tool.
tool_height_in_px = 1000
num_datapoints = 2000

from witwidget.notebook.visualization import WitConfigBuilder, WitWidget

# Uncomment to compare `classifier` against `classifier2` on the first
# num_datapoints examples:

# config_builder = WitConfigBuilder(examples[0:num_datapoints]).set_estimator_and_feature_spec(
#     classifier, feature_spec).set_compare_estimator_and_feature_spec(
#     classifier2, feature_spec)
# a = WitWidget(config_builder, height=tool_height_in_px)

##### **Putting it through any keras defined neural network model**
https://www.tensorflow.org/tutorials/load_data/pandas_dataframe
<br> See the TensorFlow tutorial linked above on loading a pandas DataFrame.

In [None]:
# Categorical frame (float columns already dropped) that gets label-encoded below
data_dt2.head()

Unnamed: 0,TRANSACTION_CURRENCY,PD4,PD11,PROC_CODE_12,PD8,MCC,POS_ENTRY_MODE,SERVICE_CODE,TRANSACTION_LOCAL_DATE,TRANSMISSION_DATE_TIME,RESPONSE_DATE_TIME,PAN,VALID_FROM,EXPIRY_DATE,PROC_CODE_34,PROC_CODE_56,PD1,PD2,PD3,PD5,PD6,PD7,PD9,PD10,PD12,BILLING_CURRENCY,CONVERSION_RATE_DATE,SETTLEMENT_CURRENCY,SETTLEMENT_DATE,ACQUIRING_COUNTRY_CODE,CARD_ACCEPTOR_TERM_ID,CARD_ACCEPTOR_ID,CARD_ACC_NAME_ADDRESS,AUTHORIZATION_CODE,DECISION,TARGET,C_100,C_40,C_20,ECOM_INFO,E_COM_INDICATOR
0,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,21-MAR-18 11.33.24,21-MAR-18 11.33.24,21-MAR-18 11.33.24,A1586941252148110,21-AUG-17 17.10.11,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,800204,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05
1,CURR356,ATC9,TOC1,TC00,TVI0,MCC4814,NNN,UNK,21-MAR-18 11.34.15,21-MAR-18 11.34.15,21-MAR-18 11.34.15,A1586941241334010,06-APR-16 20.19.15,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9822047692,PAYTM RETAIL -PG-ONLINE \\NOIDA\201301\,300255,RESULT000,0,AMI,ACS,T0797,AXASK05AEVV,ECI05
2,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,21-MAR-18 11.36.47,21-MAR-18 11.36.47,21-MAR-18 11.36.48,A1586941049519010,03-APR-18 14.19.03,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700408,RESULT000,0,AMI,ACS,T0537,AXASK05AEVV,ECI05
3,CURR356,ATC9,TOC1,TC00,TVI0,MCC7399,NNN,UNK,21-MAR-18 11.39.44,21-MAR-18 11.39.44,21-MAR-18 11.39.44,A1586941050671000,20-APR-18 19.43.48,01-APR-23 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9826826968,BOOK MY SHOW\\\400049\\,700585,RESULT000,0,AMI,ACS,T0796,AXASK05AEVV,ECI05
4,CURR356,ATC9,TOC1,TC00,TVI0,MCC5999,NNN,UNK,21-MAR-18 11.43.08,21-MAR-18 11.43.08,21-MAR-18 11.43.08,A1586941257961010,30-APR-16 20.49.02,01-APR-20 00.00.00,FR40,TO00,CRC1,CVC0,CCC0,PPI4,CPI0,TCR2,TCV0,TWC0,PEI0,CURR356,0,CURR356,22-MAR-18 00.00.00,CON356,6R00G052,9820524155,PAYTM.COM WALLET PG \\NOIDA\201310\\,400788,RESULT000,0,AMI,ACS,T0784,AXASK05AEVV,ECI05


##### **Label encoding of the categorical columns, keeping the float columns as is**

In [None]:
# Label-encode every column of data_dt2 (the label column included), using a
# fresh encoder per column.
data_dt_custom2 = pd.DataFrame()
for col in data_dt2.columns:
  le = LabelEncoder()
  data_dt_custom2[col] = le.fit(data_dt2[col]).transform(data_dt2[col])

# Re-attach the float columns unchanged, taking their values from data_dt.
data_dt_custom2 = data_dt_custom2.assign(**{col: data_dt[col] for col in float_cols})

data_dt_custom2.head()

Unnamed: 0,TRANSACTION_CURRENCY,PD4,PD11,PROC_CODE_12,PD8,MCC,POS_ENTRY_MODE,SERVICE_CODE,TRANSACTION_LOCAL_DATE,TRANSMISSION_DATE_TIME,RESPONSE_DATE_TIME,PAN,VALID_FROM,EXPIRY_DATE,PROC_CODE_34,PROC_CODE_56,PD1,PD2,PD3,PD5,PD6,PD7,PD9,PD10,PD12,BILLING_CURRENCY,CONVERSION_RATE_DATE,SETTLEMENT_CURRENCY,SETTLEMENT_DATE,ACQUIRING_COUNTRY_CODE,CARD_ACCEPTOR_TERM_ID,CARD_ACCEPTOR_ID,CARD_ACC_NAME_ADDRESS,AUTHORIZATION_CODE,DECISION,TARGET,C_100,C_40,C_20,ECOM_INFO,E_COM_INDICATOR,BILLING_AMOUNT,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT
0,14,5,1,0,0,258,14,6,9,0,0,934,28389,1,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27991,5564,70860,0,0,0,0,594,21702,3,147.84,147.84,145.22
1,14,5,1,0,0,100,14,6,10,1,1,875,7515,1,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27698,28286,16588,0,0,0,0,602,21702,3,399.0,399.0,391.94
2,14,5,1,0,0,258,14,6,11,2,2,492,2951,4,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27991,5564,59898,0,0,0,0,419,21702,3,247.2,247.2,242.82
3,14,5,1,0,0,258,14,6,12,3,3,529,26953,4,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27991,5564,59907,0,0,0,0,601,21702,3,359.0,359.0,352.65
4,14,5,1,0,0,217,14,6,15,6,5,961,43447,1,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27548,28293,27344,0,0,0,0,594,21702,3,188.0,188.0,183.81


In [None]:
# (rows, columns) of the encoded frame; the label column is still included here
data_dt_custom2.shape

(105271, 44)

In [None]:
# Per-column dtypes and non-null counts of the encoded frame
data_dt_custom2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105271 entries, 0 to 105270
Data columns (total 44 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   TRANSACTION_CURRENCY    105271 non-null  int64  
 1   PD4                     105271 non-null  int64  
 2   PD11                    105271 non-null  int64  
 3   PROC_CODE_12            105271 non-null  int64  
 4   PD8                     105271 non-null  int64  
 5   MCC                     105271 non-null  int64  
 6   POS_ENTRY_MODE          105271 non-null  int64  
 7   SERVICE_CODE            105271 non-null  int64  
 8   TRANSACTION_LOCAL_DATE  105271 non-null  int64  
 9   TRANSMISSION_DATE_TIME  105271 non-null  int64  
 10  RESPONSE_DATE_TIME      105271 non-null  int64  
 11  PAN                     105271 non-null  int64  
 12  VALID_FROM              105271 non-null  int64  
 13  EXPIRY_DATE             105271 non-null  int64  
 14  PROC_CODE_34        

In [None]:
# Same preview as shown right after the encoding cell
data_dt_custom2.head()

Unnamed: 0,TRANSACTION_CURRENCY,PD4,PD11,PROC_CODE_12,PD8,MCC,POS_ENTRY_MODE,SERVICE_CODE,TRANSACTION_LOCAL_DATE,TRANSMISSION_DATE_TIME,RESPONSE_DATE_TIME,PAN,VALID_FROM,EXPIRY_DATE,PROC_CODE_34,PROC_CODE_56,PD1,PD2,PD3,PD5,PD6,PD7,PD9,PD10,PD12,BILLING_CURRENCY,CONVERSION_RATE_DATE,SETTLEMENT_CURRENCY,SETTLEMENT_DATE,ACQUIRING_COUNTRY_CODE,CARD_ACCEPTOR_TERM_ID,CARD_ACCEPTOR_ID,CARD_ACC_NAME_ADDRESS,AUTHORIZATION_CODE,DECISION,TARGET,C_100,C_40,C_20,ECOM_INFO,E_COM_INDICATOR,BILLING_AMOUNT,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT
0,14,5,1,0,0,258,14,6,9,0,0,934,28389,1,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27991,5564,70860,0,0,0,0,594,21702,3,147.84,147.84,145.22
1,14,5,1,0,0,100,14,6,10,1,1,875,7515,1,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27698,28286,16588,0,0,0,0,602,21702,3,399.0,399.0,391.94
2,14,5,1,0,0,258,14,6,11,2,2,492,2951,4,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27991,5564,59898,0,0,0,0,419,21702,3,247.2,247.2,242.82
3,14,5,1,0,0,258,14,6,12,3,3,529,26953,4,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27991,5564,59907,0,0,0,0,601,21702,3,359.0,359.0,352.65
4,14,5,1,0,0,217,14,6,15,6,5,961,43447,1,4,0,1,0,0,4,0,1,0,0,0,1,0,1,1,29,40758,27548,28293,27344,0,0,0,0,594,21702,3,188.0,188.0,183.81


In [None]:
# Split the label off the feature frame. pop() removes the column from
# data_dt_custom2 in place, so the call is guarded: re-running this cell after
# the label has already been removed keeps the existing `target` instead of
# raising KeyError.
if 'E_COM_INDICATOR' in data_dt_custom2.columns:
  target = data_dt_custom2.pop('E_COM_INDICATOR')


#### **Method 1**

In [None]:
# Pair each feature row with its label as (features, target) dataset elements
dataset = tf.data.Dataset.from_tensor_slices(
    (data_dt_custom2.to_numpy(), target.to_numpy()))


In [None]:
# Show the Python types of one (features, target) element of the dataset
for feat, targ in dataset.take(1):
  print(type(feat))
  print(type(targ))

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [None]:
# Print the first five (features, target) pairs
for feat, targ in dataset.take(5):
  print ('Features: {}, Target: {}'.format(feat, targ))


Features: [1.4000e+01 5.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 2.5800e+02
 1.4000e+01 6.0000e+00 9.0000e+00 0.0000e+00 0.0000e+00 9.3400e+02
 2.8389e+04 1.0000e+00 4.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00
 0.0000e+00 4.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 1.0000e+00 0.0000e+00 1.0000e+00 1.0000e+00 2.9000e+01
 4.0758e+04 2.7991e+04 5.5640e+03 7.0860e+04 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 5.9400e+02 2.1702e+04 1.4784e+02 1.4784e+02
 1.4522e+02], Target: 3
Features: [1.4000e+01 5.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+02
 1.4000e+01 6.0000e+00 1.0000e+01 1.0000e+00 1.0000e+00 8.7500e+02
 7.5150e+03 1.0000e+00 4.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00
 0.0000e+00 4.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 1.0000e+00 0.0000e+00 1.0000e+00 1.0000e+00 2.9000e+01
 4.0758e+04 2.7698e+04 2.8286e+04 1.6588e+04 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 6.0200e+02 2.1702e+04 3.9900e+02 3.9900e+02
 3.9194e+02], Targ

In [None]:
# Shuffle with a buffer as large as the frame, then form batches of 32
train_dataset = dataset.shuffle(buffer_size=len(data_dt_custom2)).batch(batch_size=32)


In [None]:
def get_compiled_model(n_features=43, n_classes=7, hidden_units=512, dropout_rate=0.2):
  """Build and compile a one-hidden-layer softmax classifier.

  The defaults reproduce the network originally hard-coded here
  (43 inputs -> Dense(512, relu) -> Dropout(0.2) -> Dense(7, softmax)), so
  existing calls with no arguments get the same model.

  Args:
    n_features: number of input features per example.
    n_classes: number of output classes (size of the softmax layer).
    hidden_units: width of the single hidden Dense layer.
    dropout_rate: dropout rate applied after the hidden layer.

  Returns:
    A tf.keras Sequential model compiled with the Adam optimizer, sparse
    categorical cross-entropy loss and an accuracy metric.
  """
  model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(n_features,)),
    tf.keras.layers.Dense(hidden_units, activation='relu'),
    tf.keras.layers.Dropout(dropout_rate),
    tf.keras.layers.Dense(n_classes, activation='softmax')
  ])

  # Sparse loss: the labels are integer class codes, not one-hot vectors.
  model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
  return model

In [None]:
# Build the compiled network and print its layer/parameter summary
model = get_compiled_model()
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 43)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               22528     
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 3591      
Total params: 26,119
Trainable params: 26,119
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the shuffled, batched dataset; no validation data is passed
model.fit(train_dataset, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f69838e0f98>

##### Run the commented-out portion of the cell below to see the What-If Tool visualization

In [None]:
num_datapoints = 2000  
tool_height_in_px = 1000  

from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget



# Setup the tool with the test examples and the trained Keras model (disabled by default)
# NOTE(review): `model` is a Keras model rather than a tf.estimator - confirm that
# set_estimator_and_feature_spec accepts it. Also, `examples` were built from
# data_dt_custom, while `model` was trained on the label-encoded data_dt_custom2 values.

# config_builder = WitConfigBuilder(examples[0:num_datapoints]).set_estimator_and_feature_spec(
#     model, feature_spec)
# a = WitWidget(config_builder, height=tool_height_in_px)

#### **Method 2**

In [None]:
# One scalar Keras input per feature column, keyed by column name.
inputs = {
    column: tf.keras.layers.Input(shape=(), name=column)
    for column in data_dt_custom2.columns
}
# Stack the scalar inputs along the last axis into one (batch, n_features) tensor.
x = tf.stack(list(inputs.values()), axis=-1)

# Same architecture as Method 1: Dense(512, relu) -> Dropout(0.2) -> Dense(7, softmax).
x = tf.keras.layers.Dense(512, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(7, activation='softmax')(x)

model_func = tf.keras.Model(inputs=inputs, outputs=output)

model_func.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# Dataset of ({column name: values}, target) pairs in batches of 32, matching the named inputs of model_func.
# NOTE(review): unlike train_dataset in Method 1, this pipeline is not shuffled - confirm that is intended.
dict_slices = tf.data.Dataset.from_tensor_slices((data_dt_custom2.to_dict('series'), target.values)).batch(32)
# Print the first batch
for dict_slice in dict_slices.take(1):
  print (dict_slice)


({'TRANSACTION_CURRENCY': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])>, 'PD4': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5])>, 'PD11': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1])>, 'PROC_CODE_12': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>, 'PD8': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>, 'MCC': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([258, 100, 258, 258, 217, 217, 258, 217, 100, 258, 217, 

In [None]:
# Train the functional model on the dict-keyed batches; no validation data is passed
model_func.fit(dict_slices, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f232d0d3668>

##### Run the commented-out portion of the cell below to see the What-If Tool visualization

In [None]:
num_datapoints = 2000  
tool_height_in_px = 1000  

from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget



# Setup the tool with the test examples and the trained functional Keras model (disabled by default)
# NOTE(review): `model_func` is a Keras model rather than a tf.estimator - confirm that
# set_estimator_and_feature_spec accepts it. Also, `examples` were built from
# data_dt_custom, while `model_func` was trained on the label-encoded data_dt_custom2 columns.

# config_builder = WitConfigBuilder(examples[0:num_datapoints]).set_estimator_and_feature_spec(
#     model_func, feature_spec)
# a = WitWidget(config_builder, height=tool_height_in_px)

In [None]:
# Inspect the feature spec built from data_dt_custom (feature name -> FixedLenFeature)
feature_spec

{'ACQUIRING_COUNTRY_CODE': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'AUTHORIZATION_CODE': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'BILLING_AMOUNT': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'BILLING_CURRENCY': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'CARD_ACCEPTOR_ID': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'CARD_ACCEPTOR_TERM_ID': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'CARD_ACC_NAME_ADDRESS': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'CONVERSION_RATE_DATE': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'C_100': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'C_20': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'C_40': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'DECISION': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 '

In [None]:
# Check the element type of the examples list
type(examples[0])

tensorflow.core.example.example_pb2.Example

### **ROUGH WORK**

In [None]:
label_column = 'E_COM_INDICATOR'
# All columns of the label-encoded frame other than the label itself
input_columns = list(filter(lambda name: name != 'E_COM_INDICATOR', data_dt_custom2.columns))
input_columns

['TRANSACTION_CURRENCY',
 'PD4',
 'PD11',
 'PROC_CODE_12',
 'PD8',
 'MCC',
 'POS_ENTRY_MODE',
 'SERVICE_CODE',
 'TRANSACTION_LOCAL_DATE',
 'TRANSMISSION_DATE_TIME',
 'RESPONSE_DATE_TIME',
 'PAN',
 'VALID_FROM',
 'EXPIRY_DATE',
 'PROC_CODE_34',
 'PROC_CODE_56',
 'PD1',
 'PD2',
 'PD3',
 'PD5',
 'PD6',
 'PD7',
 'PD9',
 'PD10',
 'PD12',
 'BILLING_CURRENCY',
 'CONVERSION_RATE_DATE',
 'SETTLEMENT_CURRENCY',
 'SETTLEMENT_DATE',
 'ACQUIRING_COUNTRY_CODE',
 'CARD_ACCEPTOR_TERM_ID',
 'CARD_ACCEPTOR_ID',
 'CARD_ACC_NAME_ADDRESS',
 'AUTHORIZATION_CODE',
 'DECISION',
 'TARGET',
 'C_100',
 'C_40',
 'C_20',
 'ECOM_INFO',
 'BILLING_AMOUNT',
 'TRANSACTION_AMOUNT',
 'SETTLEMENT_AMOUNT']

In [None]:
import datetime

# Build the feature matrix from the label-encoded frame (data_dt_custom2), which
# is also the frame input_columns was derived from. The categorical columns of
# data_dt_custom still hold their raw string values, which the Dense network
# defined below cannot consume.
X = np.asarray(data_dt_custom2[input_columns])
# Integer class codes written into data_dt_custom from the LabelEncoder output.
y = np.asarray(data_dt_custom[label_column])
print(X.shape)
print(y.shape)

(105271, 43)
(105271,)


In [None]:
# Hold out 10% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore the split, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(94743, 43)
(10528, 43)
(94743,)
(10528,)


In [None]:
# Same architecture as get_compiled_model(), assembled layer by layer (not yet compiled)
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(43,)))
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(7, activation='softmax'))

In [None]:
# Compile the Sequential model built in the previous cell and print its summary.
# Sparse categorical cross-entropy is used because the labels are integer class codes.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 43)                0         
_________________________________________________________________
dense (Dense)                (None, 512)               22528     
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 3591      
Total params: 26,119
Trainable params: 26,119
Non-trainable params: 0
_________________________________________________________________


In [None]:
import tempfile
# Fresh temporary directory passed to the estimator as its model_dir
model_dir = tempfile.mkdtemp()
# Wrap the compiled Keras model in a tf.estimator Estimator
keras_estimator = tf.keras.estimator.model_to_estimator(
    keras_model=model, model_dir=model_dir)


In [None]:
# NOTE(review): the recorded output of this cell is a TypeError, and no definition of
# input_fn is visible in this part of the notebook - confirm what input_fn should be
# (train() needs a callable that returns the features and labels).
keras_estimator.train(input_fn)


TypeError: ignored

In [None]:
# Distinct label values.
# NOTE(review): `data` is not defined in the visible part of the notebook - confirm where it is loaded.
set(data['E_COM_INDICATOR'])

In [None]:
# Name of the label column in `data`
label_col = 'E_COM_INDICATOR'

In [None]:
# Features are every column of `data` except the label; the label is the target
cols = data.columns[data.columns != label_col].tolist()
y = data[label_col]
X = data[cols]

In [None]:
# Hold out 10% of the rows for testing; a fixed random_state makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Class balance of the label. The column is passed through the x= / data=
# keywords: a positional data vector is deprecated in seaborn 0.11 and is
# interpreted as `data` in later releases.
sns.countplot(x='E_COM_INDICATOR', data=data)

In [None]:
# L2 penalty shared by the Dense layers of the image model
Regularizer = l2(1e-3)
# Stop training once val_loss has not improved for 20 consecutive epochs
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    mode="auto",
)

##### **Fitting of InceptionV3**

In [None]:
IMG_HEIGHT = 299
IMG_WIDTH = 299

# ImageNet-pretrained InceptionV3 without its classification top, used as a
# frozen feature extractor.
incep_v3 = InceptionV3(
    include_top=False,
    weights='imagenet',
    input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
)
incep_v3.trainable = False

# Classification head: flatten, two regularized 100-unit ReLU layers, softmax.
output = Flatten()(incep_v3.output)
for _hidden_idx in range(2):
  output = Dense(100, activation='relu', activity_regularizer=Regularizer,
                 kernel_regularizer=Regularizer)(output)
# NOTE(review): this head has 15 output classes while the tabular models above
# use 7 - confirm which label set this image model targets.
incep_v3_out = Dense(15, activation='softmax', activity_regularizer=Regularizer,
                     kernel_regularizer=Regularizer)(output)

model_v3 = Model(inputs=incep_v3.input, outputs=incep_v3_out)

model_v3.summary()

In [None]:
# Optimizer settings for model_v3: Adam with a very small learning rate.
INIT_LR = 1e-6
BS = 32
opt = Adam(learning_rate=INIT_LR)

# categorical_crossentropy expects one-hot encoded targets.
model_v3.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
import graphviz
from tensorflow.keras.utils import plot_model

# Draw the layer graph of model_v3, annotated with layer names and tensor shapes
plot_model(model_v3, show_layer_names=True, show_shapes=True)