In [1]:
import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [76]:
# test_data = pd.read_csv('../DATA/test_normalized_4labels.csv', index_col=0)
# training_data = pd.read_csv(
#     '../DATA/train_normalized_4labels.csv', index_col=0)

test_data = pd.read_csv('../DATA/test_downsampling.csv', index_col=0)
training_data = pd.read_csv(
    '../DATA/training_downsampling.csv', index_col=0)


In [77]:
np.unique(test_data['CASE_STATUS'])

array(['CERTIFIED', 'CERTIFIED-WITHDRAWN', 'DENIED', 'WITHDRAWN'],
      dtype=object)

In [78]:
labels = {'CERTIFIED': 0, 'CERTIFIED-WITHDRAWN': 1, 'DENIED': 2, 'WITHDRAWN':3}
test_data['CASE_STATUS'] = test_data['CASE_STATUS'].apply(lambda x: labels[x])
training_data['CASE_STATUS'] = training_data['CASE_STATUS'].apply(lambda x: labels[x])

In [79]:
train_data, val_data = train_test_split(training_data, test_size=0.2)

In [80]:
train_labels = np.array(train_data.pop('CASE_STATUS'))
val_labels = np.array(val_data.pop('CASE_STATUS'))
test_labels = np.array(test_data.pop('CASE_STATUS'))

train_data.pop('EMPLOYER_NAME')
test_data.pop('EMPLOYER_NAME')
val_data.pop('EMPLOYER_NAME')

train_features = np.array(train_data)
val_features = np.array(val_data)
test_features = np.array(test_data)



In [81]:
transfer_dict = {0: [1,0,0,0],  1:[0,1,0,0], 2: [0,0,1,0], 3:[0,0,0,1]}
train_labels = np.array([transfer_dict[i] for i in train_labels] )
test_labels = np.array([transfer_dict[i] for i in test_labels] )
val_labels = np.array([transfer_dict[i] for i in val_labels])


In [82]:
print(train_features.shape)
print(test_features.shape)
print(val_features.shape)
print(train_labels.shape)
print(test_labels.shape)
print(val_labels.shape)


(79193, 420)
(730531, 420)
(19799, 420)
(79193, 4)
(730531, 4)
(19799, 4)


In [114]:
METRICS = [
    keras.metrics.mean_squared_error,
    keras.metrics.mean_absolute_error,
    keras.metrics.mean_absolute_percentage_error,
    keras.metrics.categorical_accuracy,
]


def make_model(metrics=METRICS, output_bias=None):
  if output_bias is not None:
    output_bias = tf.keras.initializers.Constant(output_bias)
  model = keras.Sequential([
      keras.layers.Dense(
          256, activation='relu',
          input_shape=(train_features.shape[-1],)),
      keras.layers.Dense(
          64, activation='relu'),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(4, activation='softmax',
                         bias_initializer=output_bias),
  ])

  model.compile(
      optimizer=keras.optimizers.Adam(learning_rate=1e-3),
      loss=keras.losses.CategoricalHinge(),
      metrics=metrics)

  return model


In [117]:
EPOCHS = 100
BATCH_SIZE = 2048

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_categorical_accuracy',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)
model = make_model()
model.summary()


Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 256)               107776    
_________________________________________________________________
dense_43 (Dense)             (None, 64)                16448     
_________________________________________________________________
dropout_14 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_44 (Dense)             (None, 4)                 260       
Total params: 124,484
Trainable params: 124,484
Non-trainable params: 0
_________________________________________________________________


In [118]:
model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_features, val_labels),
    callbacks=[early_stopping],
    verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb32a0f8490>

In [119]:
prediction = model.predict(test_data)

In [120]:
prediction

array([[1.38155569e-03, 2.40176490e-07, 9.98541951e-01, 7.62843483e-05],
       [9.98576999e-01, 2.81030632e-04, 1.13979576e-03, 2.10587609e-06],
       [9.92561638e-01, 7.43769063e-03, 5.86193551e-07, 1.23971134e-07],
       ...,
       [1.00000000e+00, 2.10931326e-08, 1.30545404e-17, 2.65401522e-16],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.93461846e-09, 1.27565313e-14, 1.33639269e-08, 1.00000000e+00]],
      dtype=float32)

In [121]:
np.mean(keras.metrics.categorical_accuracy(test_labels,prediction).numpy())

0.85476863

In [122]:
prediction = np.array([list(i).index(max(i)) for i in prediction])


In [123]:
truelabels = np.array([list(i).index(max(i)) for i in test_labels])


In [124]:
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score
f1_score(truelabels, prediction, average=None)


array([0.91555631, 0.90782821, 0.19896587, 0.5147419 ])