In [None]:
# Seed NumPy's global random generator first so any NumPy-based randomness
# in later cells is repeatable across kernel restarts.
from numpy.random import seed
seed(1)

# Seed TensorFlow's global generator as well (it is independent of NumPy's).
import tensorflow as tf
tf.random.set_seed(2)

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

from tensorboard.plugins.hparams import api as hp

In [None]:
# Show the GPUs TensorFlow can see; an empty list means no GPU was detected.
tf.config.list_physical_devices('GPU')

In [None]:
import pandas as pd

# Read the three pre-processed, tab-separated splits (train / validation / test).
train_data, validation_data, test_data = (
    pd.read_csv(dataset_path, sep="\t")
    for dataset_path in (
        'BASE-PREPROCESSED(TRAIN).gz',
        'BASE-PREPROCESSED(VALIDACAO).gz',
        'BASE-PREPROCESSED(TESTE).gz',
    )
)

In [None]:
def parameter_nomalization(data):
  """Oversample the rows whose ``ALVO`` equals 0 by appending whole copies.

  The number of extra copies is ``(rows with ALVO != 0) // (rows with ALVO == 0)``,
  exactly as in the original implementation; the copies are appended after the
  original rows and keep their original index labels.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing an ``ALVO`` column.

  Returns
  -------
  pandas.DataFrame
      ``data`` followed by the replicated ``ALVO == 0`` rows. The input frame
      is returned unchanged when there is nothing to replicate.
  """
  class_min = data.loc[data['ALVO'] == 0]

  # Without any ALVO == 0 row the ratio below would divide by zero.
  if class_min.empty:
      return data

  times = (data.shape[0] - class_min.shape[0]) // class_min.shape[0]
  if times <= 0:
      return data

  # A single concat avoids re-copying the growing frame on every iteration.
  return pd.concat([data] + [class_min] * times,
                   ignore_index=False, verify_integrity=False, sort=False)

# Apply the same ALVO == 0 oversampling to every split.
# NOTE(review): this also replicates rows in the validation and test frames,
# so the reported metrics are computed on resampled data - confirm intended.
train_data, validation_data, test_data = [
    parameter_nomalization(split_frame)
    for split_frame in (train_data, validation_data, test_data)
]

In [None]:
# Remove the PROPHET_NORM_FEATURES column from each split.
train_data, validation_data, test_data = [
    split_frame.drop(columns=['PROPHET_NORM_FEATURES'])
    for split_frame in (train_data, validation_data, test_data)
]

In [None]:
# Columns that are never fed to the network.
SOCIAL_CLASS_COLUMNS = ['CLASSE_SOCIAL_PCNC_PCM_rv_1', 'CLASSE_SOCIAL_PCNC_PCM_rv_0', 'CLASSE_SOCIAL_PCNC_PCM_rv_2']
NON_FEATURE_COLUMNS = ['ALVO', 'PROPHET_LABEL', 'NEURO_LABEL']


def split_features_target(frame):
    """Return ``(features, target)`` NumPy arrays for one data split.

    The target is the ``ALVO`` column. The features are every remaining column
    except the label columns and the social-class columns. The social-class
    columns are dropped only if present, so the helper works both for frames
    that still carry them and for frames where they are already gone.
    """
    features = (
        frame
        .drop(columns=SOCIAL_CLASS_COLUMNS, errors='ignore')
        .drop(columns=NON_FEATURE_COLUMNS)
    )
    return features.to_numpy(), frame['ALVO'].to_numpy()


train_data = train_data.drop(SOCIAL_CLASS_COLUMNS, axis=1)

# Previously the social-class columns were removed from the training frame
# only; if the validation/test frames also contain them, the feature counts
# diverge and the model cannot consume those splits.
x_train, y_train = split_features_target(train_data)
x_val, y_val = split_features_target(validation_data)
x_test, y_test = split_features_target(test_data)

assert x_train.shape[1] == x_val.shape[1] == x_test.shape[1], \
    "train/validation/test feature counts differ"

In [None]:
from keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score

from scipy.stats import ks_2samp

In [None]:
# Search space: width of each of the three hidden layers, optimizer,
# hidden-layer activation and the L2 regularisation interval.
HP_NUM_UNITS1 = hp.HParam('num_units 1', hp.Discrete([32,64,128,256]))
HP_NUM_UNITS2 = hp.HParam('num_units 2', hp.Discrete([32,64,128,256]))
HP_NUM_UNITS3 = hp.HParam('num_units 3', hp.Discrete([32,64,128,256]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam','RMSprop']))
HP_ACTIVATION = hp.HParam('activation', hp.Discrete(['relu', 'tanh']))
HP_L2 = hp.HParam('l2 regularizer', hp.RealInterval(.001,.01))

# Summary tags under which each trial's test metrics are written.
METRIC_ACCURACY = 'accuracy'
METRIC_RECALL = 'recall'
METRIC_PRECISION = 'precision'
METRIC_F1 = 'f1'
METRIC_AUROC = 'auroc'
METRIC_AUPR = 'aupr'
METRIC_KS = 'ks'

# Declare the experiment once at the log root. Every metric that the trials
# write is registered here (previously only accuracy was), so the HParams
# dashboard can show all of them.
with tf.summary.create_file_writer('logs/results').as_default():
  hp.hparams_config(
    hparams=[HP_NUM_UNITS1,HP_NUM_UNITS2, HP_NUM_UNITS3, HP_L2 ,HP_OPTIMIZER, HP_ACTIVATION],
    metrics=[
      hp.Metric(METRIC_ACCURACY, display_name='Accuracy'),
      hp.Metric(METRIC_RECALL, display_name='Recall'),
      hp.Metric(METRIC_PRECISION, display_name='Precision'),
      hp.Metric(METRIC_F1, display_name='F1'),
      hp.Metric(METRIC_AUROC, display_name='AUROC'),
      hp.Metric(METRIC_AUPR, display_name='AUPR'),
      hp.Metric(METRIC_KS, display_name='KS'),
    ],
  )

In [None]:
def aux(s, g):
    """Look up label ``s`` in ``g``; return 0 when ``s`` is not in ``g.index``."""
    return g[s] if s in g.index else 0

def kstest(x, y):
    """Empirical per-class cumulative score distributions on a fixed grid.

    Parameters
    ----------
    x : array-like
        Scores; flattened to one dimension before use.
    y : array-like
        Class labels (0 or 1), one per score.

    Returns
    -------
    (data0, data1) : tuple of list of float
        For each threshold ``t = k * 0.0001`` with ``k = 0 .. 10009``, the
        fraction of class-0 (respectively class-1) scores that are ``<= t``.
        A class with no samples yields a list of zeros.

    Notes
    -----
    The denominators are the actual per-class counts. The previous code took
    them from ``value_counts().values``, which is ordered by frequency, so the
    two counts were swapped whenever class 1 was the more frequent class.
    The grid is generated from integer steps instead of repeated float
    addition, and the counting is done with one sort per class rather than
    one full scan per threshold.
    """
    import numpy as np

    scores = np.asarray(x, dtype=float).ravel()
    labels = np.asarray(y).ravel()
    thresholds = np.arange(10010) * 0.0001

    def cumulative_fraction(class_label):
        class_scores = np.sort(scores[labels == class_label])
        if class_scores.size == 0:
            return [0.0] * thresholds.size
        # side='right' counts the scores that are <= threshold.
        counts = np.searchsorted(class_scores, thresholds, side='right')
        return (counts / class_scores.size).tolist()

    return cumulative_fraction(0), cumulative_fraction(1)

def compute_performance_metrics(y, y_pred_class, y_pred_scores=None):
    """Compute classification metrics for a binary target.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred_class : array-like
        Predicted labels; flattened to one dimension.
    y_pred_scores : array-like, optional
        Continuous scores; flattened to one dimension. When given, the
        score-based metrics are appended to the result.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f1)`` or, when scores are supplied,
        ``(accuracy, recall, precision, f1, auroc, aupr, ks)`` where ``ks`` is
        the object returned by ``scipy.stats.ks_2samp`` (index 0 is the
        statistic, index 1 the p-value).
    """
    import numpy as np

    # Flatten so column vectors of shape (n, 1) are handled like 1-D arrays.
    y_true = np.asarray(y).ravel()
    y_class = np.asarray(y_pred_class).ravel()

    accuracy = accuracy_score(y_true, y_class)
    recall = recall_score(y_true, y_class)
    precision = precision_score(y_true, y_class)
    f1 = f1_score(y_true, y_class)
    performance_metrics = (accuracy, recall, precision, f1)

    if y_pred_scores is not None:
        scores = np.asarray(y_pred_scores).ravel()
        auroc = roc_auc_score(y_true, scores)
        aupr = average_precision_score(y_true, scores)
        # KS separation: compare the score samples of the two classes.
        # The previous code passed the cumulative fractions returned by
        # kstest() to ks_2samp, i.e. it tested the CDF values as if they were
        # samples, which does not measure class separation.
        ks = ks_2samp(scores[y_true == 0], scores[y_true == 1])
        performance_metrics = performance_metrics + (auroc, aupr, ks)
    return performance_metrics

In [None]:
def train_test_model(hparams):
  """Train one network for the given hyperparameters and score it on the test split.

  Reads the module-level arrays ``x_train``/``y_train``, ``x_val``/``y_val``
  and ``x_test``/``y_test``.

  Parameters
  ----------
  hparams : dict
      Maps the ``HP_*`` hyperparameter objects to the values for this trial.

  Returns
  -------
  tuple
      ``(accuracy, recall, precision, f1, auroc, aupr, ks_statistic)``
      measured on the test split.
  """
  # Use the swept L2 strength; it was previously hard-coded to 0.001, which
  # made every pair of trials that differed only in HP_L2 identical.
  l2_strength = hparams[HP_L2]

  def hidden_layer(units):
    return tf.keras.layers.Dense(
        units,
        kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
        activation=hparams[HP_ACTIVATION])

  model = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    hidden_layer(hparams[HP_NUM_UNITS1]),
    hidden_layer(hparams[HP_NUM_UNITS2]),
    hidden_layer(hparams[HP_NUM_UNITS3]),
    # Sigmoid keeps the score in [0, 1], matching the 0/1 target and the 0.5
    # decision threshold below. The swept activation (relu/tanh) applies to
    # the hidden layers only.
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model.compile(optimizer=hparams[HP_OPTIMIZER],
                loss='mean_squared_error',
                metrics=['accuracy'])

  # Use the tf.keras callback so it matches the tf.keras model above.
  es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='auto', verbose=1, min_delta=0.001, patience=10)

  print("Training...")
  model.fit(x_train,
            y_train,
            validation_data=(x_val, y_val),
            epochs=500,
            batch_size=131072,
            callbacks=[es])

  print("Evaluating...")
  # predict() returns shape (n, 1); flatten for the metric functions.
  y_pred_scores = model.predict(x_test).ravel()
  # Sequential.predict_classes is not available in recent TensorFlow; for a
  # single-unit output it thresholded the score at 0.5, reproduced here.
  y_pred_class = (y_pred_scores > 0.5).astype('int32')
  accuracy, recall, precision, f1, auroc, aupr, ks = compute_performance_metrics(y_test, y_pred_class, y_pred_scores)

  print("Done!")

  # ks is the ks_2samp result; index 0 is the KS statistic.
  return accuracy, recall, precision, f1, auroc, aupr, ks[0]

In [None]:
def run(run_dir, hparams):
  """Run one trial and write its hyperparameters and test metrics to ``run_dir``.

  Parameters
  ----------
  run_dir : str
      Directory for this trial's summary file writer.
  hparams : dict
      Hyperparameter values for the trial, keyed by the ``HP_*`` objects.
  """
  writer = tf.summary.create_file_writer(run_dir)
  with writer.as_default():
    # Record the hyperparameter values used in this trial.
    hp.hparams(hparams)

    accuracy, recall, precision, f1, auroc, aupr, ks = train_test_model(hparams)

    # One scalar per metric, all written at step 1.
    tagged_values = (
        (METRIC_ACCURACY, accuracy),
        (METRIC_RECALL, recall),
        (METRIC_PRECISION, precision),
        (METRIC_F1, f1),
        (METRIC_AUROC, auroc),
        (METRIC_AUPR, aupr),
        (METRIC_KS, ks),
    )
    for tag, value in tagged_values:
      tf.summary.scalar(tag, value, step=1)

In [None]:
# Exhaustive grid search. Iteration order matches the nested-loop form:
# units1 -> units2 -> units3 -> activation -> l2 (interval end points only)
# -> optimizer, with the optimizer varying fastest.
session_num = 0
trial_grid = (
    {
        HP_NUM_UNITS1: units_first,
        HP_NUM_UNITS2: units_second,
        HP_NUM_UNITS3: units_third,
        HP_ACTIVATION: activation_name,
        HP_L2: l2_strength,
        HP_OPTIMIZER: optimizer_name,
    }
    for units_first in HP_NUM_UNITS1.domain.values
    for units_second in HP_NUM_UNITS2.domain.values
    for units_third in HP_NUM_UNITS3.domain.values
    for activation_name in HP_ACTIVATION.domain.values
    for l2_strength in (HP_L2.domain.min_value, HP_L2.domain.max_value)
    for optimizer_name in HP_OPTIMIZER.domain.values
)

for hparams in trial_grid:
    run_name = "run-%d" % session_num

    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})

    # Each trial logs to its own sub-directory of logs/results.
    run('logs/results/' + run_name, hparams)
    session_num += 1