In [None]:
import os
import random
import math
import h5py

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedShuffleSplit
import scikitplot as skplt
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import pickle
import bert
#import keras
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.python.framework import ops
from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
from tensorflow.python.ops import clip_ops

import Utils.data_utils as data

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all.
# Every visible GPU is configured; on a CPU-only machine the list is empty,
# the loop body never runs, and the notebook keeps working (the original
# `[0]` lookup raised IndexError in that case).
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
def createTokenizer(model_path):
    """Create the `bert` FullTokenizer for a BERT model directory.

    - model_path: directory of the BERT model; the vocabulary is read from
      the `vocab.txt` file directly inside it.

    The tokenizer is created with `do_lower_case=True`.
    """
    vocab_file = model_path + '/vocab.txt'
    return bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case=True)
    
def format_text(text, tokenizer, max_len=None):
    """Convert raw text into a fixed-length array of BERT token ids.

    The text is tokenized, wrapped as ``[CLS] tokens [SEP]`` and right-padded
    with id 0 up to ``max_len``.

    - text: string to encode.
    - tokenizer: object exposing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.
    - max_len: total output length, special tokens included. Defaults to the
      notebook-level ``max_seq_len``.

    Returns a 1-D ``int32`` numpy array of length ``max_len``.

    Texts that are too long keep their LAST ``max_len - 2`` tokens. The
    previous code first kept the last ``max_len`` tokens and then the first
    ``max_len - 2`` of those, which dropped the final two tokens of very long
    texts and head-truncated texts that were only one or two tokens too long.
    """
    if max_len is None:
        max_len = max_seq_len

    tokens = tokenizer.tokenize(text)

    # Two positions are reserved for [CLS] and [SEP].
    budget = max(max_len - 2, 0)
    if len(tokens) > budget:
        tokens = tokens[len(tokens) - budget:]

    input_sequence = ["[CLS]"] + tokens + ["[SEP]"]
    pad_len = max_len - len(input_sequence)
    return np.asarray(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len).astype('int32')

def StratifiedBatches(s, batch_size):
    """Shuffle the data and cut it into batches with a stable labeled/unlabeled mix.

    - s: pair ``(X, y)`` of numpy arrays with the same first dimension.
      Rows with ``y[:, 0] == 0`` are treated as labeled and rows with
      ``y[:, 0] == 1`` as unlabeled; rows with any other value in column 0
      are not placed in any batch.
    - batch_size: target number of rows per batch; the number of batches is
      ``ceil(len(X) / batch_size)``.

    Returns ``(x_batches, y_batches)``: two lists of numpy arrays, one entry
    per batch. Each group (labeled, unlabeled) is spread as evenly as
    ``np.array_split`` allows over the batches, and rows are shuffled inside
    every batch. Randomness comes from the global ``np.random`` state.
    """
    X_train, y_train = s
    order = np.random.permutation(len(X_train))
    X_train, y_train = X_train[order], y_train[order]

    lab_index = np.where(y_train[:, 0] == 0)[-1]
    unl_index = np.where(y_train[:, 0] == 1)[-1]

    n_splits = int(np.ceil(X_train.shape[0] / batch_size))
    lab_splits = np.array_split(lab_index, n_splits)
    unl_splits = np.array_split(unl_index, n_splits)

    x_batches, y_batches = [], []
    for lab_part, unl_part in zip(lab_splits, unl_splits):
        batch_index = np.concatenate((lab_part, unl_part), axis=0)
        # Shuffle so labeled rows are not always at the front of the batch.
        batch_index = batch_index[np.random.permutation(len(batch_index))]
        # Fancy indexing keeps the feature dimension even for an empty batch.
        x_batches.append(X_train[batch_index])
        y_batches.append(y_train[batch_index])

    return x_batches, y_batches

def get_label_mask(y_true):
    """Return a boolean mask selecting the rows whose first column is below 0.5.

    Elsewhere in this notebook column 0 is set to 1 for unlabeled rows and 0
    for labeled ones, so the mask is True on labeled rows.
    """
    first_column = y_true[:, 0]
    return first_column < 0.5

def metric_mask_filter(y_true, y_pred):
    """Keep only the rows selected by `get_label_mask` and drop column 0.

    Returns the pair (y_true, y_pred) restricted to those rows, with the
    first column removed from both.
    """
    labeled_rows = get_label_mask(y_true)
    kept_true = tf.boolean_mask(y_true, labeled_rows)
    kept_pred = tf.boolean_mask(y_pred, labeled_rows)
    return kept_true[:, 1:], kept_pred[:, 1:]

def save_processed(t_save_path, s_save_path, split=False):
    """Pickle the processed arrays held in the notebook's global variables.

    - t_save_path: file receiving [Xtrain, ytrain, Xdev, ydev, Xtest, ytest].
    - s_save_path: file receiving [X_train, y_train, X_unl, y_unl];
      written only when `split` is True.
    - split: whether the labeled/unlabeled split is saved as well.

    The arrays are not passed in: they are read from the globals of the
    same names, which must already exist when this is called.
    """
    with open(t_save_path, 'wb') as total_file:
        pickle.dump([Xtrain, ytrain, Xdev, ydev, Xtest, ytest], total_file)
    if not split:
        return
    with open(s_save_path, 'wb') as split_file:
        pickle.dump([X_train, y_train, X_unl, y_unl], split_file)

def load_processed(t_save_path, s_save_path, split=False):
    """Load the pickles written by `save_processed`.

    - t_save_path: pickle holding [Xtrain, ytrain, Xdev, ydev, Xtest, ytest].
    - s_save_path: pickle holding [X_train, y_train, X_unl, y_unl];
      read only when `split` is True.

    Returns the first object alone when `split` is False, otherwise the
    tuple (total, split).
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(t_save_path, 'rb') as total_file:
        total_bundle = pickle.load(total_file)
    if not split:
        return total_bundle
    with open(s_save_path, 'rb') as split_file:
        split_bundle = pickle.load(split_file)
    return total_bundle, split_bundle

In [None]:
# Bound used with tf.clip_by_value before tf.math.log in the loss functions.
epsilon = 1e-8
# NOTE(review): LATENT_Z is not referenced in the visible cells; SSGAN_BERT
# takes its own `latent_dim` argument (default 100).
LATENT_Z = 100
# Sequence length read as a global by format_text.
max_seq_len = 250
# Directory of the pretrained BERT model (vocab.txt, config and checkpoint).
path_bert_model = './BERT-models/BERT-Mini'
# path_bert_model = './BERT-models/BERT-Mini'

# Raw train/test files, loaded below with a tab separator.
path_train = 'dataset/Raw/3c_Train.csv'
path_test = 'dataset/Raw/3c_Test.csv'

# Load the raw data through the project helper (Utils.data_utils) and carve
# a dev split out of it; `dev_split=0.15` is presumably the dev fraction
# (TODO confirm against Utils.data_utils).
dataset = data.Dataset()
dataset.load_csv(path_train, path_test, label_name='blabel', separator='\t')
dataset.make_dev_split(dev_split=0.15)
fig = dataset.classes_distribution()

## Load processed Data

In [None]:
# Load the cached arrays of the "5percent" split and unpack them into the
# global names used by the rest of the notebook.
# NOTE: a later cell reloads `t` and `s` from the "1percent" files, which
# overwrites everything assigned here when the notebook is run top to bottom.
t, s = load_processed("dataset/splitedData/3c_BERT_processed_Data/5percent/Total", \
               "dataset/splitedData/3c_BERT_processed_Data/5percent/Split", split=True)
Xtrain, ytrain, Xdev, ydev, Xtest, ytest = t
X_train, y_train, X_unl, y_unl = s

## Data preprocessing (Only if processed data is not available)

In [None]:
# Strip double quotes from the review text of every partition.
# NOTE(review): this assigns back onto the object returned by get_train() /
# get_dev() / get_test(); it only sticks if those return the stored frame
# rather than a copy — confirm in Utils.data_utils.
dataset.get_train().review = dataset.get_train().review.replace('"','', regex=True)
dataset.get_dev().review = dataset.get_dev().review.replace('"','', regex=True)
dataset.get_test().review = dataset.get_test().review.replace('"','', regex=True)

tokenizer = createTokenizer(path_bert_model)

# Encode every review as a fixed-length array of token ids.
Xtrain = np.asarray([format_text(text, tokenizer) for text in dataset.get_train().review])
Xtest = np.asarray([format_text(text, tokenizer) for text in dataset.get_test().review])
Xdev = np.asarray([format_text(text, tokenizer) for text in dataset.get_dev().review])

# One-hot encode the labels after shifting them so the smallest label is 0.
# The shift uses each partition's own minimum, so the three encodings only
# agree if every partition contains the smallest label.
ytrain = to_categorical(dataset.get_train_y()-dataset.get_train_y().min())
ytest = to_categorical(dataset.get_test_y()-dataset.get_test_y().min())
ydev = to_categorical(dataset.get_dev_y()-dataset.get_dev_y().min())

In [None]:
# Fraction of the training set treated as unlabeled; the remaining
# 1 - unl_ratio is kept as labeled training data.
unl_ratio = 0.99 # 1 - unl_ratio = % train
# n_splits=2 yields two splits and the loop overwrites its variables on each
# pass, so only the second (last) split is kept.
sss = StratifiedShuffleSplit(n_splits=2, test_size=unl_ratio, random_state=0)
for train_index, test_index in sss.split(Xtrain, ytrain):
    X_train, X_unl = Xtrain[train_index], Xtrain[test_index]
    y_train, y_unl = ytrain[train_index], ytrain[test_index]

In [None]:
# Cache the arrays currently held in the notebook globals as the "1percent"
# files.
# NOTE(review): this writes under "../GAN_BERT/dataset/..." while the load
# cells read from "dataset/..."; the two only match when the working
# directory is GAN_BERT — confirm.
save_processed("../GAN_BERT/dataset/splitedData/3c_BERT_processed_Data/1percent/Total", "../GAN_BERT/dataset/splitedData/3c_BERT_processed_Data/1percent/Split", split=True)

In [None]:
# Load the cached "1percent" arrays; `s` is later passed to SSGAN_BERT as
# `split_data`.
t, s = load_processed("dataset/splitedData/3c_BERT_processed_Data/1percent/Total", \
               "dataset/splitedData/3c_BERT_processed_Data/1percent/Split", split=True)
Xtrain, ytrain, Xdev, ydev, Xtest, ytest = t
X_train, y_train, X_unl, y_unl = s

## Define Model

### Metrics

In [None]:
class F1(tfa.metrics.F1Score):
    """F1 score accumulated only over the rows selected by `get_label_mask`.

    `y_true` and `y_pred` are expected to carry an extra leading column
    (column 0); it is used to pick the rows and is removed before the
    counts are updated.
    """

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate true/false positives and false negatives for one batch.

        - y_true, y_pred: 2-D tensors whose column 0 is the mask column.
        - sample_weight: optional per-row weights. They are filtered with
          the same row mask as `y_true` / `y_pred` so they stay aligned
          (previously unfiltered weights were multiplied with the filtered
          values).
        """
        if sample_weight is not None:
            # Must happen before y_true is filtered: the mask is built from
            # the unfiltered y_true.
            sample_weight = tf.boolean_mask(sample_weight, get_label_mask(y_true))

        y_true, y_pred = metric_mask_filter(y_true, y_pred)
        if self.threshold is None:
            # No threshold: mark the per-row maximum as the prediction,
            # ignoring values that are numerically zero.
            threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True)
            y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12)
        else:
            y_pred = y_pred > self.threshold

        y_true = tf.cast(y_true, self.dtype)
        y_pred = tf.cast(y_pred, self.dtype)

        def _weighted_sum(val, sample_weight):
            if sample_weight is not None:
                val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1))
            return tf.reduce_sum(val, axis=self.axis)

        self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight))
        self.false_positives.assign_add(
            _weighted_sum(y_pred * (1 - y_true), sample_weight)
        )
        self.false_negatives.assign_add(
            _weighted_sum((1 - y_pred) * y_true, sample_weight)
        )
        self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight))
        
        
def model_classification_loss(y_true, y_pred):
    """Mean cross-entropy over the labeled rows of a batch.

    Rows are selected and column 0 is dropped by `metric_mask_filter`; the
    remaining columns of `y_pred` are treated as logits and softmaxed.

    Returns 0 when the batch has no labeled rows. The previous
    `tf.reduce_mean` over an empty tensor produced NaN; the guard matches
    the one used for the supervised term in `SSGAN_BERT.D_Model_loss`.
    """
    y_true, y_pred = metric_mask_filter(y_true, y_pred)
    probs = tf.clip_by_value(tf.nn.softmax(y_pred, axis=-1), epsilon, 1. - epsilon)
    per_example_loss = -tf.reduce_sum(y_true * tf.math.log(probs), axis=-1)

    labeled_example_count = tf.cast(tf.size(per_example_loss), per_example_loss.dtype)
    return tf.divide(tf.reduce_sum(per_example_loss), tf.maximum(labeled_example_count, 1))

def myloss2(y_true, y_pred):
    """Squared gap between mean discriminator features of real and generated data.

    `y_true` and `y_pred` are ignored. The value is recomputed from the
    notebook-global `model`: its current real batch (`_X_train_batch`) and
    noise (`_noise`) are pushed through `model.d`, and the second output of
    the discriminator is compared.
    """
    # NOTE(review): this reads the global name `model`, not the instance
    # being compiled, so it reports on whichever object `model` points to.
    real_features = model.d(model.b(model._X_train_batch))[1]
    fake_features = model.d(model.g(model._noise))[1]
    mean_gap = tf.reduce_mean(real_features, axis=0) - tf.reduce_mean(fake_features, axis=0)
    return tf.reduce_mean(tf.math.square(mean_gap))

def myloss1(y_true, y_pred):
    """Mean of -log(1 - p0), where p0 is the softmax probability of column 0.

    `y_true` is ignored; `y_pred` is treated as logits.
    """
    class_probs = tf.nn.softmax(y_pred, axis=-1)
    not_first_class = tf.clip_by_value(1 - class_probs[:, 0], epsilon, 1. - epsilon)
    return -1 * tf.reduce_mean(tf.math.log(not_first_class))

### Optimizer

In [None]:
class custom_optimizer(tf.keras.optimizers.Adam):
    """Adam whose `clipnorm` clips by the global norm of all gradients.

    NOTE(review): relies on the Keras optimizer calling `_clip_gradients`,
    a private hook whose presence depends on the TensorFlow version.
    """

    def _clip_gradients(self, grads):
        """Clip gradients according to the clipnorm and clipvalue attributes."""

        def _refuse_under_strategy():
            # Clipping is not supported together with a distribution strategy.
            if distribute_ctx.has_strategy():
                raise ValueError("Gradient clipping in the optimizer "
                                 "(by setting clipnorm or clipvalue) is currently "
                                 "unsupported when using a distribution strategy.")

        if self.clipnorm is not None:
            _refuse_under_strategy()
            # One norm over the whole gradient list, not one per tensor.
            grads, _ = clip_ops.clip_by_global_norm(grads, self.clipnorm)
        if self.clipvalue is not None:
            _refuse_under_strategy()
            limit = self.clipvalue
            grads = [clip_ops.clip_by_value(g, -limit, limit) if g is not None else None
                     for g in grads]
        return grads
    
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule that warms up before handing over to a decay schedule."""

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps,
                 power=1.0, name=None):
        """Store the schedule parameters.

        - initial_learning_rate: rate reached at the end of the warmup.
        - decay_schedule_fn: callable `step -> lr` used once warmup is over.
        - warmup_steps: number of steps the warmup lasts.
        - power: exponent applied to the warmup progress.
        - name: optional name scope; 'WarmUp' is used when it is empty.
        """
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        """Return the learning rate for `step`."""
        with tf.name_scope(self.name or 'WarmUp') as name:
            # While step < warmup_steps the rate is
            # initial_learning_rate * (step / warmup_steps) ** power.
            step_f = tf.cast(step, tf.float32)
            warmup_f = tf.cast(self.warmup_steps, tf.float32)
            progress = step_f / warmup_f
            warmup_lr = self.initial_learning_rate * tf.math.pow(progress, self.power)
            return tf.cond(step_f < warmup_f,
                           lambda: warmup_lr,
                           lambda: self.decay_schedule_fn(step),
                           name=name)

    def get_config(self):
        """Return the constructor arguments as a dict."""
        config = {}
        config['initial_learning_rate'] = self.initial_learning_rate
        config['decay_schedule_fn'] = self.decay_schedule_fn
        config['warmup_steps'] = self.warmup_steps
        config['power'] = self.power
        config['name'] = self.name
        return config
    
def PolynomialDecay_learning_rate(init_lr, num_train_steps, num_warmup_steps):
    """Build a polynomial-decay learning-rate schedule, optionally with warmup.

    - init_lr: initial learning rate.
    - num_train_steps: number of decay steps.
    - num_warmup_steps: when truthy, the decay schedule is wrapped in
      `WarmUp` for that many steps; otherwise the plain decay is returned.
    """
    decay_schedule = tf.optimizers.schedules.PolynomialDecay(init_lr, num_train_steps)
    if not num_warmup_steps:
        return decay_schedule
    return WarmUp(initial_learning_rate=init_lr,
                  decay_schedule_fn=decay_schedule,
                  warmup_steps=num_warmup_steps)

### Model

In [None]:
class SSGAN_BERT():
    """Semi-supervised GAN built around a BERT encoder.

    Sub-models (see `create_models`):
      - `b`: BERT followed by a BiLSTM, or by the first-token vector when
        `bilstm` is False.
      - `d`: discriminator; returns [class logits, last hidden features].
      - `g`: generator; maps a noise vector to a vector fed to `d`.
      - `D_Model` = d(b(tokens)), `G_Model` = d(g(noise)).

    Label layout used by the losses and by `train`: targets have
    `num_labels` columns. Column 0 of the targets is 1 for unlabeled rows
    and 0 for labeled rows; the remaining columns hold the one-hot class.
    Column 0 of the logits is the "generated" class.
    """

    def __init__(self, path_bert_model=path_bert_model, split_data=None, adapter_size = 4, adapter_init_scale = 1e-5, \
                 closs_w=1, g_lr=4e-5, d_lr=4e-5, max_seq_len = 250, bilstm=True, latent_dim = 100, g_output_size = 500, \
                 d_hidden_size=128, g_hidden_size=500, d_num_hidden_discriminator = 1, d_drop_out = 0.3, \
                 g_num_hidden_discriminator = 3, g_drop_out = 0.3, num_labels = 4):
        """Store the hyper-parameters and build the sub-models.

        - split_data: optional (X_train, y_train, X_unl, y_unl) used by
          `train` instead of splitting its own input.
        - num_labels: width of the discriminator output (column 0 included).
        - g_output_size: must equal the output width of `b`, because `d` is
          applied to both.
        - closs_w: stored but not used by the current losses.
        """
        self.path_bert_model = path_bert_model
        self.split_data = split_data
        self.adapter_size = adapter_size
        self.adapter_init_scale = adapter_init_scale
        self.closs_w = closs_w
        self.g_lr = g_lr
        self.d_lr = d_lr
        self.max_seq_len = max_seq_len
        self.bilstm = bilstm
        self.latent_dim = latent_dim
        self.g_output_size = g_output_size
        self.d_hidden_size = d_hidden_size
        self.g_hidden_size = g_hidden_size
        self.d_num_hidden_discriminator = d_num_hidden_discriminator
        self.d_drop_out = d_drop_out
        self.g_num_hidden_discriminator = g_num_hidden_discriminator
        self.g_drop_out = g_drop_out
        self.num_labels = num_labels
        self.val_f1 = 0.0
        # Current noise / real batch; read by the loss closures.
        self._noise = None
        self._X_train_batch = None
        self.create_models()
        # Filled by load_models; compile_models then reuses these optimizers.
        self.optimizers = None

    def create_models(self):
        """Build `b`, `d`, `g` and the two composite models."""
        self.b = self.bert_model(self.max_seq_len, self.path_bert_model, self.adapter_size, self.adapter_init_scale)

        self.d = self.discriminator(self.b.output.shape[-1], self.d_hidden_size, self.d_num_hidden_discriminator, self.d_drop_out, self.num_labels)
        self.D_Model = keras.Model(self.b.input, self.d(self.b.output), name='GAN_Discriminator')

        self.g = self.generator(self.latent_dim, self.g_output_size, self.g_hidden_size, self.g_num_hidden_discriminator, self.g_drop_out)
        self.G_Model = keras.Model(self.g.input, self.d(self.g.output), name='GAN_Generator')

    def compile_models(self):
        """Compile `D_Model` and `G_Model`.

        Optimizers restored by `load_models` are reused when present;
        otherwise fresh `custom_optimizer` instances are created. The
        discriminator is marked non-trainable before `G_Model` is compiled.
        """
        # A learning-rate schedule (see PolynomialDecay_learning_rate) could
        # replace the constant rates below; it is not wired in.
        if self.optimizers:
            D_Model_optimizer = self.optimizers[self.D_Model.name]
            G_Model_optimizer = self.optimizers[self.G_Model.name]
        else:
            D_Model_optimizer = custom_optimizer(learning_rate=self.d_lr, clipnorm=1., beta_1=0.5)
            G_Model_optimizer = custom_optimizer(learning_rate=self.g_lr, clipnorm=1., beta_1=0.5)

        # F1 is computed on the class columns only, hence num_labels - 1.
        self.D_Model.compile(loss={'discriminator':self.D_Model_loss()},
                             optimizer=D_Model_optimizer,
                             metrics={'discriminator': [model_classification_loss, F1(num_classes=(self.num_labels-1), average='macro')]})

        self.d.trainable = False

        # NOTE(review): myloss2 reads the notebook-global `model`, not `self`.
        self.G_Model.compile(loss={'discriminator':self.G_Model_loss()},
                             optimizer=G_Model_optimizer,
                             metrics={'discriminator': [myloss1, myloss2]})

    def save_models(self, path):
        """Save weights (.h5) and pickled optimizers of both models under `path`."""
        os.makedirs(path, exist_ok=True)

        for model in (self.D_Model, self.G_Model):

            model.save_weights(path+'/'+model.name+'.h5')

            with open(path+'/optimizer_'+model.name, 'wb') as f:
                pickle.dump(model.optimizer, f)

    def load_models(self, path):
        """Load weights and optimizers written by `save_models`.

        The optimizers are stored in `self.optimizers`; they take effect on
        the next `compile_models` call.
        """
        self.optimizers = {}
        for model in (self.D_Model, self.G_Model):
            model.load_weights(path+'/'+model.name+'.h5')

            # NOTE: pickle.load can execute arbitrary code; only load
            # checkpoints from a trusted source.
            with open(path+'/optimizer_'+model.name, 'rb') as f:
                self.optimizers[model.name] = pickle.load(f)

    def G_Model_loss(self):
        """Return the generator loss closure.

        The loss is the sum of -log(1 - p_generated) on the generated batch
        and the squared gap between the mean discriminator features of the
        current real batch and of the current noise.
        """
        def loss (y_true, y_pred):

            fake_prob = tf.nn.softmax(y_pred, axis=-1)

            g_loss_1 = -1 * tf.reduce_mean(tf.math.log(tf.clip_by_value((1 - fake_prob[:, 0]), epsilon, 1. - epsilon)))

            g_loss_2 = tf.reduce_mean(tf.math.square(tf.reduce_mean(self.d(self.b(self._X_train_batch))[1], axis=0) -
                                           tf.reduce_mean(self.d(self.g(self._noise))[1], axis=0)))
            return g_loss_1 + g_loss_2

        return loss

    def D_Model_loss(self):
        """Return the discriminator loss closure.

        The loss is the sum of three terms: cross-entropy over labeled
        rows, -log(1 - p_generated) over the real batch, and
        -log(p_generated) over samples generated from `self._noise`.
        """
        def loss(y_true, y_pred):

            prob = tf.clip_by_value(tf.nn.softmax(y_pred, axis=-1), epsilon, 1. - epsilon)

            ### SUPERVISED ###

            logits_y_pred = y_pred[:,1:]

            log_probs= tf.clip_by_value(tf.nn.softmax(logits_y_pred, axis=-1), epsilon, 1. - epsilon)
            log_probs = tf.math.log(log_probs)

            one_hot_labels = y_true[:,1:]
            label_mask = get_label_mask(y_true)

            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            per_example_loss = tf.boolean_mask(per_example_loss, label_mask)

            # Dividing by max(count, 1) keeps the term at 0 (not NaN) when
            # the batch has no labeled row.
            labeled_example_count = tf.cast(tf.size(per_example_loss), tf.float32)
            D_L_Supervised = tf.divide(tf.reduce_sum(per_example_loss), tf.maximum(labeled_example_count, 1))

            ### UNSUPERVISED ###

            #### REAL EXAMPLES ####

            D_L_unsupervised1U = -1 * tf.reduce_mean(tf.math.log(tf.clip_by_value((1 - prob[:, 0]), epsilon, 1. - epsilon)))

            #### FAKE EXAMPLES ####

            logits_fake = self.d(self.g(self._noise))[0]
            fake_prob = tf.nn.softmax(logits_fake, axis=-1)
            fake_prob = tf.clip_by_value(fake_prob, epsilon, 1. - epsilon)

            D_L_unsupervised2U = -1 * tf.reduce_mean(tf.math.log(fake_prob[:, 0]))

            return  D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U

        return loss

    def discriminator(self, d_input_size, d_hidden_size, num_hidden_discriminator, dropout, num_labels):
        """Build the discriminator MLP.

        Returns a model named 'discriminator' with two outputs: the
        `num_labels` logits and the activations of the last hidden layer.
        """
        shared_model_input = keras.layers.Input(shape=(d_input_size,), dtype='float32', name='d_l_i')
        hidden_layer = keras.layers.Dropout(dropout, name='d_dp_i')(shared_model_input)
        for i in range(num_hidden_discriminator):
            hidden_layer = keras.layers.Dense(d_hidden_size, activation=tf.keras.layers.LeakyReLU(), name='d_l_'+str(i))(hidden_layer)
            hidden_layer = keras.layers.Dropout(dropout, name='d_dp_'+str(i))(hidden_layer)

        last_layer = keras.layers.Dense(num_labels, name='d_last')(hidden_layer)

        return keras.Model(shared_model_input, [last_layer, hidden_layer], name='discriminator')

    def generator(self, latent_dim, g_output_size, g_hidden_size, num_hidden_discriminator, dropout):
        """Build the generator MLP (noise -> tanh vector of size `g_output_size`).

        `num_hidden_discriminator` is the number of hidden layers of the
        generator, despite its name.
        """
        shared_model_input = keras.layers.Input(shape=(latent_dim,), dtype='float32', name='g_l_i')
        hidden_layer = shared_model_input
        for i in range(num_hidden_discriminator):
            hidden_layer = keras.layers.Dense(g_hidden_size, activation=tf.keras.layers.LeakyReLU(), name='g_l_'+str(i))(hidden_layer)
            hidden_layer = keras.layers.Dropout(dropout, name='g_dp_'+str(i))(hidden_layer)
        last_hidden_layer = keras.layers.Dense(g_output_size, activation='tanh', name='g_l_f')(hidden_layer)
        return keras.Model(shared_model_input, last_hidden_layer, name='generator')

    def bert_model(self, max_seq_len, path_bert_model, adapter_size, adapter_init_scale):
        """Build the encoder: BERT layer plus BiLSTM or first-token selection.

        The BERT layer is created with adapters, `apply_adapter_freeze()` is
        called on it, and the stock checkpoint `bert_model.ckpt` found in
        `path_bert_model` is loaded.
        """
        l_input_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32', name='input_layer')

        bert_params = bert.params_from_pretrained_ckpt(path_bert_model)
        bert_params.adapter_size = adapter_size
        bert_params.adapter_init_scale = adapter_init_scale
        l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

        embedded_sequences = l_bert(l_input_ids) # output: [batch_size, max_seq_len, hidden_size]

        if self.bilstm:
            drop_out_1 = tf.keras.layers.Dropout(self.d_drop_out, name='drop_out_1')(embedded_sequences)
            # 250 units per direction; the generator output size has to
            # match the resulting width.
            cls_out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(250))(drop_out_1)
        else:
            cls_out = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :], name='bert_cls')(embedded_sequences)

        model = tf.keras.Model(inputs=l_input_ids, outputs=cls_out, name='BERT')
        model.build(input_shape=(None,max_seq_len))

        l_bert.apply_adapter_freeze()
        bert_ckpt_file = os.path.join(path_bert_model, "bert_model.ckpt")
        bert.load_stock_weights(l_bert, bert_ckpt_file)
        return model


    def train_generator_by_rep(self, G_loss, D_loss, valid, rep):
        """Train the generator once, or `rep` times when G_loss > D_loss.

        Returns the averaged (loss, myloss1, myloss2) of the steps taken.
        """
        aux = div = 1
        aux_g_loss = aux_g_b_loss = aux_g_match_loss = 0
        if G_loss > D_loss:
            aux = div = rep
        while aux:
            _, g_loss, g_b_loss, g_match_loss = self.G_Model.train_on_batch(self._noise, valid)
            aux_g_loss += g_loss
            aux_g_b_loss += g_b_loss
            aux_g_match_loss += g_match_loss
            aux -= 1
        return aux_g_loss/div, aux_g_b_loss/div, aux_g_match_loss/div

    def train_generator_by_lr_update(self, G_loss, D_loss, valid, g_lr_update):
        """Train the generator once, with `g_lr_update` as rate when G_loss > D_loss.

        The learning rate is reset to `self.g_lr` afterwards.
        """
        if G_loss > D_loss:
            self.G_Model.optimizer.lr.assign(g_lr_update)

        _, g_loss, g_b_loss, g_match_loss = self.G_Model.train_on_batch(self._noise, valid)

        self.G_Model.optimizer.lr.assign(self.g_lr)

        return g_loss, g_b_loss, g_match_loss

    def train(self, X, y, epochs, batch_size=32, X_dev=None, y_dev=None, unl_ratio=0.95, g_rep=10, g_lr_update=1e-3, checkpoint=None, chck_path=None):
        """Alternate discriminator and generator updates for `epochs` epochs.

        - X, y: full training data; split into labeled/unlabeled here only
          when `split_data` was not given to the constructor.
        - X_dev, y_dev: optional dev set. When both are given, the
          discriminator is evaluated at the start of every epoch and the
          models are saved to `chck_path` whenever the dev F1 improves.
        - unl_ratio: share of (X, y) treated as unlabeled when splitting here.
        - g_rep, g_lr_update: only used by the alternative generator
          strategies (`train_generator_by_rep`,
          `train_generator_by_lr_update`), which are not called here.
        - checkpoint: directory to restore weights/optimizers from first.
        - chck_path: checkpoint directory; nothing is saved when it is None.
          The final models are saved to `chck_path + "_F"`.

        Returns None, or the offending (X_batch, y_batch) if a batch
        contains NaN.
        """
        #---------------
        # initialize variables
        #---------------

        val_f1 = val_c_loss = D_loss = C_loss = D_f1 = G_loss = G_match_loss = G_b_loss = 0.0

        #---------------
        # compile models
        #---------------
        print("Compilando modelos...", end="\r")

        if checkpoint:
            self.load_models(checkpoint)

        self.compile_models()

        #---------------
        # real unlabeled data
        #---------------

        print("Configurando lotes de datos...", end="\r")

        if self.split_data is None:
            # Two splits are generated and the loop keeps the last one.
            sss = StratifiedShuffleSplit(n_splits=2, test_size=unl_ratio, random_state=0)
            for train_index, test_index in sss.split(X, y):
                X_train, X_unl = X[train_index], X[test_index]
                y_train, y_unl = y[train_index], y[test_index]
        else:
            X_train, y_train, X_unl, y_unl = self.split_data

        # Unlabeled rows: column 0 set to 1, class columns all zero.
        y_unl = np.zeros((y_unl.shape[0], self.num_labels))
        y_unl[:,0] = 1
        # Labeled rows: a zero column 0 in front of the one-hot class.
        y_train = np.concatenate((np.zeros((y_train.shape[0], 1)), y_train), axis=-1)

        X_ttrain = np.concatenate((X_train, X_unl), axis=0)
        y_ttrain = np.concatenate((y_train, y_unl), axis=0)

        has_dev = X_dev is not None and y_dev is not None
        if has_dev:
            # Same layout as the labeled training targets; built once.
            y_dev_ = np.concatenate((np.zeros((y_dev.shape[0], 1)), y_dev), axis=-1)

        #---------------
        # training epochs
        #---------------
        print("Evaluando métricas iniciales...", end="\r")

        for epoch in range(epochs):

            _d_loss = _c_loss = _d_f1 = _g_loss = _g_b_loss = _g_match_loss = 0.0

            X_batches, y_batches = StratifiedBatches([X_ttrain, y_ttrain], batch_size)
            steps_4_epoch = len(X_batches)

            if has_dev:
                # The discriminator loss reads self._noise; give it noise
                # matching the size of the dev set before evaluating.
                self._noise = np.random.uniform(0, 1, (X_dev.shape[0], self.latent_dim))
                _, _, val_c_loss, val_f1 = self.D_Model.evaluate(X_dev, y_dev_, verbose=0)
                if val_f1 > self.val_f1:
                    self.val_f1 = val_f1
                    if chck_path is not None:
                        self.save_models(chck_path)

            for batch in range(0, steps_4_epoch):

                if np.isnan(np.sum(X_batches[batch])) or np.isnan(np.sum(y_batches[batch])):
                    print("nan exists in this batch")
                    return X_batches[batch], y_batches[batch]

                self._noise = np.random.uniform(0, 1, (len(X_batches[batch]), self.latent_dim))
                self._X_train_batch = X_batches[batch]

                # ---------------------
                #  Train Discriminator with bert D_MODEL
                # ---------------------

                _, d_loss, c_loss, d_f1 = self.D_Model.train_on_batch(np.array(X_batches[batch]), np.array(y_batches[batch]))

                # ---------------------
                #  Train Generator G_MODEL
                # ---------------------

                # The generator loss ignores its targets; zeros of the right
                # shape are passed as a placeholder.
                valid = np.zeros((X_batches[batch].shape[0], self.num_labels))

                _, g_loss, g_b_loss, g_match_loss = self.G_Model.train_on_batch(self._noise, valid)

                # ---------------------
                #  Calc and Plot Metrics
                # ---------------------

                _d_loss += d_loss
                _c_loss += c_loss
                _d_f1 += d_f1
                _g_loss += g_loss
                _g_b_loss += g_b_loss
                _g_match_loss += g_match_loss

                # Running means over the batches seen so far in this epoch.
                D_loss = _d_loss/(batch+1)
                C_loss = _c_loss/(batch+1)
                D_f1 = _d_f1/(batch+1)
                G_loss = _g_loss/(batch+1)
                G_b_loss = _g_b_loss/(batch+1)
                G_match_loss = _g_match_loss/(batch+1)

                # Plot the progress
                log = "Epoch:"+str(epoch+1)+" Batch:("+str(batch+1)+"/"+str(steps_4_epoch)+")"
                log = log+" [loss: "+"{:.5f}".format(C_loss)+ ", macroF1: "+"{:.5f}".format(D_f1)+"]"
                log = log+" [D loss: "+"{:.5f}".format(D_loss)+", G loss: "+"{:.5f}".format(G_loss)+"]"
                log = log+" [G b: "+"{:.5f}".format(G_b_loss)+", G match: " + "{:.5f}".format(G_match_loss) + "]"
                if has_dev:
                    log = log+" [val_loss: "+"{:.5f}".format(val_c_loss)+ ", val_macroF1: "+"{:.5f}".format(val_f1)+"]"
                print(log, end="\r")

            print("")

        if chck_path is not None:
            self.save_models(chck_path+"_F")
            

## Instantiate and train model

In [None]:
# Build the model on the cached labeled/unlabeled split `s`.
# The name `model` matters: the metric function myloss2 reads this global.
model = SSGAN_BERT(path_bert_model=path_bert_model, split_data=s, closs_w=1)

In [None]:
# Train with dev evaluation. Models are saved to `chck_path` whenever the
# dev macro-F1 improves, and to `chck_path + "_F"` after the last epoch.
# Because `split_data` was given to the constructor, Xtrain/ytrain are not
# re-split inside train().
model.train(Xtrain, ytrain, X_dev=Xdev, y_dev=ydev, epochs=400, batch_size=200, chck_path='./model/T128_1percent_beta1_05') # macro 0.63053

## Evaluate model on test dataset

In [None]:
# D_Model has two outputs; y_pred[0] holds the logits. Column 0 is dropped
# before the argmax so predictions range over the real classes only.
y_pred = model.D_Model.predict(Xtest)
y_test = ytest.argmax(axis=1)
result = y_pred[0][:,1:].argmax(axis=1)
print(classification_report(y_test, result, digits=4))

## Load last saved model

In [None]:
# Rebuild the architecture and restore the checkpoint written to `chck_path`
# during training (saved each time the dev macro-F1 improved). The model is
# not compiled here; the next cell only calls predict() on it.
last_model = SSGAN_BERT(path_bert_model=path_bert_model, split_data=s, closs_w=1)
last_model.load_models('./model/T128_1percent_beta1_05')

In [None]:
# Same evaluation as for `model`, run on the restored checkpoint: take the
# logits output, drop column 0, argmax over the remaining columns.
y_pred = last_model.D_Model.predict(Xtest)
y_test = ytest.argmax(axis=1)
result = y_pred[0][:,1:].argmax(axis=1)
print(classification_report(y_test, result, digits=4))