In [None]:
!pip install transformers
!pip install kaggle
!pip install wandb

In [None]:
# Make /content the working directory for the download and setup cells below.
%cd /content/

/content


In [None]:
import wandb
from google.colab import drive, files

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Open Colab's upload dialog; the uploaded file lands in the current directory.
# NOTE(review): presumably this is where kaggle.json is supplied for the next cell - confirm.
files.upload()

# Authenticate this session with Weights & Biases.
wandb.login()

In [None]:
# Copy kaggle.json from the current directory into ~/.kaggle (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the alzoqm/ai-hub-encode-dataset archive from Kaggle into the current directory.
!kaggle datasets download -d alzoqm/ai-hub-encode-dataset

Downloading ai-hub-encode-dataset.zip to /content
100% 3.06G/3.07G [00:33<00:00, 82.2MB/s]
100% 3.07G/3.07G [00:33<00:00, 98.6MB/s]


In [None]:
# Extract the downloaded archive into /content/dataset.
!unzip /content/ai-hub-encode-dataset.zip -d /content/dataset

In [None]:
# Remove the archive; its contents were extracted to /content/dataset by the previous cell.
!rm /content/ai-hub-encode-dataset.zip

In [None]:
import numpy as np
import tensorflow as tf
import tqdm
import transformers
import random
import os
import tensorflow.distribute as tfd

from tqdm import tqdm

In [None]:
# Return to /content before loading the tokenizer.
%cd /content/

/content


In [None]:
# Build the tokenizer from the project's vocabulary file on Drive.
# Lower-casing and accent stripping are both disabled.
vocab = transformers.BertTokenizer(
    '/content/drive/MyDrive/ColabNotebooks/project/ELECTRA_with_AIHUB_dataset/vocab.txt',
    do_lower_case=False,
    strip_accents=False,
)

In [None]:
# Switch to the project folder on Drive.
# NOTE(review): presumably so the project modules imported in the next cell resolve from the working directory - confirm.
%cd /content/drive/MyDrive/ColabNotebooks/project/ELECTRA_with_AIHUB_dataset

/content/drive/MyDrive/ColabNotebooks/project/ELECTRA_with_AIHUB_dataset


In [None]:
import electra_model as electra
import make_pretrain_dataset_step2 as dataset_fn

In [None]:
# Model hyperparameters are set to match ELECTRA-Small++.
config = {
    # Shared input settings
    'num_classes': 2,
    'max_len': 512,
    'seg_type': 2,
    'vocab_size': 32200,
    # Generator
    'gen_num_layers': 12,
    'gen_dff': 1024,
    'gen_d_model': 256,
    'gen_emb_size': 128,
    'gen_num_heads': 4,
    'gen_dropout': 0.1,
    # Discriminator
    'dis_num_layers': 12,
    'dis_dff': 1024,
    'dis_d_model': 256,
    'dis_emb_size': 128,
    'dis_num_heads': 4,
    'dis_dropout': 0.1,
    'dis_lambda': 50,
    # Optimisation
    'batch_size': 512,
    'lr': 5e-4,
    'epoch': 10,
}

# Base-size parameters (disabled; uncommenting this block replaces the configuration above)
# config = {}
# config['num_classes'] = 2
# config['max_len'] = 512
# config['seg_type'] = 2
# config['vocab_size'] = 32200
# config['gen_num_layers'] = 12
# config['gen_dff'] = 1024
# config['gen_d_model'] = 256
# config['gen_emb_size'] = 768
# config['gen_num_heads'] = 4
# config['gen_dropout'] = 0.1
# config['dis_num_layers'] = 12
# config['dis_dff'] = 768 * 4
# config['dis_d_model'] = 768
# config['dis_emb_size'] = 768
# config['dis_num_heads'] = 12
# config['dis_dropout'] = 0.1
# config['dis_lambda'] = 50
# config['batch_size'] = 16
# config['lr'] = 3e-5
# config['epoch'] = 10

In [None]:
# Address of the TPU attached to this Colab runtime, read from the COLAB_TPU_ADDR environment variable.
tpu_address = f"grpc://{os.environ['COLAB_TPU_ADDR']}"
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)

# Connect to the TPU cluster and initialise it before building the strategy.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

# Strategy used to build the model and run the distributed training steps.
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
# Folders of the two dataset parts extracted under /content/dataset.
n_path, r_path = '/content/dataset/next_pred', '/content/dataset/random_pred'

# File lists for both folders, as produced by the project's dataset helper.
n_path_list, r_path_list = dataset_fn.make_json_list(n_path, r_path)

In [None]:
# Folder on Google Drive that holds the pre-training checkpoints.
weight_dir = '/content/drive/MyDrive/ColabNotebooks/project/ELECTRA_with_AIHUB_dataset/weight_folder'
# os.mkdir (not os.makedirs) on purpose: it raises when the parent folder is missing,
# e.g. when Drive is not mounted, instead of silently creating a local directory tree.
if not os.path.isdir(weight_dir):
  os.mkdir(weight_dir)

# Checkpoint files: full pre-training model, generator only, discriminator only.
save_path = os.path.join(weight_dir, 'save_small_pretrain.h5')
gen_save_path = os.path.join(weight_dir, 'gen_save_small_pretrain.h5')
dis_save_path = os.path.join(weight_dir, 'dis_save_small_pretrain.h5')

with strategy.scope():
  # Model and optimizer are created under the bfloat16 mixed-precision policy.
  tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
  model = electra.ElectraPretrain(config)
  optimizer = tf.keras.optimizers.Adam(learning_rate=config['lr'])

  # Run one dummy forward pass so the model's variables are created before
  # summary() is called. The dummy sequence length follows config['max_len']
  # so it stays in sync with the configured input length.
  x = tf.reshape(tf.range(config['max_len']), shape=(-1, config['max_len']))
  test_batch_size = x.shape[0]
  seg = tf.ones_like(x)
  label_cls = tf.ones(shape=(test_batch_size, ))
  total_loss, gen_loss, sampling, dis_loss = model([x, seg, label_cls, x])
  model.summary()
  # Uncomment to resume from the last saved checkpoint.
  #model.load_weights(save_path)

Model: "electra_pretrain"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 generator (Generator)       multiple                  9575936   
                                                                 
 discriminator (Discriminato  multiple                 13763456  
 r)                                                              
                                                                 
 embedding (Embedding)       multiple                  4121600   
                                                                 
 embedding_1 (Embedding)     multiple                  256       
                                                                 
 embedding_2 (Embedding)     multiple                  65664     
                                                                 
 dense_148 (Dense)           multiple                  512       
                                                  

In [None]:
# Display the active global mixed-precision policy (set to 'mixed_bfloat16' when the model was built).
tf.keras.mixed_precision.global_policy()

<Policy "mixed_bfloat16">

In [None]:
# Scratch check: value of a 2e-4 learning rate after ten multiplicative decays by exp(-0.04).
decay_factor = tf.math.exp(-0.04)
lr = 2e-4
for _ in range(10):
  lr *= decay_factor
print(lr)

tf.Tensor(0.000134064, shape=(), dtype=float32)


In [None]:
def train_step(inputs):
  """Run one forward/backward pass and optimizer update on a single replica.

  Args:
    inputs: 4-tuple ``(sentences, segments, labels_cls, labels_lm)`` that is
      passed to the model as a list in that order.

  Returns:
    Tuple ``(total_loss, gen_loss, sampling, dis_loss)`` where the three losses
    are reduced to their mean and ``sampling`` is returned as produced by the model.

  NOTE(review): under a tf.distribute strategy gradients are aggregated across
  replicas; whether the loss returned by the model is already scaled for that
  cannot be seen from here - confirm against electra_model.
  """
  sentences, segments, labels_cls, labels_lm = inputs
  model_inputs = [sentences, segments, labels_cls, labels_lm]

  # Record the forward pass so the total loss can be differentiated.
  with tf.GradientTape() as tape:
    total_loss, gen_loss, sampling, dis_loss = model(model_inputs, training=True)

  trainable = model.trainable_variables
  grads = tape.gradient(total_loss, trainable)
  optimizer.apply_gradients(zip(grads, trainable))

  mean_total = tf.reduce_mean(total_loss)
  mean_gen = tf.reduce_mean(gen_loss)
  mean_dis = tf.reduce_mean(dis_loss)
  return mean_total, mean_gen, sampling, mean_dis

@tf.function
def distributed_train_step(inputs):
  """Run `train_step` on every replica and average the resulting losses.

  Args:
    inputs: one distributed batch, forwarded unchanged to `train_step`.

  Returns:
    Tuple ``(total_loss, gen_loss, dis_loss)``, each the mean over replicas.
    The sampling output of `train_step` is discarded.
  """
  per_replica_outputs = strategy.run(train_step, args=(inputs, ))
  per_total_loss, per_gen_loss, _, per_dis_loss = per_replica_outputs
  return tuple(
      strategy.reduce(tfd.ReduceOp.MEAN, per_replica_loss, axis=None)
      for per_replica_loss in (per_total_loss, per_gen_loss, per_dis_loss)
  )

# Folder on Google Drive that records the shuffled file order of the epoch in progress.
index_dir = '/content/drive/MyDrive/ColabNotebooks/project/ELECTRA_with_AIHUB_dataset/save_index_folder'
# os.mkdir (not os.makedirs) on purpose: it raises when the parent folder is missing,
# e.g. when Drive is not mounted, instead of silently creating a local directory tree.
if not os.path.isdir(index_dir):
  os.mkdir(index_dir)

# Progress is printed and a checkpoint is written whenever
# step % CHECKPOINT_EVERY_STEPS == CHECKPOINT_STEP_OFFSET.
CHECKPOINT_EVERY_STEPS = 1000
CHECKPOINT_STEP_OFFSET = 99


def _write_index(file_name, path_list):
  """Write the entries of `path_list` to `file_name` in `index_dir`, one per line.

  `writelines` adds no separators itself, so a newline is appended to every
  entry; entries are assumed not to end with one already.
  """
  with open(os.path.join(index_dir, file_name), 'w') as index_file:
    index_file.writelines(f"{entry}\n" for entry in path_list)


def _save_checkpoint():
  """Save the full pre-training weights plus the generator and discriminator."""
  model.save_weights(save_path, overwrite=True)
  model.generator.save(gen_save_path)
  model.discriminator.save(dis_save_path)


def _loss_summary(total, gen, dis, n_batches):
  """Format the running mean losses; safe to call before any batch has run."""
  denom = max(n_batches, 1)  # avoids ZeroDivisionError when n_batches == 0
  return f"total_loss: {total/denom}, gen_loss: {gen/denom}, dis_loss: {dis/denom}"


def _merge_samples(pending, new_arrays):
  """Append the arrays of one file to the samples still waiting for a full batch.

  Args:
    pending: tuple of arrays carried over from earlier steps, or None.
    new_arrays: tuple of array-likes returned by `dataset_fn.make_dataset`.

  Returns:
    Tuple of arrays concatenated along axis 0, or `pending` unchanged when the
    new file contributes no samples.
  """
  # np.asarray keeps slicing and concatenation uniform; np.concatenate already
  # produced NumPy arrays whenever files were merged.
  new_arrays = tuple(np.asarray(arr) for arr in new_arrays)
  if len(new_arrays[0]) == 0:
    return pending
  if pending is None:
    return new_arrays
  return tuple(np.concatenate([kept, new], axis=0) for kept, new in zip(pending, new_arrays))


batch_size = config['batch_size']

for epoch in range(config['epoch']):
  total_losses = 0.
  gen_losses = 0.
  dis_losses = 0.
  one_batch = 0  # number of training batches run so far in this epoch

  # Re-list and reshuffle the files every epoch, and record the order on Drive.
  n_path_list, r_path_list = dataset_fn.make_json_list(n_path, r_path)
  random.shuffle(n_path_list)
  random.shuffle(r_path_list)
  _write_index('n_index.txt', n_path_list)
  _write_index('r_index.txt', r_path_list)

  # Only as many steps as the shorter of the two lists allows.
  index_len = min(len(n_path_list), len(r_path_list))

  # Samples not yet consumed by a full batch. They are carried over to the next
  # step instead of being dropped; only the remainder left at the end of the
  # epoch (fewer than batch_size samples) is never trained on.
  pending = None

  for step in tqdm(range(index_len), desc=f"epoch: {epoch+1}"):
    per_sentences, per_segments, per_labels_cls, per_labels_lm = dataset_fn.make_dataset(
        step, step, n_path_list, r_path_list, n_path, r_path, config['max_len'], 0)
    pending = _merge_samples(pending, (per_sentences, per_segments, per_labels_cls, per_labels_lm))

    n_full_batches = 0 if pending is None else len(pending[0]) // batch_size
    for batch_idx in range(n_full_batches):
      start = batch_idx * batch_size
      # NOTE(review): a tf.data pipeline is rebuilt for every single batch, as before;
      # a single generator-backed dataset would avoid that overhead.
      dataset = tf.data.Dataset.from_tensor_slices(
          tuple(arr[start:start + batch_size] for arr in pending))
      dataset = dataset.cache()
      dataset = dataset.shuffle(batch_size)
      dataset = dataset.batch(batch_size)
      dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
      dataset = strategy.experimental_distribute_dataset(dataset)

      for distributed_batch in dataset:
        # The returned losses are already reduced to scalars across replicas.
        total_loss, gen_loss, dis_loss = distributed_train_step(distributed_batch)
        one_batch += 1
        total_losses += total_loss
        gen_losses += gen_loss
        dis_losses += dis_loss

    if n_full_batches:
      # Keep only the samples that did not fit into a full batch.
      consumed = n_full_batches * batch_size
      pending = tuple(arr[consumed:] for arr in pending)

    if step % CHECKPOINT_EVERY_STEPS == CHECKPOINT_STEP_OFFSET:
      print(f"epoch: {epoch+1}, step: {step}, {_loss_summary(total_losses, gen_losses, dis_losses, one_batch)}")
      _save_checkpoint()
      # Learning-rate decay is currently disabled:
      #optimizer.learning_rate = optimizer.learning_rate * tf.math.exp(-0.04)

  print(f"epoch: {epoch+1}, {_loss_summary(total_losses, gen_losses, dis_losses, one_batch)}")
  print("one epoch end")
  _save_checkpoint()