Só conferindo se o gerador/discriminador do GANBERT tá fazendo o que devia

In [2]:
import torch
import io
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import time
import math
import datetime
import torch.nn as nn
from transformers import *
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install sentencepiece

##Set random values
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(seed_val)

In [3]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060


In [4]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [5]:
# number of hidden layers in the generator, 
# each of the size of the output space
num_hidden_layers_g = 2; 
# number of hidden layers in the discriminator, 
# each of the size of the input space
num_hidden_layers_d = 1; 
# size of the generator's input noisy vectors
noise_size = 100
# dropout to be applied to discriminator's input vectors
out_dropout_rate = 0.2
#--------------------------------
#  Optimization parameters
#--------------------------------
learning_rate_discriminator = 5e-5
learning_rate_generator = 5e-5
epsilon = 1e-8
num_train_epochs = 5
multi_gpu = True
# Scheduler
apply_scheduler = False
warmup_proportion = 0.1
# Print
print_each_n_step = 100

batch_size = 32
max_length = 128

model_name =  "neuralmind/bert-base-portuguese-cased"

In [6]:
unlabeled = pd.read_csv("data/lrec2020.csv",sep=";").head(10000)

In [7]:
transformer = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/config.json from cache at /home/augusto/.cache/huggingface/transformers/e716e2151985ba669e7197b64cdde2552acee146494d40ffaf0688a3f152e6ed.18a0b8b86f3ebd4c8a1d8d6199178feae9971ff5420f1d12f0ed8326ffdff716
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_to

In [8]:
import re
temp = []
for example in unlabeled.Text:
  clean_tweet = re.sub("@[A-Za-z0-9_]+","", example)
  clean_tweet = re.sub("#[A-Za-z0-9_]+","", clean_tweet)
  clean_tweet = re.sub("https:[A-Za-z0-9_/.]+","", clean_tweet)
  clean_tweet = re.sub("http:[A-Za-z0-9_/.]+","", clean_tweet)
  temp.append(clean_tweet)
unlabeled_examples = temp
del temp

In [9]:
len(unlabeled_examples)

10000

In [10]:
series = pd.Series(unlabeled_examples)
train_examples = series.sample(frac=0.8,random_state=10)
test_examples = series.iloc[~series.index.isin(train_examples.index)]

train_examples = list(train_examples)
test_examples = list(test_examples)

In [11]:
def generate_dataloader(examples,tokenizer,batch_size,do_shuffle=False):
    data = []
    input_ids = []
    attention_masks = []
    
    for i,text in enumerate(examples):
        if i % 100000 == 0:
            print(i," de ",len(examples))
        encoded = tokenizer.encode_plus(text,return_attention_mask=True,add_special_tokens=True,max_length = max_length,
    padding="max_length",truncation=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    
    labels = torch.ones(len(examples))
    
    dataset = torch.utils.data.TensorDataset(input_ids,attention_masks,labels)
    
    if do_shuffle:
        sampler = torch.utils.data.RandomSampler
    else:
        sampler = torch.utils.data.SequentialSampler
    return torch.utils.data.DataLoader(
        dataset = dataset,
        sampler = sampler(dataset),
        batch_size = batch_size
    )
    

In [12]:
train_dataloader = generate_dataloader(unlabeled_examples,tokenizer,batch_size)
#test_dataloader = generate_dataloader(test_examples,tokenizer,batch_size)

0  de  10000


In [13]:
class Generator(nn.Module):
    def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
        super(Generator, self).__init__()
        layers = []
        hidden_sizes = [noise_size] + hidden_sizes
        for i in range(len(hidden_sizes)-1):
            layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])

        layers.append(nn.Linear(hidden_sizes[-1],output_size))
        self.layers = nn.Sequential(*layers)

    def forward(self, noise):
        output_rep = self.layers(noise)
        return output_rep
    

class Discriminator(nn.Module):
    def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.1):
        super(Discriminator, self).__init__()
        self.input_dropout = nn.Dropout(p=dropout_rate)
        layers = []
        hidden_sizes = [input_size] + hidden_sizes
        for i in range(len(hidden_sizes)-1):
            layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])

        self.layers = nn.Sequential(*layers) #per il flatten
        self.logit = nn.Linear(hidden_sizes[-1],num_labels) 
        self.sigmoid = torch.sigmoid

    def forward(self, input_rep):
        input_rep = self.input_dropout(input_rep)
        last_rep = self.layers(input_rep)
        logits = self.logit(last_rep)
        probs = self.sigmoid(logits)
        return last_rep, logits, probs

In [14]:
# The config file is required to get the dimension of the vector produced by 
# the underlying transformer
config = AutoConfig.from_pretrained(model_name)
hidden_size = int(config.hidden_size)
# Define the number and width of hidden layers
hidden_levels_g = [hidden_size for i in range(0, num_hidden_layers_g)]
hidden_levels_d = [hidden_size for i in range(0, num_hidden_layers_d)]

#-------------------------------------------------
#   Instantiate the Generator and Discriminator
#-------------------------------------------------
generator = Generator(noise_size=noise_size, output_size=hidden_size, hidden_sizes=hidden_levels_g, dropout_rate=out_dropout_rate)
discriminator = Discriminator(input_size=hidden_size, hidden_sizes=hidden_levels_d, num_labels=1, dropout_rate=out_dropout_rate)

# Put everything in the GPU if available
if torch.cuda.is_available():    
  generator.cuda()
  discriminator.cuda()
  transformer.cuda()
  if multi_gpu:
    transformer = torch.nn.DataParallel(transformer)


loading configuration file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/config.json from cache at /home/augusto/.cache/huggingface/transformers/e716e2151985ba669e7197b64cdde2552acee146494d40ffaf0688a3f152e6ed.18a0b8b86f3ebd4c8a1d8d6199178feae9971ff5420f1d12f0ed8326ffdff716
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_to

In [15]:
transformer_vars = [i for i in transformer.parameters()]
d_vars = transformer_vars + [v for v in discriminator.parameters()]
g_vars = [v for v in generator.parameters()]

#optimizer
dis_optimizer = torch.optim.AdamW(d_vars, lr=learning_rate_discriminator)
gen_optimizer = torch.optim.AdamW(g_vars, lr=learning_rate_generator) 

#scheduler
if apply_scheduler:
  num_train_examples = len(train_examples)
  num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)

  scheduler_d = get_constant_schedule_with_warmup(dis_optimizer, 
                                           num_warmup_steps = num_warmup_steps)
  scheduler_g = get_constant_schedule_with_warmup(gen_optimizer, 
                                           num_warmup_steps = num_warmup_steps)

In [16]:
training_stats = []

for epoch_i in range(0,num_train_epochs):
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()
    
    tr_g_loss = 0
    tr_d_loss = 0
    
    transformer.train()
    generator.train()
    discriminator.train()
    
    for step,batch in enumerate(train_dataloader):
        
        if step % print_each_n_step == 0 and not step == 0:
            
            elapsed = format_time(time.time() - t0)
            
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        real_batch_size = b_input_ids.shape[0]
        
        model_outputs = transformer(b_input_ids,attention_mask=b_input_mask)
        hidden_states = model_outputs[-1]
        
        
        noise = torch.zeros(real_batch_size, noise_size, device=device).uniform_(0,1)
        gen_rep = generator(noise)
        
        discriminator_input = torch.cat([hidden_states,gen_rep],dim=0)
        
        features, logits, probs = discriminator(discriminator_input)
        
        features_list = torch.split(features, real_batch_size)
        D_real_features = features_list[0]
        D_fake_features = features_list[1]
      
        logits_list = torch.split(logits, real_batch_size)
        D_real_logits = logits_list[0]
        D_fake_logits = logits_list[1]
        
        probs_list = torch.split(probs, real_batch_size)
        D_real_probs = probs_list[0]
        D_fake_probs = probs_list[1]        
        
        g_feat_reg = torch.mean(torch.pow(torch.mean(D_real_features,dim=0) - torch.mean(D_fake_features,dim=0),2))
        #g_feat_reg_alt = torch.mean(torch.pow(torch.mean(D_real_features - D_fake_features),2))
        g_loss_d = -1 * torch.mean(torch.log(1- D_fake_probs + epsilon))
        g_loss = g_loss_d + g_feat_reg
        
        

        D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs + epsilon))
        D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs + epsilon))
        d_loss = D_L_unsupervised1U + D_L_unsupervised2U
        
        
        
        gen_optimizer.zero_grad()
        dis_optimizer.zero_grad()
        
        g_loss.backward(retain_graph=True)
        d_loss.backward()
        
        gen_optimizer.step()
        dis_optimizer.step()
        
        tr_g_loss += g_loss.item()
        tr_d_loss += d_loss.item()
        
        if apply_scheduler:
            scheduler_d.step()
            scheduler_g.step()
    avg_train_loss_g = tr_g_loss / len(train_dataloader)
    avg_train_loss_d = tr_d_loss / len(train_dataloader)
    
    training_time = format_time(time.time() - t0)
    
    print("")
    print("  Average training loss generator: {0:.3f}".format(avg_train_loss_g))
    print("  Average training loss discriminator: {0:.3f}".format(avg_train_loss_d))
    
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss generator': avg_train_loss_g,
            'Training Loss discriminator': avg_train_loss_d            
        }
    )


Training...
  Batch   100  of    313.    Elapsed: 0:01:06.
  Batch   200  of    313.    Elapsed: 0:02:12.
  Batch   300  of    313.    Elapsed: 0:03:18.

  Average training loss generator: 0.715
  Average training loss discriminator: 0.751

Training...
  Batch   100  of    313.    Elapsed: 0:01:06.
  Batch   200  of    313.    Elapsed: 0:02:12.
  Batch   300  of    313.    Elapsed: 0:03:19.

  Average training loss generator: 0.710
  Average training loss discriminator: 0.722

Training...
  Batch   100  of    313.    Elapsed: 0:01:06.
  Batch   200  of    313.    Elapsed: 0:02:13.
  Batch   300  of    313.    Elapsed: 0:03:21.

  Average training loss generator: 0.707
  Average training loss discriminator: 0.715

Training...
  Batch   100  of    313.    Elapsed: 0:01:09.
  Batch   200  of    313.    Elapsed: 0:02:18.
  Batch   300  of    313.    Elapsed: 0:03:24.

  Average training loss generator: 0.705
  Average training loss discriminator: 0.710

Training...
  Batch   100  of    31

In [None]:
#conclusion: working as intended but might be reeeeeal slow about it.