In [1]:
# Force synchronous CUDA kernel launches so that torch errors are reported at the
# operation that actually failed (must be set before CUDA is initialised).
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import random
import time
import datetime


import torch
import torch.nn as nn
from torchsummary import summary
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_constant_schedule_with_warmup, BertForSequenceClassification

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler, RandomSampler



import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from skmultilearn.skmultilearn.model_selection.iterative_stratification import iterative_train_test_split, IterativeStratification
from sklearn.metrics import classification_report, average_precision_score, f1_score

# Utils

In [3]:
class EarlyStopper:
    """Patience-based early stopping on a monitored validation metric.

    Args:
        patience: number of non-improving calls (beyond the tolerance) after
            which :meth:`early_stop` returns ``True``.
        min_delta: tolerance around the best value seen so far. A value that is
            worse than the best by less than ``min_delta`` neither resets nor
            increments the patience counter.
        is_loss: ``True`` when lower is better (a loss), ``False`` when higher
            is better (a score).
    """

    def __init__(self, patience=4, min_delta=0,is_loss=True):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.is_loss = is_loss
        # Attribute name kept for backward compatibility: it holds the best value
        # seen so far (lowest loss, or highest score when is_loss is False).
        self.min_validation_loss = np.inf if is_loss else -np.inf

    def early_stop(self, validation_loss):
        """Record a new metric value and report whether training should stop.

        Returns:
            ``True`` once ``patience`` calls have been worse than the best value
            by at least ``min_delta``; ``False`` otherwise.
        """
        best = self.min_validation_loss
        if self.is_loss:
            improved = validation_loss < best
            degraded = validation_loss >= (best + self.min_delta)
            message = "NEW LOWEST LOSS "
        else:
            improved = validation_loss > best
            # The tolerance must be subtracted for a higher-is-better metric;
            # adding it made the condition always true and ignored min_delta.
            degraded = validation_loss <= (best - self.min_delta)
            message = "NEW HIGHEST SCORE "

        if improved:
            self.min_validation_loss = validation_loss
            self.counter = 0
            print(message,self.min_validation_loss)
        elif degraded:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [4]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Hiperparâmetros

In [5]:
# Seed every RNG used by the notebook (python, numpy, torch CPU and CUDA).
seed_val = 2023
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)
    device = torch.device("cuda:0")
else:
    # `device` is used unconditionally by later cells, so it must exist on
    # CPU-only hosts as well.
    device = torch.device("cpu")

In [6]:
# --- Optimisation ---
learning_rate = 5e-5        # base learning rate handed to the optimizers
num_train_epochs = 20       # number of passes of the training loop
batch_size = 16             # batch size used for every dataloader
apply_scheduler = True      # whether the warmup schedulers are created and stepped
warmup_proportion = 0.2     # fraction of the total training steps used for warmup

# --- Tokenisation ---
max_length = 128            # sequences are padded/truncated to this many tokens

# --- Logging ---
print_each_n_step = 100     # progress line is printed every N training steps

# Dados

## Tarefa primária

In [7]:
def process_data(df):
    """Build the primary-task label frame from a raw annotation frame.

    Steps, in order:
      1. merge the three efficiency/functionality columns into a single flag;
      2. drop the columns that are not used as targets;
      3. add a 'Neutro' column that is set when no remaining label is active
         (every column after the first one is summed);
      4. turn booleans into 0/1 and reset the index.

    The input frame is left untouched; a new frame is returned.
    """
    merged_col = 'Alteração na eficiência/funcionalidade'
    dropped_cols = ["Postagem com possível perfil depressivo","Alteração na eficiência",
         "Alteração da funcionalidade","*",'Agitação/inquietação','Sintoma obsessivo e compulsivo','Déficit de atenção/Memória',
              'Perda/Diminuição do prazer/ Perda/Diminuição da libido']

    # Work on a copy: assigning a column on the argument would silently change
    # the caller's frame.
    df = df.copy()
    df[merged_col] = (df[merged_col] == 1) | (df["Alteração da funcionalidade"] == 1) | (df["Alteração na eficiência"] == 1)
    df = df.drop(dropped_cols,axis=1)
    # Column 0 is skipped here; the rest of the notebook treats it as the text column.
    df['Neutro'] =  df.iloc[:,1:].sum(axis=1) == 0
    df = df.replace({True:1,False:0})

    df = df.reset_index(drop=True)
    return df

In [8]:
# Read the train/test CSVs of the primary task (the first CSV column becomes the
# index) and run each through process_data to obtain the final label columns.
train_df = process_data(pd.read_csv("data/segredos_sentenças_multitask_train_clean.csv",index_col=0))
test_df = process_data(pd.read_csv("data/segredos_sentenças_multitask_test_clean.csv",index_col=0))


In [9]:

# Every column after the first one is a primary-task label
# (presumably column 0 is the text column — confirm against the CSV).
symptom_num = train_df.iloc[:,1:].shape[1]
target_names_primary = list(train_df.iloc[:,1:].columns)

## Tarefa auxiliar

In [10]:
# Directory holding the tab-separated train/dev/test splits of the auxiliary task.
go_emotions_path = "data/goemotions"

auxiliary_train = pd.read_csv(f"{go_emotions_path}/train.tsv",sep='\t')
auxiliary_val = pd.read_csv(f"{go_emotions_path}/dev.tsv",sep='\t')

auxiliary_test = pd.read_csv(f"{go_emotions_path}/test.tsv",sep='\t')


In [11]:
def change_label_encoding(row):
    """Convert a row's comma-separated label ids into a multi-label mapping.

    ``row['labels']`` is a string such as ``"3, 27"``. The result maps every
    emotion name in the module-level ``emotion_dict`` to ``True`` when its id
    appears in that string and ``False`` otherwise.
    """
    active_ids = row['labels'].replace(" ","").split(",")
    return {
        emotion: str(emotion_id) in active_ids
        for emotion, emotion_id in emotion_dict.items()
    }

In [12]:
# Emotion name -> integer id used in the `labels` column of the auxiliary data.
# Ids are simply the position of each name in the ordered list below.
emotion_dict = {
    emotion_name: emotion_id
    for emotion_id, emotion_name in enumerate((
        "admiração", "diversão", "raiva", "aborrecimento",
        "aprovação", "zelo", "confusão", "curiosidade",
        "desejo", "decepção", "desaprovação", "nojo",
        "constrangimento", "entusiasmo", "medo", "gratidão",
        "luto", "alegria", "amor", "nervosismo",
        "otimismo", "orgulho", "percepção", "alívio",
        "remorso", "tristeza", "surpresa", "neutro",
    ))
}


In [13]:
# Re-encode each split row by row with change_label_encoding and collect the
# resulting dicts into a frame with one boolean column per emotion.
emotion_labels_test = pd.DataFrame(list(auxiliary_test.apply(change_label_encoding,axis=1)))
emotion_labels_train = pd.DataFrame(list(auxiliary_train.apply(change_label_encoding,axis=1)))
emotion_labels_val = pd.DataFrame(list(auxiliary_val.apply(change_label_encoding,axis=1)))


In [14]:
emotion_num = len(emotion_dict)
# Target names must be the emotion label columns that feed the auxiliary
# dataloader, mirroring target_names_primary. The raw TSV columns
# (auxiliary_train.columns) are not the label names.
target_names_auxiliary = list(emotion_labels_train.columns)

# Modelo

In [15]:
class Mixup(torch.nn.Module):
    """Mixup applied to a batch of representations.

    In training mode the batch is linearly interpolated with a random
    permutation of itself; in eval mode the input is returned unchanged.

    Args:
        mixup_alpha: parameter of the symmetric Beta distribution from which the
            interpolation coefficient is drawn. A value <= 0 disables mixing
            (the coefficient is fixed to 1).
    """

    def __init__(self,mixup_alpha=1):
        super(Mixup,self).__init__()
        self.mixup_alpha = mixup_alpha

    def mixup(self,batch_ids,batch_labels,alpha=1):
        '''Returns mixed inputs, pairs of targets, and lambda'''
        if alpha > 0:
            lam = np.random.beta(alpha, alpha)
        else:
            lam = 1

        batch_size = batch_ids.size()[0]

        # Draw the permutation on CPU (same RNG stream as before) and move it to
        # wherever the batch lives, instead of hard-coding `.cuda()`.
        index = torch.randperm(batch_size).to(batch_ids.device)

        mixed_x = lam * batch_ids + (1 - lam) * batch_ids[index, :]
        y_a, y_b = batch_labels, batch_labels[index]
        return mixed_x, y_a, y_b, lam

    def forward(self,outputs,labels):
        """Return ``(mixed, labels_a, labels_b, lam)`` when training, else ``outputs``."""
        if self.training:
            outputs, labels_a, labels_b, lamb = self.mixup(outputs,labels,self.mixup_alpha)
            return outputs,labels_a,labels_b,lamb
        return outputs


In [16]:
class BertSoftSharingLayer(nn.Module):
    """One encoder layer per task whose outputs are linearly mixed across tasks.

    Each task keeps its own encoder layer. After both layers run, every task's
    output becomes ``alpha * own_output + beta * other_task_output`` with
    learnable scalars (alpha starts at 1, beta at 0, i.e. no sharing).

    The mixing currently supports exactly two tasks: index 0 (primary) and
    index 1 (auxiliary).

    Args:
        encoders: list with one encoder layer per task. Each is called as
            ``encoder(hidden, attention_mask=...)`` and must return a sequence
            whose first element is the hidden-state tensor.
    """

    def __init__(self,encoders):
        super(BertSoftSharingLayer,self).__init__()

        self.encoders = nn.ModuleList(encoders)
        self.n_tasks = len(encoders)

        # Learnable mixing weights: alpha scales the task's own stream, beta the
        # other task's stream.
        self.alpha_prim = nn.Parameter(torch.ones(1,requires_grad=True))
        self.beta_prim = nn.Parameter(torch.zeros(1,requires_grad=True))

        self.alpha_aux = nn.Parameter(torch.ones(1,requires_grad=True))
        self.beta_aux = nn.Parameter(torch.zeros(1,requires_grad=True))

        # Per-task lookup (same Parameter objects as the attributes above).
        self.alphas = [self.alpha_prim,self.alpha_aux]
        self.betas = [self.beta_prim,self.beta_aux]

    def prepare_for_task(self,task_idx):
        """Make only task ``task_idx``'s encoder and mixing weights trainable."""
        for n in range(self.n_tasks):
            trainable = (n == task_idx)
            for param in self.encoders[n].parameters():
                param.requires_grad = trainable
            # Toggle the mixing weights once per task, not once per encoder
            # parameter as before.
            self.alphas[n].requires_grad = trainable
            self.betas[n].requires_grad = trainable

    def forward(self,input_arr,attention_mask):
        """Run each task's encoder on its own stream and mix the results.

        Args:
            input_arr: indexable by task; ``input_arr[i]`` is the hidden state
                fed to encoder ``i``.
            attention_mask: mask forwarded unchanged to every encoder.

        Returns:
            The per-task outputs stacked along a new leading task dimension.
        """
        outputs = []
        for i,encoder in enumerate(self.encoders):
            outputs.append(encoder(input_arr[i],attention_mask = attention_mask)[0])

        # Keep the un-mixed outputs: the auxiliary mix must use the raw primary
        # stream, not the primary stream that has already been mixed.
        prim_raw, aux_raw = outputs[0], outputs[1]
        outputs[0] = prim_raw * self.alpha_prim + aux_raw * self.beta_prim
        outputs[1] = aux_raw * self.alpha_aux + prim_raw * self.beta_aux

        return torch.stack(outputs)
        
        

In [17]:
class BertSharedParametersModel(nn.Module):
    """Multi-task model with one BERT per task, coupled by soft-sharing layers.

    The embeddings, pooler, dropout and classifier of each task are taken from
    the given models; the encoder layers at the same depth are wrapped together
    in a ``BertSoftSharingLayer``.

    Args:
        models: one model per task, each exposing ``bert.embeddings``,
            ``bert.encoder.layer``, ``bert.pooler``, ``dropout`` and
            ``classifier``.
        attention_n: number of encoder layers (assumed equal for all models).
        hidden_size: currently unused.
        num_labels: currently unused.
        dropout_rate: dropout probability applied before each classifier.
        mixup_alphas: one mixup alpha per task (not mutated).
        mixup_layer: 1-based index of the sharing layer after which mixup is
            applied to the current task's stream during training.
    """

    def __init__(self,models,attention_n,hidden_size,num_labels,dropout_rate=0.1,mixup_alphas=[1,0],mixup_layer=12):
        super(BertSharedParametersModel,self).__init__()
        self.n_tasks = len(models)
        self.embeddings = nn.ModuleList([model.bert.embeddings for model in models])
        soft_sharing_layers = []
        # Assumes every model has the same number of attention layers.
        for i in range(attention_n):
            attention_layers = [model.bert.encoder.layer[i] for model in models]
            soft_sharing_layers.append(BertSoftSharingLayer(attention_layers))
        self.soft_sharing_layers = nn.ModuleList(soft_sharing_layers)
        self.poolers = nn.ModuleList([model.bert.pooler for model in models])
        self.dropouts = nn.ModuleList([model.dropout for model in models])
        self.classification_heads = nn.ModuleList([model.classifier for model in models])
        self.mixups = nn.ModuleList([Mixup(alpha) for alpha in mixup_alphas])
        self.mixup_layer = mixup_layer
        # Set the rate on each dropout module; assigning `.p` on the ModuleList
        # itself had no effect.
        for dropout in self.dropouts:
            dropout.p = dropout_rate

    def prepare_for_task(self,task_idx):
        """Make only the sub-modules of task ``task_idx`` trainable."""
        for n in range(self.n_tasks):
            trainable = (n == task_idx)

            for param in self.embeddings[n].parameters():
                param.requires_grad = trainable

            for param in self.poolers[n].parameters():
                param.requires_grad = trainable

            # Previously a typo (`param_requires_grad`) left the heads untouched.
            for param in self.classification_heads[n].parameters():
                param.requires_grad = trainable

            if trainable:
                for sharing_layer in self.soft_sharing_layers:
                    sharing_layer.prepare_for_task(task_idx)

    def forward(self,input_arr,attention_mask,labels,task_idx):
        """Forward a batch through the branch of task ``task_idx``.

        Args:
            input_arr: token ids, fed to every task's embedding layer.
            attention_mask: mask indexed as ``[:, None, None, :]`` below
                (presumably (batch, seq) with 1 for real tokens — confirm).
            labels: targets of the batch; only used by mixup during training.
            task_idx: index of the task the batch belongs to.

        Returns:
            ``(logits, labels_a, labels_b, lamb)`` in training mode, otherwise
            just ``logits``.
        """
        self.prepare_for_task(task_idx)

        # Turn the mask into an additive one: 0 where the mask is 1, the most
        # negative float where it is 0.
        extended_attention_mask = attention_mask[:, None, None, :]
        extended_attention_mask = extended_attention_mask.to(dtype=torch.float)
        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(torch.float).min

        outputs = torch.stack([embedding(input_arr) for embedding in self.embeddings])

        # Defaults equivalent to "no mixup", so the training-mode return is
        # well-defined even if `mixup_layer` never matches a layer index.
        labels_a, labels_b, lamb = labels, labels, 1

        for i,sharing_layer in enumerate(self.soft_sharing_layers):
            outputs = sharing_layer(outputs,attention_mask=extended_attention_mask)
            if i == (self.mixup_layer-1) and self.training:
                outputs_cur_task,labels_a,labels_b,lamb = self.mixups[task_idx](outputs[task_idx],labels)
                outputs[task_idx] = outputs_cur_task

        outputs = self.poolers[task_idx](outputs[task_idx])
        outputs = self.dropouts[task_idx](outputs)
        outputs = self.classification_heads[task_idx](outputs)

        if self.training:
            return outputs,labels_a,labels_b,lamb

        return outputs


In [18]:
#combined_model.prepare_for_task(5)
#for param in combined_model.soft_sharing_layers[2].encoders[0].parameters():
#    print(param.requires_grad)

In [19]:
# Checkpoints for the two task models and the tokenizer. The commented-out lines
# are alternative local checkpoints; the active ones all point at the same hub model.
#primary_model_path = "models/BERT baselines"
primary_model_path = "neuralmind/bert-base-portuguese-cased"
#auxiliary_model_path = "models/go_emotions"
auxiliary_model_path = "neuralmind/bert-base-portuguese-cased"

tokenizer_path = "neuralmind/bert-base-portuguese-cased"

In [20]:
# One sequence-classification model per task; the head size is the number of
# labels of that task.
primary_model = BertForSequenceClassification.from_pretrained(primary_model_path,num_labels=symptom_num)
auxiliary_model = BertForSequenceClassification.from_pretrained(auxiliary_model_path,num_labels=emotion_num)

# Configs are loaded separately; a later cell reads num_hidden_layers / hidden_size from them.
primary_config = AutoConfig.from_pretrained(primary_model_path)
auxiliary_config = AutoConfig.from_pretrained(auxiliary_model_path)

# A single tokenizer is used for the texts of both tasks.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [21]:
# The soft-sharing layers pair encoder layers by depth, so both models must have
# the same number of hidden layers.
assert primary_config.num_hidden_layers == auxiliary_config.num_hidden_layers
# Task order matters: index 0 is the primary task, index 1 the auxiliary task.
combined_model = BertSharedParametersModel([primary_model,auxiliary_model],primary_config.num_hidden_layers,
                                          primary_config.hidden_size,[symptom_num,emotion_num])

In [22]:
# Print a per-layer parameter-count summary of the combined model.
# NOTE(review): input_size=(768,) does not obviously match what forward() expects — confirm.
summary(combined_model,input_size=(768,),depth=5,batch_dim=2, dtypes=[torch.IntTensor]) 

Layer (type:depth-idx)                                  Param #
├─ModuleList: 1-1                                       --
|    └─BertEmbeddings: 2-1                              --
|    |    └─Embedding: 3-1                              22,881,792
|    |    └─Embedding: 3-2                              393,216
|    |    └─Embedding: 3-3                              1,536
|    |    └─LayerNorm: 3-4                              1,536
|    |    └─Dropout: 3-5                                --
|    └─BertEmbeddings: 2-2                              --
|    |    └─Embedding: 3-6                              22,881,792
|    |    └─Embedding: 3-7                              393,216
|    |    └─Embedding: 3-8                              1,536
|    |    └─LayerNorm: 3-9                              1,536
|    |    └─Dropout: 3-10                               --
├─ModuleList: 1-2                                       --
|    └─BertSoftSharingLayer: 2-3                        --
|    |    └─M

Layer (type:depth-idx)                                  Param #
├─ModuleList: 1-1                                       --
|    └─BertEmbeddings: 2-1                              --
|    |    └─Embedding: 3-1                              22,881,792
|    |    └─Embedding: 3-2                              393,216
|    |    └─Embedding: 3-3                              1,536
|    |    └─LayerNorm: 3-4                              1,536
|    |    └─Dropout: 3-5                                --
|    └─BertEmbeddings: 2-2                              --
|    |    └─Embedding: 3-6                              22,881,792
|    |    └─Embedding: 3-7                              393,216
|    |    └─Embedding: 3-8                              1,536
|    |    └─LayerNorm: 3-9                              1,536
|    |    └─Dropout: 3-10                               --
├─ModuleList: 1-2                                       --
|    └─BertSoftSharingLayer: 2-3                        --
|    |    └─M

# Dataloaders

In [23]:
from torch.utils.data.dataset import ConcatDataset

def get_dataloader(texts,labels,idx,batch_size,shuffle):
    """Tokenize ``texts`` and wrap them, with their labels, in a DataLoader.

    Args:
        texts: iterable of strings to encode with the module-level ``tokenizer``
            (padded/truncated to the module-level ``max_length``).
        labels: object with ``to_numpy()`` (e.g. a DataFrame) holding one row of
            targets per text; converted to a float32 tensor.
        idx: task index; stored in a tensor with the same shape as the labels so
            every batch carries the task it belongs to.
        batch_size: batch size of the returned loader.
        shuffle: whether the loader shuffles the samples.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask, labels, task_idx)``.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            return_attention_mask=True,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        for text in texts
    ]

    ids_tensor = torch.tensor([enc['input_ids'] for enc in encodings])
    mask_tensor = torch.tensor([enc['attention_mask'] for enc in encodings])
    label_tensor = torch.tensor(labels.to_numpy(),dtype=torch.float32)
    # Same shape as the labels, filled with the task index (default float dtype).
    task_tensor = torch.zeros(label_tensor.size())
    task_tensor.fill_(idx)

    samples = TensorDataset(ids_tensor,mask_tensor,label_tensor,task_tensor)
    return torch.utils.data.DataLoader(dataset=samples,batch_size=batch_size,shuffle=shuffle)
    

In [24]:
# Training loaders: task index 0 = primary task, task index 1 = auxiliary task.
prim_train_dataloader = get_dataloader(train_df.text,train_df.iloc[:,1:],0,batch_size,shuffle=True)
aux_train_dataloader = get_dataloader(auxiliary_train.text,emotion_labels_train,1,batch_size,shuffle=True)
train_dataloaders = [prim_train_dataloader,aux_train_dataloader]

# Evaluation loaders. Note that the auxiliary "test" loader is built from the
# validation (dev) split, not from auxiliary_test.
# NOTE(review): shuffle=True is not needed for evaluation — confirm it is intentional.
prim_test_dataloader = get_dataloader(test_df.text,test_df.iloc[:,1:],0,batch_size,shuffle=True)
aux_test_dataloader = get_dataloader(auxiliary_val.text,emotion_labels_val,1,batch_size,shuffle=True)
test_dataloaders = [prim_test_dataloader,aux_test_dataloader]

# Treinamento

In [25]:
def focal_loss(p,targets,gamma=3,alpha=0.8):
    """Mean focal loss for probabilities ``p`` against binary ``targets``.

    Each element's binary cross-entropy is scaled by a class weight (``alpha``
    for targets equal to 1, ``1 - alpha`` for targets equal to 0) and by
    ``(1 - p_t) ** gamma``, where ``p_t`` is the probability assigned to the
    true class.
    """
    elementwise_bce = torch.nn.functional.binary_cross_entropy(p,targets,reduction='none')
    # Probability given to the correct class of every element.
    p_true = torch.where(targets == 1,p,1-p)
    class_weight = targets * alpha + (1-targets) * (1 - alpha)
    focal_term = (1 - p_true) ** gamma
    weighted = elementwise_bce * class_weight * focal_term
    return weighted.mean()

In [26]:
def naive_combination(p_loss,a_loss,alpha=0.5):
    """Weighted sum of the primary and auxiliary losses.

    ``alpha`` is the weight of the primary loss; the auxiliary loss gets
    ``1 - alpha``.
    """
    primary_part = p_loss * alpha
    auxiliary_part = a_loss * (1-alpha)
    return primary_part + auxiliary_part

In [27]:
def mixup_criterion(criterion_params_a,criterion_params_b,lamb,criterion):
    """Interpolate a criterion between the two label sets produced by mixup.

    ``criterion`` is called once with each argument tuple; the results are
    combined as ``lamb * loss_a + (1 - lamb) * loss_b``.
    """
    loss_a = criterion(*criterion_params_a)
    loss_b = criterion(*criterion_params_b)
    return loss_a * lamb + loss_b * (1-lamb)

In [28]:
# Move the combined model to the device selected in the seed/device cell.
combined_model = combined_model.to(device)

In [None]:
def _is_mixing_weight(param_name):
    """True for the soft-sharing scalars (alpha_prim, beta_prim, alpha_aux, beta_aux)."""
    return param_name.rsplit(".", 1)[-1].startswith(("alpha_", "beta_"))


# Two parameter groups: the BERT weights use the base learning rate, while the
# soft-sharing mixing weights use a much larger one. The beta_* weights are part
# of the mixing group too; matching only on "alpha" left them in the slow group.
model_vars = [param for name, param in combined_model.named_parameters() if not _is_mixing_weight(name)]
alpha_vars = [param for name, param in combined_model.named_parameters() if _is_mixing_weight(name)]


# One optimizer per task, both covering the same parameters; which parameters
# actually receive gradients is decided by the model's prepare_for_task.
prim_optimizer = torch.optim.AdamW([{"params":model_vars},{"params":alpha_vars,"lr":0.01}],lr=learning_rate)
aux_optimizer = torch.optim.AdamW([{"params":model_vars},{"params":alpha_vars,"lr":0.01}],lr=learning_rate)


if apply_scheduler:
    # Both tasks take one step per auxiliary batch, so the step budget follows
    # the auxiliary loader.
    num_train_steps = int(len(aux_train_dataloader) * num_train_epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)

    prim_scheduler = get_constant_schedule_with_warmup(prim_optimizer,
                                           num_warmup_steps = num_warmup_steps)

    aux_scheduler = get_constant_schedule_with_warmup(aux_optimizer,
                                           num_warmup_steps = num_warmup_steps)


sigmoid = torch.sigmoid

loss_func = focal_loss
multitask_loss = naive_combination

# Best evaluation results seen so far, per task.
best_loss_prim = np.inf

best_auc_prim = 0

best_loss_aux = np.inf

best_auc_aux = 0


def calculate_loss_batch(batch,model,criterion):
    """Run one batch through ``model`` and compute its loss.

    Args:
        batch: ``(input_ids, attention_mask, labels, task_idx)`` as produced by
            the dataloaders; the task index is read from the first element of
            the task tensor.
        model: called as ``model(input_ids, attention_mask, labels, task_idx)``.
        criterion: called as ``criterion(probabilities, labels)``.

    Returns:
        In training mode, the mixup-interpolated loss only. In eval mode, the
        tuple ``(loss, logits, labels)``.
    """
    input_ids, input_masks, labels = (batch[pos].to(device) for pos in range(3))
    task_idx = int(batch[3][0][0].item())

    if not model.training:
        logits = model(input_ids,input_masks,labels,task_idx)
        return criterion(sigmoid(logits),labels), logits,labels

    # Training: the model also returns both mixup label sets and the mixing coefficient.
    logits,labels_a,labels_b,lamb = model(input_ids,input_masks,labels,task_idx)
    return mixup_criterion(
        (sigmoid(logits),labels_a),
        (sigmoid(logits),labels_b),
        lamb,
        criterion=criterion,
    )
    
    
    
    
    
    



# Alternating multi-task training: at every step one primary batch and one
# auxiliary batch are processed, each with its own optimizer. After every epoch
# both tasks are evaluated.
for epoch_i in range(0,num_train_epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')
    t0 = time.time()

    # Running sums of the per-batch losses of this epoch (plain floats).
    tr_loss = 0
    pr_loss = 0
    aux_loss = 0

    combined_model.train()
    prim_iter = iter(prim_train_dataloader)
    aux_iter = iter(aux_train_dataloader)

    # Assumes the auxiliary loader is the longer one: the epoch follows it and
    # the primary loader is restarted whenever it runs out.
    for step,aux_batch in enumerate(aux_iter):
        if step % print_each_n_step == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(aux_train_dataloader), elapsed))
        try:
            prim_batch = next(prim_iter)
        except StopIteration:
            prim_iter = iter(prim_train_dataloader)
            prim_batch = next(prim_iter)

        # --- primary task step ---
        prim_loss = calculate_loss_batch(prim_batch,combined_model,loss_func)

        prim_optimizer.zero_grad()
        prim_loss.backward()
        prim_optimizer.step()

        if apply_scheduler:
            prim_scheduler.step()

        # --- auxiliary task step ---
        # Kept in its own name so the epoch accumulator `aux_loss` is not
        # overwritten by the batch tensor.
        aux_batch_loss = calculate_loss_batch(aux_batch,combined_model,loss_func)

        aux_optimizer.zero_grad()
        aux_batch_loss.backward()
        aux_optimizer.step()

        if apply_scheduler:
            aux_scheduler.step()

        with torch.no_grad():
            batch_loss = multitask_loss(prim_loss,aux_batch_loss)

            tr_loss += batch_loss.item()
            pr_loss += prim_loss.item()
            aux_loss += aux_batch_loss.item()

    # One primary and one auxiliary batch are processed per auxiliary step, so
    # all three averages share the same denominator.
    avg_prim_loss = pr_loss/len(aux_train_dataloader)
    avg_train_loss = tr_loss/len(aux_train_dataloader)
    avg_aux_loss = aux_loss/len(aux_train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {:}".format(avg_train_loss))
    print("  Average primary loss: {:}".format(avg_prim_loss))
    print("  Average aux loss: {:}".format(avg_aux_loss))

    print("  Training epoch took: {:}".format(training_time))

    # ---------------- primary task evaluation ----------------
    print("  Testing primary...")
    combined_model.eval()

    pr_loss = 0

    all_probs = []
    all_labels_ids = []

    for prim_batch in prim_test_dataloader:
        with torch.no_grad():
            prim_loss,outputs,labels = calculate_loss_batch(prim_batch,combined_model,loss_func)
            probs = sigmoid(outputs)
            pr_loss += prim_loss.item()

        # Extending with a 2-D tensor appends one row (sample) at a time.
        all_probs.extend(probs.detach().cpu())
        all_labels_ids.extend(labels.detach().cpu())

    all_probs = torch.stack(all_probs).numpy()
    all_labels_ids = torch.stack(all_labels_ids).numpy()

    # The printed "AUC" is sklearn's average precision score.
    cur_auc = average_precision_score(all_labels_ids,all_probs)

    avg_prim_test_loss = pr_loss/len(prim_test_dataloader)

    if cur_auc > best_auc_prim:
        best_auc_prim = cur_auc

    if avg_prim_test_loss < best_loss_prim:
        # Update the tracked best; writing to an unrelated `best_loss` made this
        # branch fire on every epoch.
        best_loss_prim = avg_prim_test_loss
        best_probs_prim = all_probs
        best_labels_prim = all_labels_ids

    print("  AUC primária:",cur_auc)
    print("  Average test primary loss: {:}".format(avg_prim_test_loss))

    # ---------------- auxiliary task evaluation ----------------
    print("  Testing auxiliary...")

    aux_loss = 0

    all_probs = []
    all_labels_ids = []

    for aux_batch in aux_test_dataloader:
        with torch.no_grad():
            aux_batch_loss,outputs,labels = calculate_loss_batch(aux_batch,combined_model,loss_func)
            probs = sigmoid(outputs)
            aux_loss += aux_batch_loss.item()

        all_probs.extend(probs.detach().cpu())
        all_labels_ids.extend(labels.detach().cpu())

    all_probs = torch.stack(all_probs).numpy()
    all_labels_ids = torch.stack(all_labels_ids).numpy()

    cur_auc = average_precision_score(all_labels_ids,all_probs)

    # Average over the auxiliary loader (previously divided by the primary one).
    avg_aux_test_loss = aux_loss/len(aux_test_dataloader)

    if cur_auc > best_auc_aux:
        best_auc_aux = cur_auc

    if avg_aux_test_loss < best_loss_aux:
        best_loss_aux = avg_aux_test_loss
        best_probs_aux = all_probs
        best_labels_aux = all_labels_ids

    print("  AUC auxiliar:",cur_auc)
    avg_test_loss = multitask_loss(avg_prim_test_loss,avg_aux_test_loss)

    print("  Average test loss: {:}".format(avg_test_loss))
    print("  Average test aux loss: {:}".format(avg_aux_test_loss))
    
    
    


Training...
  Batch   100  of  2,714.    Elapsed: 0:00:54.
  Batch   200  of  2,714.    Elapsed: 0:01:48.
  Batch   300  of  2,714.    Elapsed: 0:02:42.
  Batch   400  of  2,714.    Elapsed: 0:03:36.
  Batch   500  of  2,714.    Elapsed: 0:04:30.
  Batch   600  of  2,714.    Elapsed: 0:05:25.
  Batch   700  of  2,714.    Elapsed: 0:06:19.
  Batch   800  of  2,714.    Elapsed: 0:07:14.
  Batch   900  of  2,714.    Elapsed: 0:08:09.
  Batch 1,000  of  2,714.    Elapsed: 0:09:03.
  Batch 1,100  of  2,714.    Elapsed: 0:09:58.
  Batch 1,200  of  2,714.    Elapsed: 0:10:52.
  Batch 1,300  of  2,714.    Elapsed: 0:11:47.
  Batch 1,400  of  2,714.    Elapsed: 0:12:42.
  Batch 1,500  of  2,714.    Elapsed: 0:13:36.
  Batch 1,600  of  2,714.    Elapsed: 0:14:31.
  Batch 1,700  of  2,714.    Elapsed: 0:15:26.
  Batch 1,800  of  2,714.    Elapsed: 0:16:21.
  Batch 1,900  of  2,714.    Elapsed: 0:17:15.
  Batch 2,000  of  2,714.    Elapsed: 0:18:10.
  Batch 2,100  of  2,714.    Elapsed: 0:19:05.


In [None]:
# Show the learned mixing weights of every soft-sharing layer. The layers expose
# alpha_prim / beta_prim / alpha_aux / beta_aux; there is no `alpha` attribute.
for layer_idx, sharing_layer in enumerate(combined_model.soft_sharing_layers):
    print(
        layer_idx,
        "alpha_prim:", sharing_layer.alpha_prim.item(),
        "beta_prim:", sharing_layer.beta_prim.item(),
        "alpha_aux:", sharing_layer.alpha_aux.item(),
        "beta_aux:", sharing_layer.beta_aux.item(),
    )