# **Set-Up**

In [None]:
# Links Referred

# 1: LSTM: https://www.kaggle.com/code/madz2000/nlp-using-glove-embeddings-99-87-accuracy
# 2: BERT and RoBERTa: https://www.kaggle.com/code/jaskaransingh/fake-news-classification-bert-roberta
# 3: Data: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# `google.colab` is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Display Properties
# 'all' makes IPython echo the value of every top-level expression in a cell,
# not only the last one, so bare calls in later cells also show their results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# Set Working Directory
%cd drive/MyDrive/DLProject

/content/drive/MyDrive/DLProject


In [None]:
# Install Transformers
!pip install transformers

In [None]:
# General Imports
import string
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

# Torch Imports
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [None]:
# BERT and RoBERTa Imports
from transformers import (BertForSequenceClassification,
                          BertTokenizer,
                          RobertaForSequenceClassification,
                          RobertaTokenizer,
                          AdamW)

In [None]:
# Pick the compute device: the CUDA GPU when one is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
# Load each CSV and attach its class label: 1 = real news, 0 = fake news.
true_df = pd.read_csv("data/True.csv").assign(target=1)
false_df = pd.read_csv("data/Fake.csv").assign(target=0)

# Stack real rows on top of fake rows; each frame keeps its own row labels,
# so index labels repeat in the combined frame.
news_df = pd.concat([true_df, false_df], axis=0)

In [None]:
# Build the filter vocabulary: NLTK's English stop words plus every
# character of `string.punctuation`.
nltk.download('stopwords')
punctuation = list(string.punctuation)
stop_words = set(stopwords.words('english')).union(punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Data Cleaning Functions
def strip_html(text: str):
  """Parse `text` with BeautifulSoup's 'html.parser' and return only its text content."""
  return BeautifulSoup(text, 'html.parser').get_text()

# Remove Stop Words
def remove_stop_words(text: str):
  """Drop whitespace-separated tokens whose lowercase form is in `stop_words`.

  The surviving tokens are re-joined with single spaces, so any run of
  whitespace (including newlines) in the input collapses to one space.
  """
  kept_tokens = []
  for token in text.split():
    if token.lower() in stop_words:
      continue
    kept_tokens.append(token)
  return " ".join(kept_tokens)

# Remove Bias
def remove_bias(text: str):
  """Strip the leading agency dateline (e.g. "WASHINGTON (Reuters)") from `text`.

  Everything from the start of the string up to and including the FIRST
  "(Reuters)" on the first line is removed. Text without that marker is
  returned unchanged.

  Args:
    text: The string to clean.

  Returns:
    `text` without the leading dateline. Characters that followed the
    marker (including a leading space) are kept as they are.
  """
  # Non-greedy `.*?` stops at the first "(Reuters)". A greedy `.*` would run to
  # the last occurrence and delete all article text in between.
  # Raw string so `\(` is a regex escape rather than an invalid string escape.
  pattern = r'^.*?\(Reuters\)'
  return re.sub(pattern, '', text)

# Function encapsulating all Data Cleaning Functions
def clean_text(text):
  """Apply the cleaning steps in order: strip_html, remove_stop_words, remove_bias."""
  for cleaning_step in (strip_html, remove_stop_words, remove_bias):
    text = cleaning_step(text)
  return text

In [None]:
# Clean the article body and the headline; results go to `cleaned_text` / `cleaned_title`.
for raw_column in ('text', 'title'):
  news_df['cleaned_' + raw_column] = news_df[raw_column].apply(clean_text)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

In [None]:
# Append Title with Text
# news_df['full_text'] = news_df['cleaned_title'] + " " + news_df['cleaned_text']

In [None]:
# Model input: only the cleaned title is used here (the article text is not appended)
news_df['full_text'] = news_df['cleaned_title']

In [None]:
# BERT Model
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",         # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=2,                # The number of output labels (2) for binary classification.
    output_attentions=False,     # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# BERT Tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Move the model to the `device` selected earlier (GPU when available, else CPU).
# An unconditional `.cuda()` raises on a CPU-only runtime.
bert_model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# RoBERTa Model
roberta_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",              # 12-layer, 768-hidden, 12-heads, 125M parameters RoBERTa using the BERT-base architecture
    num_labels=2,                # The number of output labels (2) for binary classification.
    output_attentions=False,     # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# RoBERTa Tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Move the model to the `device` selected earlier (GPU when available, else CPU).
# An unconditional `.cuda()` raises on a CPU-only runtime.
roberta_model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
# Take the first row by position. `news_df` has repeated index labels (it is a
# concat of two frames), so label-based `[0]` returns several rows; `.iloc[0]`
# selects exactly one and does not depend on how the index is laid out.
sample_text = news_df['full_text'].iloc[0]

# Print the original sentence.
print(' Original: ', sample_text)

# Print the sentence split into BERT tokens, then mapped to token ids.
sample_bert_tokens = bert_tokenizer.tokenize(sample_text)
print('Tokenized BERT: ', sample_bert_tokens)
print('Token IDs BERT: ', bert_tokenizer.convert_tokens_to_ids(sample_bert_tokens))

# Print the sentence split into RoBERTa tokens, then mapped to token ids.
sample_roberta_tokens = roberta_tokenizer.tokenize(sample_text)
print('Tokenized RoBERT: ', sample_roberta_tokens)
print('Token IDs RoBERTa: ', roberta_tokenizer.convert_tokens_to_ids(sample_roberta_tokens))

 Original:  U.S. budget fight looms, Republicans flip fiscal script
Tokenized BERT:  ['u', '.', 's', '.', 'budget', 'fight', 'lo', '##oms', ',', 'republicans', 'flip', 'fiscal', 'script']
Token IDs BERT:  [1057, 1012, 1055, 1012, 5166, 2954, 8840, 22225, 1010, 10643, 11238, 10807, 5896]
Tokenized RoBERT:  ['U', '.', 'S', '.', 'Ġbudget', 'Ġfight', 'Ġlooms', ',', 'ĠRepublicans', 'Ġflip', 'Ġfiscal', 'Ġscript']
Token IDs RoBERTa:  [791, 4, 104, 4, 1229, 1032, 26165, 6, 1858, 11113, 2358, 8543]


In [None]:
# Find the longest encoded sentence (special tokens included) for each tokenizer.
max_len_bert, max_len_roberta = 0, 0

for sentence in news_df.full_text.values:
    # Encode once per tokenizer; the tokenizer adds its special tokens.
    input_ids_bert = bert_tokenizer.encode(sentence, add_special_tokens=True)
    input_ids_roberta = roberta_tokenizer.encode(sentence, add_special_tokens=True)

    # Keep the running maximum for each tokenizer.
    if len(input_ids_bert) > max_len_bert:
        max_len_bert = len(input_ids_bert)
    if len(input_ids_roberta) > max_len_roberta:
        max_len_roberta = len(input_ids_roberta)

print('Max sentence length BERT: ', max_len_bert)
print('Max sentence length RoBERTa: ', max_len_roberta)

Max sentence length BERT:  59
Max sentence length RoBERTa:  74


In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
MAX_SEQ_LEN = 120  # Every sentence is padded / truncated to this many tokens.

# Settings shared by both tokenizers. With these, `encode_plus` will:
#   (1) Tokenize the sentence.
#   (2) Add the special start/end tokens ([CLS]/[SEP] for BERT, <s>/</s> for RoBERTa).
#   (3) Map tokens to their IDs.
#   (4) Pad or truncate the sentence to `max_length`.
#   (5) Create the attention mask that separates real tokens from padding.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` states explicitly the 'longest_first' strategy that was
# previously applied implicitly (with a warning for every call).
encode_kwargs = dict(
    add_special_tokens=True,
    max_length=MAX_SEQ_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return pytorch tensors.
)

bert_input_ids = []
bert_attention_masks = []
roberta_input_ids = []
roberta_attention_masks = []

# For every sentence...
for sent in news_df.full_text.values:
    bert_encoded_dict = bert_tokenizer.encode_plus(sent, **encode_kwargs)
    roberta_encoded_dict = roberta_tokenizer.encode_plus(sent, **encode_kwargs)

    # Add the encoded sentence to the list.
    bert_input_ids.append(bert_encoded_dict['input_ids'])
    roberta_input_ids.append(roberta_encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    bert_attention_masks.append(bert_encoded_dict['attention_mask'])
    roberta_attention_masks.append(roberta_encoded_dict['attention_mask'])

# Convert the lists of (1, MAX_SEQ_LEN) tensors into (n_sentences, MAX_SEQ_LEN) tensors.
bert_input_ids = torch.cat(bert_input_ids, dim=0)
bert_attention_masks = torch.cat(bert_attention_masks, dim=0)

roberta_input_ids = torch.cat(roberta_input_ids, dim=0)
roberta_attention_masks = torch.cat(roberta_attention_masks, dim=0)

labels = torch.tensor(news_df.target.values)

# Positional id (0, 1, 2, ...) for every sentence, in row order.
sentence_ids = torch.arange(len(labels))

# Print sentence 1 (the second row), now as a list of IDs.
print('Original: ', news_df.full_text.values[1])
print('Token IDs BERT:', bert_input_ids[1])
print('Token IDs RoBERTa:', roberta_input_ids[1])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  U.S. military accept transgender recruits Monday: Pentagon
Token IDs BERT: tensor([  101,  1057,  1012,  1055,  1012,  2510,  5138, 16824, 15024,  6928,
         1024, 20864,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,

In [None]:
from torch.utils.data import TensorDataset, random_split
# Seed PyTorch's global random number generator
torch.manual_seed(0)

# Combine the training inputs into a TensorDataset.
# The BERT dataset carries a leading sentence-id tensor (4 tensors per sample);
# the RoBERTa dataset does not (3 tensors per sample).
bert_dataset = TensorDataset(sentence_ids, bert_input_ids, bert_attention_masks, labels)
roberta_dataset = TensorDataset(roberta_input_ids, roberta_attention_masks, labels)

<torch._C.Generator at 0x7f8507b540b0>

In [None]:
# Function to remove the sentence ids from a tensor dataset (used after the train/val/test split)
def index_remover(tensordata):
    """Return a copy of `tensordata` without its leading sentence-id column.

    Args:
        tensordata: An iterable dataset whose samples are 4-tuples
            ``(sentence_id, input_ids, attention_mask, label)``, for example a
            ``TensorDataset`` or a ``Subset`` produced by ``random_split``.

    Returns:
        A ``TensorDataset`` holding ``(input_ids, attention_masks, labels)``
        with the samples in the same order as in `tensordata`.
    """
    input_ids = []
    attention_masks = []
    labels = []

    # The sentence id (first item) is dropped on purpose.
    for _, ids, mask, label in tensordata:
        input_ids.append(torch.as_tensor(ids))
        attention_masks.append(torch.as_tensor(mask))
        labels.append(torch.as_tensor(label))

    if not input_ids:
        # torch.stack rejects an empty list; return an empty dataset instead.
        return TensorDataset(torch.tensor([]), torch.tensor([]), torch.tensor([]))

    # Stack the per-sample tensors directly instead of round-tripping every
    # value through Python lists.
    return TensorDataset(
        torch.stack(input_ids),
        torch.stack(attention_masks),
        torch.stack(labels),
    )
        
# Sanity check: strip the ids from the full BERT dataset and display the first sample
trial_dataset =  index_remover(bert_dataset)
trial_dataset[0]
# Each sample now holds three tensors (input ids, attention mask, label) and no sentence id

(tensor([  101,  1057,  1012,  1055,  1012,  5166,  2954,  8840, 22225,  1010,
         10643, 11238, 10807,  5896,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Create a 70-20-10 train-val-test split.
train_size = int(0.7 * len(bert_dataset))
rem_size = len(bert_dataset) - train_size

# 66% of the remaining 30% goes to validation, the rest to test.
val_size = int(0.66 * rem_size)
test_size = rem_size - val_size

# Both datasets hold the same sentences in the same order. Splitting each one
# with an identically seeded generator puts every sentence in the same
# partition for BERT and for RoBERTa, so the two models are trained and
# evaluated on the same samples. Independent draws from the global RNG would
# produce different partitions for the two models.
SPLIT_SEED = 0

# Divide the BERT dataset by randomly selecting samples.
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
bert_train_dataset, bert_rem_dataset = random_split(
    bert_dataset, [train_size, rem_size], generator=split_rng)
bert_val_dataset, bert_test_dataset = random_split(
    bert_rem_dataset, [val_size, test_size], generator=split_rng)

# Divide the RoBERTa dataset with the same random draws.
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
roberta_train_dataset, roberta_rem_dataset = random_split(
    roberta_dataset, [train_size, rem_size], generator=split_rng)
roberta_val_dataset, roberta_test_dataset = random_split(
    roberta_rem_dataset, [val_size, test_size], generator=split_rng)

In [None]:
# Collect the sentence ids of the validation samples. This has to run before the
# ids are stripped below, because afterwards the samples only have three items.
sentence_ids_list_valid = []
for a,b,c,d in bert_val_dataset:
  sentence_ids_list_valid.append(a.tolist())

# Remove the sentence ids from the train and validation sets so they can be used for training.
# NOTE(review): bert_test_dataset is not converted here, so its samples still carry
# the sentence id as first item - confirm the test loop expects 4-item batches.
# NOTE(review): this cell is not re-runnable as is; a second run would try to unpack
# four items from samples that now have three.
bert_train_dataset = index_remover(bert_train_dataset)
bert_val_dataset = index_remover(bert_val_dataset)

# Gather the training labels. The list is only collected here; nothing in this
# cell computes or prints a label distribution from it.
label_temp_list = []
for a,b,c in bert_train_dataset:
  label_temp_list.append(c)

print('{:>5,} training samples'.format(train_size))

# Same for the validation labels (this overwrites the training label list).
label_temp_list = []
for a,b,c in bert_val_dataset:
  label_temp_list.append(c)

print('{:>5,} validation samples'.format(val_size))

31,428 training samples
8,890 validation samples


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Batch size shared by every loader. For fine-tuning BERT on a specific task,
# the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training loaders: batches are drawn in random order.
bert_train_dataloader = DataLoader(bert_train_dataset,
                                   sampler=RandomSampler(bert_train_dataset),
                                   batch_size=batch_size)
roberta_train_dataloader = DataLoader(roberta_train_dataset,
                                      sampler=RandomSampler(roberta_train_dataset),
                                      batch_size=batch_size)

# Validation loaders: order doesn't matter for evaluation, so read sequentially.
bert_validation_dataloader = DataLoader(bert_val_dataset,
                                        sampler=SequentialSampler(bert_val_dataset),
                                        batch_size=batch_size)
roberta_validation_dataloader = DataLoader(roberta_val_dataset,
                                           sampler=SequentialSampler(roberta_val_dataset),
                                           batch_size=batch_size)

# Test loaders: held-out samples, also read sequentially.
bert_test_dataloader = DataLoader(bert_test_dataset,
                                  sampler=SequentialSampler(bert_test_dataset),
                                  batch_size=batch_size)
roberta_test_dataloader = DataLoader(roberta_test_dataset,
                                     sampler=SequentialSampler(roberta_test_dataset),
                                     batch_size=batch_size)

In [None]:
# Get all of the roberta_model's parameters as a list of (name, tensor) tuples.
params = list(roberta_model.named_parameters())

print('The RoBERTa model has {:} different named parameters.\n'.format(len(params)))

# Each section is a header line plus the slice of parameters listed under it.
param_sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for section_header, section_params in param_sections:
    print(section_header)
    for p in section_params:
        # Parameter name, left-aligned, followed by its shape as a tuple.
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The RoBERTa model has 201 different named parameters.

==== Embedding Layer ====

roberta.embeddings.word_embeddings.weight               (50265, 768)
roberta.embeddings.position_embeddings.weight             (514, 768)
roberta.embeddings.token_type_embeddings.weight             (1, 768)
roberta.embeddings.LayerNorm.weight                           (768,)
roberta.embeddings.LayerNorm.bias                             (768,)

==== First Transformer ====

roberta.encoder.layer.0.attention.self.query.weight       (768, 768)
roberta.encoder.layer.0.attention.self.query.bias             (768,)
roberta.encoder.layer.0.attention.self.key.weight         (768, 768)
roberta.encoder.layer.0.attention.self.key.bias               (768,)
roberta.encoder.layer.0.attention.self.value.weight       (768, 768)
roberta.encoder.layer.0.attention.self.value.bias             (768,)
roberta.encoder.layer.0.attention.output.dense.weight     (768, 768)
roberta.encoder.layer.0.attention.output.dense.bias         

In [None]:
# Note: this AdamW is the class imported from the huggingface `transformers`
# library (as opposed to pytorch); the 'W' refers to its weight-decay handling.
# Both models are optimised with identical hyper-parameters.
optimizer_settings = dict(
    lr=5e-5,   # args.learning_rate - default is 5e-5
    eps=1e-8,  # args.adam_epsilon  - default is 1e-8.
)

bert_optimizer = AdamW(bert_model.parameters(), **optimizer_settings)
roberta_optimizer = AdamW(roberta_model.parameters(), **optimizer_settings)



In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4.
# Two are used here because the model was seen to start overfitting beyond 2 epochs.
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples.)
total_steps = epochs * len(bert_train_dataloader)

# Linear learning-rate schedule, configured identically for both optimizers.
scheduler_settings = dict(
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

bert_scheduler = get_linear_schedule_with_warmup(bert_optimizer, **scheduler_settings)
roberta_scheduler = get_linear_schedule_with_warmup(roberta_optimizer, **scheduler_settings)

# ***Training Phase***

In [None]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max class equals the label.

    `preds` holds one row of class scores per sample; `labels` holds the
    true class ids and is flattened before the comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Convert a duration in seconds to the string form of a `datetime.timedelta`
    (h:mm:ss), after rounding to a whole number of seconds.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Fine-tune `bert_model` for `epochs` epochs, validating after each epoch and
# recording per-epoch statistics in `bert_training_stats`.
#
# NOTE: this notebook sets `ast_node_interactivity = 'all'`, so the value of
# every bare expression statement is echoed into the cell output -- even
# inside loops. Calls that return a value (`torch.manual_seed`,
# `bert_model.train()`, `bert_model.eval()`, `clip_grad_norm_`) are therefore
# bound to a name below; otherwise the log is flooded with the model repr and
# one `tensor(...)` gradient-norm line per batch.

# Set the seed value for every random number generator in use to make this
# reproducible.
seed_val = 100

random.seed(seed_val)
np.random.seed(seed_val)
_ = torch.manual_seed(seed_val)  # returns a Generator object; do not echo it
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings (one dict per epoch).
bert_training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. The call to `train` just changes the
    # *mode*, it doesn't *perform* the training: `dropout` and `batchnorm`
    # layers behave differently during training vs. test
    # (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    _ = bert_model.train()  # returns the model itself; do not echo its repr

    # For each batch of training data...
    for step, batch in enumerate(bert_train_dataloader):

        # Progress update every 40 batches (the very first batch is skipped).
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of this epoch, formatted by
            # `format_time`.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(bert_train_dataloader), elapsed))

        # Unpack this training batch from our dataloader, copying each tensor
        # to the selected device with the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        bert_model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # Documentation for this model:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # Because labels are provided, the output carries the loss at index 0
        # and the "logits" (the model outputs prior to activation) at index 1.
        t_model_output = bert_model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask,
                                    labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing
        # a single value; `.item()` returns it as a plain Python number.
        loss = t_model_output[0]
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem. The call returns the total gradient
        # norm, which is kept in a variable instead of being echoed per batch.
        grad_norm = torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        bert_optimizer.step()

        # Update the learning rate.
        bert_scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(bert_train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    _ = bert_model.eval()  # returns the model itself; do not echo its repr

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0  # not updated in this loop; kept as in the original cell

    # Evaluate data for one epoch
    for batch in bert_validation_dataloader:

        # Unpack this validation batch from our dataloader, copying each
        # tensor to the selected device with the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The "logits" are the output values prior to applying an
            # activation function like the softmax.
            val_model_output = bert_model(b_input_ids,
                                          token_type_ids=None,
                                          attention_mask=b_input_mask,
                                          labels=b_labels)

        loss = val_model_output[0]
        logits = val_model_output[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch and accumulate it over all
        # batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    # NOTE: this is the mean of the per-batch accuracies, so a smaller final
    # batch carries the same weight as a full one.
    avg_val_accuracy = total_eval_accuracy / len(bert_validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(bert_validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    bert_training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

<torch._C.Generator at 0x7f8507b540b0>


Training...


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

tensor(3.2400, device='cuda:0')

tensor(2.4794, device='cuda:0')

tensor(4.1267, device='cuda:0')

tensor(4.3752, device='cuda:0')

tensor(6.1359, device='cuda:0')

tensor(3.4778, device='cuda:0')

tensor(3.6162, device='cuda:0')

tensor(3.9534, device='cuda:0')

tensor(5.3718, device='cuda:0')

tensor(4.1586, device='cuda:0')

tensor(6.0873, device='cuda:0')

tensor(9.7997, device='cuda:0')

tensor(6.1966, device='cuda:0')

tensor(3.5015, device='cuda:0')

tensor(8.0948, device='cuda:0')

tensor(7.6973, device='cuda:0')

tensor(5.8592, device='cuda:0')

tensor(2.7080, device='cuda:0')

tensor(5.0034, device='cuda:0')

tensor(7.3653, device='cuda:0')

tensor(8.2597, device='cuda:0')

tensor(11.5239, device='cuda:0')

tensor(1.4894, device='cuda:0')

tensor(5.0340, device='cuda:0')

tensor(6.0790, device='cuda:0')

tensor(3.8485, device='cuda:0')

tensor(6.1499, device='cuda:0')

tensor(4.3985, device='cuda:0')

tensor(5.9221, device='cuda:0')

tensor(7.3831, device='cuda:0')

tensor(11.6291, device='cuda:0')

tensor(6.1486, device='cuda:0')

tensor(6.9631, device='cuda:0')

tensor(2.7956, device='cuda:0')

tensor(3.4766, device='cuda:0')

tensor(2.5850, device='cuda:0')

tensor(4.4450, device='cuda:0')

tensor(4.6147, device='cuda:0')

tensor(4.3896, device='cuda:0')

tensor(2.2217, device='cuda:0')

  Batch    40  of    983.    Elapsed: 0:00:48.


tensor(11.5375, device='cuda:0')

tensor(2.3714, device='cuda:0')

tensor(11.2926, device='cuda:0')

tensor(17.3651, device='cuda:0')

tensor(7.8712, device='cuda:0')

tensor(7.4736, device='cuda:0')

tensor(3.4259, device='cuda:0')

tensor(5.1217, device='cuda:0')

tensor(8.0357, device='cuda:0')

tensor(4.4808, device='cuda:0')

tensor(7.5073, device='cuda:0')

tensor(2.1968, device='cuda:0')

tensor(3.6280, device='cuda:0')

tensor(2.6823, device='cuda:0')

tensor(10.0943, device='cuda:0')

tensor(3.9863, device='cuda:0')

tensor(1.7058, device='cuda:0')

tensor(9.3434, device='cuda:0')

tensor(14.0958, device='cuda:0')

tensor(7.3314, device='cuda:0')

tensor(3.4126, device='cuda:0')

tensor(11.0026, device='cuda:0')

tensor(7.7959, device='cuda:0')

tensor(8.0593, device='cuda:0')

tensor(9.6514, device='cuda:0')

tensor(5.6897, device='cuda:0')

tensor(9.4871, device='cuda:0')

tensor(8.3508, device='cuda:0')

tensor(7.8745, device='cuda:0')

tensor(11.6050, device='cuda:0')

tensor(6.7505, device='cuda:0')

tensor(4.4802, device='cuda:0')

tensor(2.5215, device='cuda:0')

tensor(5.1218, device='cuda:0')

tensor(4.8022, device='cuda:0')

tensor(4.5457, device='cuda:0')

tensor(2.3582, device='cuda:0')

tensor(7.2600, device='cuda:0')

tensor(1.7975, device='cuda:0')

tensor(5.1662, device='cuda:0')

  Batch    80  of    983.    Elapsed: 0:01:36.


tensor(8.0342, device='cuda:0')

tensor(8.9937, device='cuda:0')

tensor(4.3921, device='cuda:0')

tensor(5.5870, device='cuda:0')

tensor(4.4686, device='cuda:0')

tensor(8.1051, device='cuda:0')

tensor(12.0411, device='cuda:0')

tensor(3.5553, device='cuda:0')

tensor(4.8925, device='cuda:0')

tensor(2.7738, device='cuda:0')

tensor(5.5567, device='cuda:0')

tensor(5.5062, device='cuda:0')

tensor(4.1080, device='cuda:0')

tensor(4.8244, device='cuda:0')

tensor(3.1278, device='cuda:0')

tensor(1.1501, device='cuda:0')

tensor(2.2432, device='cuda:0')

tensor(0.4976, device='cuda:0')

tensor(5.8460, device='cuda:0')

tensor(6.5750, device='cuda:0')

tensor(3.0492, device='cuda:0')

tensor(6.5392, device='cuda:0')

tensor(6.2351, device='cuda:0')

tensor(3.8317, device='cuda:0')

tensor(8.8076, device='cuda:0')

tensor(0.7316, device='cuda:0')

tensor(2.7379, device='cuda:0')

tensor(9.3225, device='cuda:0')

tensor(2.5347, device='cuda:0')

tensor(4.6246, device='cuda:0')

tensor(2.4348, device='cuda:0')

tensor(6.1241, device='cuda:0')

tensor(4.4741, device='cuda:0')

tensor(7.7417, device='cuda:0')

tensor(2.7379, device='cuda:0')

tensor(4.9929, device='cuda:0')

tensor(5.9323, device='cuda:0')

tensor(4.5222, device='cuda:0')

tensor(2.6713, device='cuda:0')

tensor(3.5021, device='cuda:0')

  Batch   120  of    983.    Elapsed: 0:02:24.


tensor(6.8293, device='cuda:0')

tensor(1.4389, device='cuda:0')

tensor(1.1746, device='cuda:0')

tensor(6.9723, device='cuda:0')

tensor(4.2658, device='cuda:0')

tensor(6.0401, device='cuda:0')

tensor(0.3880, device='cuda:0')

tensor(0.4566, device='cuda:0')

tensor(8.2623, device='cuda:0')

tensor(1.7087, device='cuda:0')

tensor(7.9776, device='cuda:0')

tensor(9.4405, device='cuda:0')

tensor(4.5076, device='cuda:0')

tensor(11.3160, device='cuda:0')

tensor(11.1205, device='cuda:0')

tensor(11.3264, device='cuda:0')

tensor(3.5372, device='cuda:0')

tensor(10.4499, device='cuda:0')

tensor(5.4755, device='cuda:0')

tensor(1.5996, device='cuda:0')

tensor(1.7592, device='cuda:0')

tensor(7.2320, device='cuda:0')

tensor(2.6270, device='cuda:0')

tensor(3.9460, device='cuda:0')

tensor(0.5042, device='cuda:0')

tensor(5.9452, device='cuda:0')

tensor(4.8314, device='cuda:0')

tensor(3.3953, device='cuda:0')

tensor(9.0197, device='cuda:0')

tensor(8.0767, device='cuda:0')

tensor(7.2251, device='cuda:0')

tensor(4.7017, device='cuda:0')

tensor(0.7363, device='cuda:0')

tensor(10.3133, device='cuda:0')

tensor(7.2173, device='cuda:0')

tensor(8.4295, device='cuda:0')

tensor(5.0202, device='cuda:0')

tensor(4.3799, device='cuda:0')

tensor(10.0808, device='cuda:0')

tensor(8.8932, device='cuda:0')

  Batch   160  of    983.    Elapsed: 0:03:12.


tensor(6.7979, device='cuda:0')

tensor(5.8894, device='cuda:0')

tensor(9.2534, device='cuda:0')

tensor(3.6969, device='cuda:0')

tensor(3.0631, device='cuda:0')

tensor(5.1656, device='cuda:0')

tensor(0.7080, device='cuda:0')

tensor(4.8256, device='cuda:0')

tensor(5.6435, device='cuda:0')

tensor(1.1253, device='cuda:0')

tensor(2.5851, device='cuda:0')

tensor(2.3581, device='cuda:0')

tensor(3.6806, device='cuda:0')

tensor(3.5841, device='cuda:0')

tensor(5.0210, device='cuda:0')

tensor(0.2363, device='cuda:0')

tensor(7.8869, device='cuda:0')

tensor(10.4640, device='cuda:0')

tensor(3.8218, device='cuda:0')

tensor(1.7070, device='cuda:0')

tensor(2.3121, device='cuda:0')

tensor(2.6649, device='cuda:0')

tensor(7.8685, device='cuda:0')

tensor(6.1159, device='cuda:0')

tensor(7.1992, device='cuda:0')

tensor(5.4722, device='cuda:0')

tensor(0.5114, device='cuda:0')

tensor(2.2580, device='cuda:0')

tensor(1.6738, device='cuda:0')

tensor(1.5538, device='cuda:0')

tensor(3.2220, device='cuda:0')

tensor(4.3193, device='cuda:0')

tensor(6.4358, device='cuda:0')

tensor(0.7724, device='cuda:0')

tensor(4.9302, device='cuda:0')

tensor(5.1258, device='cuda:0')

tensor(4.2508, device='cuda:0')

tensor(0.2685, device='cuda:0')

tensor(3.2136, device='cuda:0')

tensor(3.0641, device='cuda:0')

  Batch   200  of    983.    Elapsed: 0:04:00.


tensor(6.8448, device='cuda:0')

tensor(8.8841, device='cuda:0')

tensor(5.7661, device='cuda:0')

tensor(0.6321, device='cuda:0')

tensor(3.7959, device='cuda:0')

tensor(6.0079, device='cuda:0')

tensor(11.6779, device='cuda:0')

tensor(4.8912, device='cuda:0')

tensor(8.5083, device='cuda:0')

tensor(7.4764, device='cuda:0')

tensor(1.0209, device='cuda:0')

tensor(2.9185, device='cuda:0')

tensor(4.4103, device='cuda:0')

tensor(4.7421, device='cuda:0')

tensor(1.9145, device='cuda:0')

tensor(6.7598, device='cuda:0')

tensor(5.1288, device='cuda:0')

tensor(7.4703, device='cuda:0')

tensor(5.5877, device='cuda:0')

tensor(9.2107, device='cuda:0')

tensor(5.7659, device='cuda:0')

tensor(1.3648, device='cuda:0')

tensor(17.4058, device='cuda:0')

tensor(2.6522, device='cuda:0')

tensor(2.9943, device='cuda:0')

tensor(1.5259, device='cuda:0')

tensor(5.4080, device='cuda:0')

tensor(1.3305, device='cuda:0')

tensor(0.6710, device='cuda:0')

tensor(0.0951, device='cuda:0')

tensor(2.4863, device='cuda:0')

tensor(6.5683, device='cuda:0')

tensor(3.7414, device='cuda:0')

tensor(1.2229, device='cuda:0')

tensor(5.4214, device='cuda:0')

tensor(4.0240, device='cuda:0')

tensor(1.4216, device='cuda:0')

tensor(2.7673, device='cuda:0')

tensor(5.2867, device='cuda:0')

tensor(4.6184, device='cuda:0')

  Batch   240  of    983.    Elapsed: 0:04:48.


tensor(1.3611, device='cuda:0')

tensor(4.4253, device='cuda:0')

tensor(14.2064, device='cuda:0')

tensor(4.3546, device='cuda:0')

tensor(2.2949, device='cuda:0')

tensor(0.9315, device='cuda:0')

tensor(3.1875, device='cuda:0')

tensor(0.8627, device='cuda:0')

tensor(8.6549, device='cuda:0')

tensor(5.8736, device='cuda:0')

tensor(6.1037, device='cuda:0')

tensor(8.5920, device='cuda:0')

tensor(7.8566, device='cuda:0')

tensor(2.9329, device='cuda:0')

tensor(4.0270, device='cuda:0')

tensor(3.4833, device='cuda:0')

tensor(0.5006, device='cuda:0')

tensor(3.5000, device='cuda:0')

tensor(2.7779, device='cuda:0')

tensor(4.0111, device='cuda:0')

tensor(2.8761, device='cuda:0')

tensor(1.4664, device='cuda:0')

tensor(3.7859, device='cuda:0')

tensor(1.9107, device='cuda:0')

tensor(0.6550, device='cuda:0')

tensor(2.9541, device='cuda:0')

tensor(6.1852, device='cuda:0')

tensor(2.8542, device='cuda:0')

tensor(2.9395, device='cuda:0')

tensor(0.3573, device='cuda:0')

tensor(0.3102, device='cuda:0')

tensor(3.9401, device='cuda:0')

tensor(4.4985, device='cuda:0')

tensor(6.6824, device='cuda:0')

tensor(9.0034, device='cuda:0')

tensor(7.3744, device='cuda:0')

tensor(4.1307, device='cuda:0')

tensor(0.3739, device='cuda:0')

tensor(0.5584, device='cuda:0')

tensor(0.0521, device='cuda:0')

  Batch   280  of    983.    Elapsed: 0:05:37.


tensor(4.2624, device='cuda:0')

tensor(4.6727, device='cuda:0')

tensor(5.3964, device='cuda:0')

tensor(7.3233, device='cuda:0')

tensor(4.0487, device='cuda:0')

tensor(5.8886, device='cuda:0')

tensor(7.6537, device='cuda:0')

tensor(1.2343, device='cuda:0')

tensor(5.1000, device='cuda:0')

tensor(5.0987, device='cuda:0')

tensor(0.1756, device='cuda:0')

tensor(10.8908, device='cuda:0')

tensor(2.9964, device='cuda:0')

tensor(4.0243, device='cuda:0')

tensor(2.7965, device='cuda:0')

tensor(0.7818, device='cuda:0')

tensor(4.7794, device='cuda:0')

tensor(4.6913, device='cuda:0')

tensor(5.0795, device='cuda:0')

tensor(3.8797, device='cuda:0')

tensor(4.7067, device='cuda:0')

tensor(1.3255, device='cuda:0')

tensor(3.9754, device='cuda:0')

tensor(4.4223, device='cuda:0')

tensor(7.1531, device='cuda:0')

tensor(5.4608, device='cuda:0')

tensor(4.6213, device='cuda:0')

tensor(7.3365, device='cuda:0')

tensor(0.6658, device='cuda:0')

tensor(1.8413, device='cuda:0')

tensor(3.1143, device='cuda:0')

tensor(3.1249, device='cuda:0')

tensor(6.7928, device='cuda:0')

tensor(3.5894, device='cuda:0')

tensor(1.0147, device='cuda:0')

tensor(1.1016, device='cuda:0')

tensor(4.1985, device='cuda:0')

tensor(0.0806, device='cuda:0')

tensor(0.2068, device='cuda:0')

tensor(0.1510, device='cuda:0')

  Batch   320  of    983.    Elapsed: 0:06:25.


tensor(2.7905, device='cuda:0')

tensor(0.5631, device='cuda:0')

tensor(12.1859, device='cuda:0')

tensor(5.2999, device='cuda:0')

tensor(11.0909, device='cuda:0')

tensor(0.3382, device='cuda:0')

tensor(7.3494, device='cuda:0')

tensor(13.0253, device='cuda:0')

tensor(0.5792, device='cuda:0')

tensor(0.6679, device='cuda:0')

tensor(4.4006, device='cuda:0')

tensor(2.7775, device='cuda:0')

tensor(5.6745, device='cuda:0')

tensor(2.9509, device='cuda:0')

tensor(0.6307, device='cuda:0')

tensor(6.8697, device='cuda:0')

tensor(8.1161, device='cuda:0')

tensor(8.6242, device='cuda:0')

tensor(15.5565, device='cuda:0')

tensor(7.4491, device='cuda:0')

tensor(5.9529, device='cuda:0')

tensor(4.9377, device='cuda:0')

tensor(6.1097, device='cuda:0')

tensor(3.8005, device='cuda:0')

tensor(10.5559, device='cuda:0')

tensor(7.2414, device='cuda:0')

tensor(6.3452, device='cuda:0')

tensor(0.4985, device='cuda:0')

tensor(4.4116, device='cuda:0')

tensor(5.3156, device='cuda:0')

tensor(0.3311, device='cuda:0')

tensor(0.8700, device='cuda:0')

tensor(11.6097, device='cuda:0')

tensor(4.1781, device='cuda:0')

tensor(0.4916, device='cuda:0')

tensor(7.7721, device='cuda:0')

tensor(0.5988, device='cuda:0')

tensor(0.1911, device='cuda:0')

tensor(0.7270, device='cuda:0')

tensor(0.3964, device='cuda:0')

  Batch   360  of    983.    Elapsed: 0:07:13.


tensor(4.2574, device='cuda:0')

tensor(0.0321, device='cuda:0')

tensor(8.0237, device='cuda:0')

tensor(5.0053, device='cuda:0')

tensor(0.4811, device='cuda:0')

tensor(4.1540, device='cuda:0')

tensor(7.9925, device='cuda:0')

tensor(0.7531, device='cuda:0')

tensor(0.0437, device='cuda:0')

tensor(0.4841, device='cuda:0')

tensor(5.1724, device='cuda:0')

tensor(6.1695, device='cuda:0')

tensor(12.5908, device='cuda:0')

tensor(3.6837, device='cuda:0')

tensor(0.0407, device='cuda:0')

tensor(4.3967, device='cuda:0')

tensor(5.7489, device='cuda:0')

tensor(2.8278, device='cuda:0')

tensor(0.7445, device='cuda:0')

tensor(1.9765, device='cuda:0')

tensor(0.7705, device='cuda:0')

tensor(6.8323, device='cuda:0')

tensor(2.7867, device='cuda:0')

tensor(7.6719, device='cuda:0')

tensor(4.8477, device='cuda:0')

tensor(0.8041, device='cuda:0')

tensor(4.2260, device='cuda:0')

tensor(4.4730, device='cuda:0')

tensor(3.3321, device='cuda:0')

tensor(1.9816, device='cuda:0')

tensor(2.5739, device='cuda:0')

tensor(3.1727, device='cuda:0')

tensor(1.1578, device='cuda:0')

tensor(1.3212, device='cuda:0')

tensor(0.8430, device='cuda:0')

tensor(6.8473, device='cuda:0')

tensor(14.4312, device='cuda:0')

tensor(5.1367, device='cuda:0')

tensor(2.8093, device='cuda:0')

tensor(5.6702, device='cuda:0')

  Batch   400  of    983.    Elapsed: 0:08:01.


tensor(8.1922, device='cuda:0')

tensor(5.8945, device='cuda:0')

tensor(0.1608, device='cuda:0')

tensor(5.4874, device='cuda:0')

tensor(3.4074, device='cuda:0')

tensor(3.7221, device='cuda:0')

tensor(0.2863, device='cuda:0')

tensor(11.3396, device='cuda:0')

tensor(1.8323, device='cuda:0')

tensor(3.0036, device='cuda:0')

tensor(4.3483, device='cuda:0')

tensor(4.6409, device='cuda:0')

tensor(6.9662, device='cuda:0')

tensor(0.1329, device='cuda:0')

tensor(8.1191, device='cuda:0')

tensor(0.2122, device='cuda:0')

tensor(0.4028, device='cuda:0')

tensor(4.1051, device='cuda:0')

tensor(8.3983, device='cuda:0')

tensor(7.6426, device='cuda:0')

tensor(6.5513, device='cuda:0')

tensor(0.7377, device='cuda:0')

tensor(3.1497, device='cuda:0')

tensor(0.8028, device='cuda:0')

tensor(0.6363, device='cuda:0')

tensor(4.0597, device='cuda:0')

tensor(5.9327, device='cuda:0')

tensor(8.3752, device='cuda:0')

tensor(0.8299, device='cuda:0')

tensor(0.5989, device='cuda:0')

tensor(0.9255, device='cuda:0')

tensor(2.3415, device='cuda:0')

tensor(3.2363, device='cuda:0')

tensor(3.8668, device='cuda:0')

tensor(4.6859, device='cuda:0')

tensor(1.2178, device='cuda:0')

tensor(1.9742, device='cuda:0')

tensor(2.8439, device='cuda:0')

tensor(1.5393, device='cuda:0')

tensor(0.5747, device='cuda:0')

  Batch   440  of    983.    Elapsed: 0:08:50.


tensor(2.4137, device='cuda:0')

tensor(0.4304, device='cuda:0')

tensor(2.7368, device='cuda:0')

tensor(3.8061, device='cuda:0')

tensor(3.3131, device='cuda:0')

tensor(2.8510, device='cuda:0')

tensor(5.5580, device='cuda:0')

tensor(0.2629, device='cuda:0')

tensor(5.2106, device='cuda:0')

tensor(0.6849, device='cuda:0')

tensor(4.4106, device='cuda:0')

tensor(0.3911, device='cuda:0')

tensor(4.4872, device='cuda:0')

tensor(2.3394, device='cuda:0')

tensor(0.5586, device='cuda:0')

tensor(5.7782, device='cuda:0')

tensor(4.5621, device='cuda:0')

tensor(0.8290, device='cuda:0')

tensor(2.8161, device='cuda:0')

tensor(4.9765, device='cuda:0')

tensor(4.1315, device='cuda:0')

tensor(1.2964, device='cuda:0')

tensor(2.4676, device='cuda:0')

tensor(7.2730, device='cuda:0')

tensor(2.4759, device='cuda:0')

tensor(6.2755, device='cuda:0')

tensor(3.5471, device='cuda:0')

tensor(1.0656, device='cuda:0')

tensor(0.4988, device='cuda:0')

tensor(7.0266, device='cuda:0')

tensor(2.1583, device='cuda:0')

tensor(0.4594, device='cuda:0')

tensor(2.8551, device='cuda:0')

tensor(2.2515, device='cuda:0')

tensor(3.1582, device='cuda:0')

tensor(2.5690, device='cuda:0')

tensor(11.2724, device='cuda:0')

tensor(0.3905, device='cuda:0')

tensor(1.9625, device='cuda:0')

tensor(0.0517, device='cuda:0')

  Batch   480  of    983.    Elapsed: 0:09:38.


tensor(0.3933, device='cuda:0')

tensor(6.1389, device='cuda:0')

tensor(1.4006, device='cuda:0')

tensor(9.2425, device='cuda:0')

tensor(1.7033, device='cuda:0')

tensor(17.9012, device='cuda:0')

tensor(0.3632, device='cuda:0')

tensor(0.1285, device='cuda:0')

tensor(0.0996, device='cuda:0')

tensor(12.9788, device='cuda:0')

tensor(0.0243, device='cuda:0')

tensor(0.0698, device='cuda:0')

tensor(0.0288, device='cuda:0')

tensor(0.3453, device='cuda:0')

tensor(6.1043, device='cuda:0')

tensor(4.9230, device='cuda:0')

tensor(6.1633, device='cuda:0')

tensor(4.3982, device='cuda:0')

tensor(7.0636, device='cuda:0')

tensor(4.2663, device='cuda:0')

tensor(10.1167, device='cuda:0')

tensor(1.3874, device='cuda:0')

tensor(11.3358, device='cuda:0')

tensor(4.5510, device='cuda:0')

tensor(8.6775, device='cuda:0')

tensor(4.1888, device='cuda:0')

tensor(2.8288, device='cuda:0')

tensor(8.4045, device='cuda:0')

tensor(7.4303, device='cuda:0')

tensor(2.1543, device='cuda:0')

tensor(0.0327, device='cuda:0')

tensor(0.7525, device='cuda:0')

tensor(0.8936, device='cuda:0')

tensor(6.9085, device='cuda:0')

tensor(0.1025, device='cuda:0')

tensor(1.2461, device='cuda:0')

tensor(6.1941, device='cuda:0')

tensor(9.6636, device='cuda:0')

tensor(6.5122, device='cuda:0')

tensor(3.7175, device='cuda:0')

  Batch   520  of    983.    Elapsed: 0:10:26.


tensor(0.2430, device='cuda:0')

tensor(1.0815, device='cuda:0')

tensor(5.3951, device='cuda:0')

tensor(2.8559, device='cuda:0')

tensor(2.7445, device='cuda:0')

tensor(3.8307, device='cuda:0')

tensor(5.6153, device='cuda:0')

tensor(3.6663, device='cuda:0')

tensor(0.7329, device='cuda:0')

tensor(3.8229, device='cuda:0')

tensor(11.1107, device='cuda:0')

tensor(1.8710, device='cuda:0')

tensor(6.8902, device='cuda:0')

tensor(5.0208, device='cuda:0')

tensor(1.5461, device='cuda:0')

tensor(4.6367, device='cuda:0')

tensor(0.3127, device='cuda:0')

tensor(3.4240, device='cuda:0')

tensor(3.6633, device='cuda:0')

tensor(4.2519, device='cuda:0')

tensor(4.8121, device='cuda:0')

tensor(2.8229, device='cuda:0')

tensor(2.2567, device='cuda:0')

tensor(2.0898, device='cuda:0')

tensor(0.7876, device='cuda:0')

tensor(2.7171, device='cuda:0')

tensor(2.1154, device='cuda:0')

tensor(1.8439, device='cuda:0')

tensor(0.1875, device='cuda:0')

tensor(3.3463, device='cuda:0')

tensor(0.8310, device='cuda:0')

tensor(3.4266, device='cuda:0')

tensor(0.6080, device='cuda:0')

tensor(0.1721, device='cuda:0')

tensor(1.2673, device='cuda:0')

tensor(6.6013, device='cuda:0')

tensor(1.4955, device='cuda:0')

tensor(1.1470, device='cuda:0')

tensor(0.1539, device='cuda:0')

tensor(2.5525, device='cuda:0')

  Batch   560  of    983.    Elapsed: 0:11:14.


tensor(1.2489, device='cuda:0')

tensor(0.8094, device='cuda:0')

tensor(3.2822, device='cuda:0')

tensor(7.6304, device='cuda:0')

tensor(0.1180, device='cuda:0')

tensor(0.0664, device='cuda:0')

tensor(0.0441, device='cuda:0')

tensor(12.6535, device='cuda:0')

tensor(3.2165, device='cuda:0')

tensor(4.4196, device='cuda:0')

tensor(0.1221, device='cuda:0')

tensor(0.0191, device='cuda:0')

tensor(0.1083, device='cuda:0')

tensor(15.9089, device='cuda:0')

tensor(6.2449, device='cuda:0')

tensor(5.4499, device='cuda:0')

tensor(0.4064, device='cuda:0')

tensor(0.3362, device='cuda:0')

tensor(4.4261, device='cuda:0')

tensor(6.9480, device='cuda:0')

tensor(14.8388, device='cuda:0')

tensor(0.8220, device='cuda:0')

tensor(0.5975, device='cuda:0')

tensor(2.9283, device='cuda:0')

tensor(3.1365, device='cuda:0')

tensor(13.9058, device='cuda:0')

tensor(20.5879, device='cuda:0')

tensor(3.1747, device='cuda:0')

tensor(2.5953, device='cuda:0')

tensor(0.0990, device='cuda:0')

tensor(0.1109, device='cuda:0')

tensor(8.1546, device='cuda:0')

tensor(7.9053, device='cuda:0')

tensor(0.3240, device='cuda:0')

tensor(9.1946, device='cuda:0')

tensor(3.3828, device='cuda:0')

tensor(9.8623, device='cuda:0')

tensor(17.1267, device='cuda:0')

tensor(10.2356, device='cuda:0')

tensor(4.8916, device='cuda:0')

  Batch   600  of    983.    Elapsed: 0:12:02.


tensor(2.5830, device='cuda:0')

tensor(6.7197, device='cuda:0')

tensor(0.1331, device='cuda:0')

tensor(5.7228, device='cuda:0')

tensor(4.8793, device='cuda:0')

tensor(4.9170, device='cuda:0')

tensor(7.9827, device='cuda:0')

tensor(2.7595, device='cuda:0')

tensor(6.7921, device='cuda:0')

tensor(0.3949, device='cuda:0')

tensor(0.8406, device='cuda:0')

tensor(1.3154, device='cuda:0')

tensor(1.1325, device='cuda:0')

tensor(0.5574, device='cuda:0')

tensor(5.0751, device='cuda:0')

tensor(6.0429, device='cuda:0')

tensor(10.2054, device='cuda:0')

tensor(8.3941, device='cuda:0')

tensor(9.3441, device='cuda:0')

tensor(7.1441, device='cuda:0')

tensor(3.1352, device='cuda:0')

tensor(3.3436, device='cuda:0')

tensor(2.4927, device='cuda:0')

tensor(5.9212, device='cuda:0')

tensor(6.4998, device='cuda:0')

tensor(2.2904, device='cuda:0')

tensor(2.2912, device='cuda:0')

tensor(0.2448, device='cuda:0')

tensor(3.1775, device='cuda:0')

tensor(0.2907, device='cuda:0')

tensor(1.6487, device='cuda:0')

tensor(2.4116, device='cuda:0')

tensor(4.3341, device='cuda:0')

tensor(5.7642, device='cuda:0')

tensor(1.5792, device='cuda:0')

tensor(2.4289, device='cuda:0')

tensor(3.4509, device='cuda:0')

tensor(2.7969, device='cuda:0')

tensor(3.8577, device='cuda:0')

tensor(3.5752, device='cuda:0')

  Batch   640  of    983.    Elapsed: 0:12:50.


tensor(0.7309, device='cuda:0')

tensor(2.7984, device='cuda:0')

tensor(3.3318, device='cuda:0')

tensor(0.6402, device='cuda:0')

tensor(4.3667, device='cuda:0')

tensor(0.4465, device='cuda:0')

tensor(4.8432, device='cuda:0')

tensor(1.8396, device='cuda:0')

tensor(2.7486, device='cuda:0')

tensor(0.1535, device='cuda:0')

tensor(4.4480, device='cuda:0')

tensor(4.0713, device='cuda:0')

tensor(0.1715, device='cuda:0')

tensor(8.2485, device='cuda:0')

tensor(2.8587, device='cuda:0')

tensor(5.3237, device='cuda:0')

tensor(0.0844, device='cuda:0')

tensor(6.0903, device='cuda:0')

tensor(2.8444, device='cuda:0')

tensor(0.3906, device='cuda:0')

tensor(7.6935, device='cuda:0')

tensor(3.8530, device='cuda:0')

tensor(0.9175, device='cuda:0')

tensor(6.6803, device='cuda:0')

tensor(5.7133, device='cuda:0')

tensor(0.9341, device='cuda:0')

tensor(3.4244, device='cuda:0')

tensor(5.2022, device='cuda:0')

tensor(2.9341, device='cuda:0')

tensor(5.5564, device='cuda:0')

tensor(1.2717, device='cuda:0')

tensor(3.6775, device='cuda:0')

tensor(7.6654, device='cuda:0')

tensor(3.8446, device='cuda:0')

tensor(4.1480, device='cuda:0')

tensor(1.9115, device='cuda:0')

tensor(0.0590, device='cuda:0')

tensor(0.2453, device='cuda:0')

tensor(4.4718, device='cuda:0')

tensor(0.5664, device='cuda:0')

  Batch   680  of    983.    Elapsed: 0:13:39.


tensor(0.3244, device='cuda:0')

tensor(5.8046, device='cuda:0')

tensor(2.3538, device='cuda:0')

tensor(3.4162, device='cuda:0')

tensor(0.8403, device='cuda:0')

tensor(0.4392, device='cuda:0')

tensor(2.6602, device='cuda:0')

tensor(4.4603, device='cuda:0')

tensor(2.0699, device='cuda:0')

tensor(0.4492, device='cuda:0')

tensor(1.4179, device='cuda:0')

tensor(4.0767, device='cuda:0')

tensor(1.7426, device='cuda:0')

tensor(0.2976, device='cuda:0')

tensor(3.6325, device='cuda:0')

tensor(0.4157, device='cuda:0')

tensor(0.4759, device='cuda:0')

tensor(0.5087, device='cuda:0')

tensor(0.1307, device='cuda:0')

tensor(0.1960, device='cuda:0')

tensor(5.7744, device='cuda:0')

tensor(2.0728, device='cuda:0')

tensor(5.4362, device='cuda:0')

tensor(5.5844, device='cuda:0')

tensor(2.5065, device='cuda:0')

tensor(6.1535, device='cuda:0')

tensor(3.5846, device='cuda:0')

tensor(0.2734, device='cuda:0')

tensor(4.0260, device='cuda:0')

tensor(3.8452, device='cuda:0')

tensor(2.2029, device='cuda:0')

tensor(5.1720, device='cuda:0')

tensor(0.5973, device='cuda:0')

tensor(5.3497, device='cuda:0')

tensor(5.4240, device='cuda:0')

tensor(2.4537, device='cuda:0')

tensor(0.0575, device='cuda:0')

tensor(0.1203, device='cuda:0')

tensor(0.3646, device='cuda:0')

tensor(0.6152, device='cuda:0')

  Batch   720  of    983.    Elapsed: 0:14:27.


tensor(0.3239, device='cuda:0')

tensor(4.2198, device='cuda:0')

tensor(6.3224, device='cuda:0')

tensor(3.2910, device='cuda:0')

tensor(9.6938, device='cuda:0')

tensor(1.3474, device='cuda:0')

tensor(0.2343, device='cuda:0')

tensor(18.5472, device='cuda:0')

tensor(8.8146, device='cuda:0')

tensor(10.7917, device='cuda:0')

tensor(3.1431, device='cuda:0')

tensor(1.6187, device='cuda:0')

tensor(4.5594, device='cuda:0')

tensor(4.7824, device='cuda:0')

tensor(4.9501, device='cuda:0')

tensor(9.9499, device='cuda:0')

tensor(19.7379, device='cuda:0')

tensor(3.2619, device='cuda:0')

tensor(0.0839, device='cuda:0')

tensor(6.0794, device='cuda:0')

tensor(4.2210, device='cuda:0')

tensor(4.3127, device='cuda:0')

tensor(0.1101, device='cuda:0')

tensor(1.1053, device='cuda:0')

tensor(6.4185, device='cuda:0')

tensor(5.8541, device='cuda:0')

tensor(6.8157, device='cuda:0')

tensor(0.1062, device='cuda:0')

tensor(3.7229, device='cuda:0')

tensor(1.7862, device='cuda:0')

tensor(1.0435, device='cuda:0')

tensor(0.8886, device='cuda:0')

tensor(0.5187, device='cuda:0')

tensor(0.3610, device='cuda:0')

tensor(7.2720, device='cuda:0')

tensor(8.2316, device='cuda:0')

tensor(3.2497, device='cuda:0')

tensor(9.4858, device='cuda:0')

tensor(1.8758, device='cuda:0')

tensor(1.3866, device='cuda:0')

  Batch   760  of    983.    Elapsed: 0:15:15.


tensor(0.1581, device='cuda:0')

tensor(15.3767, device='cuda:0')

tensor(2.5568, device='cuda:0')

tensor(6.4671, device='cuda:0')

tensor(2.0979, device='cuda:0')

tensor(0.2130, device='cuda:0')

tensor(0.1759, device='cuda:0')

tensor(1.2896, device='cuda:0')

tensor(1.6696, device='cuda:0')

tensor(4.9644, device='cuda:0')

tensor(5.1849, device='cuda:0')

tensor(0.2722, device='cuda:0')

tensor(5.8077, device='cuda:0')

tensor(0.0632, device='cuda:0')

tensor(1.2607, device='cuda:0')

tensor(0.0826, device='cuda:0')

tensor(3.4520, device='cuda:0')

tensor(2.1981, device='cuda:0')

tensor(5.8288, device='cuda:0')

tensor(0.3935, device='cuda:0')

tensor(3.6005, device='cuda:0')

tensor(7.4449, device='cuda:0')

tensor(9.5581, device='cuda:0')

tensor(7.0596, device='cuda:0')

tensor(1.2906, device='cuda:0')

tensor(5.2997, device='cuda:0')

tensor(5.2711, device='cuda:0')

tensor(7.1862, device='cuda:0')

tensor(0.0487, device='cuda:0')

tensor(0.7320, device='cuda:0')

tensor(0.0645, device='cuda:0')

tensor(4.7589, device='cuda:0')

tensor(0.0774, device='cuda:0')

tensor(4.1537, device='cuda:0')

tensor(5.0528, device='cuda:0')

tensor(3.3224, device='cuda:0')

tensor(0.8673, device='cuda:0')

tensor(6.1049, device='cuda:0')

tensor(4.6041, device='cuda:0')

tensor(0.0829, device='cuda:0')

  Batch   800  of    983.    Elapsed: 0:16:03.


tensor(0.2463, device='cuda:0')

tensor(1.3046, device='cuda:0')

tensor(0.1104, device='cuda:0')

tensor(2.3267, device='cuda:0')

tensor(1.0449, device='cuda:0')

tensor(5.1241, device='cuda:0')

tensor(5.2715, device='cuda:0')

tensor(4.5095, device='cuda:0')

tensor(0.0385, device='cuda:0')

tensor(0.2205, device='cuda:0')

tensor(1.3335, device='cuda:0')

tensor(0.5197, device='cuda:0')

tensor(0.1306, device='cuda:0')

tensor(1.0274, device='cuda:0')

tensor(6.2076, device='cuda:0')

tensor(0.5885, device='cuda:0')

tensor(1.1234, device='cuda:0')

tensor(0.3294, device='cuda:0')

tensor(5.2190, device='cuda:0')

tensor(9.4857, device='cuda:0')

tensor(1.0615, device='cuda:0')

tensor(0.1902, device='cuda:0')

tensor(0.6303, device='cuda:0')

tensor(0.1349, device='cuda:0')

tensor(7.6989, device='cuda:0')

tensor(0.6023, device='cuda:0')

tensor(3.5846, device='cuda:0')

tensor(0.3470, device='cuda:0')

tensor(0.1862, device='cuda:0')

tensor(2.0876, device='cuda:0')

tensor(4.5612, device='cuda:0')

tensor(3.6004, device='cuda:0')

tensor(0.0285, device='cuda:0')

tensor(9.3490, device='cuda:0')

tensor(4.8155, device='cuda:0')

tensor(1.1589, device='cuda:0')

tensor(9.8204, device='cuda:0')

tensor(0.2363, device='cuda:0')

tensor(0.1057, device='cuda:0')

tensor(2.8368, device='cuda:0')

  Batch   840  of    983.    Elapsed: 0:16:51.


tensor(1.0316, device='cuda:0')

tensor(0.4615, device='cuda:0')

tensor(8.3271, device='cuda:0')

tensor(3.3256, device='cuda:0')

tensor(6.8720, device='cuda:0')

tensor(6.4127, device='cuda:0')

tensor(0.2039, device='cuda:0')

tensor(6.0711, device='cuda:0')

tensor(0.5536, device='cuda:0')

tensor(0.5480, device='cuda:0')

tensor(2.1910, device='cuda:0')

tensor(1.7934, device='cuda:0')

tensor(9.4270, device='cuda:0')

tensor(5.3370, device='cuda:0')

tensor(0.1953, device='cuda:0')

tensor(4.3505, device='cuda:0')

tensor(0.2058, device='cuda:0')

tensor(0.2126, device='cuda:0')

tensor(0.3478, device='cuda:0')

tensor(6.3049, device='cuda:0')

tensor(2.3312, device='cuda:0')

tensor(0.0980, device='cuda:0')

tensor(0.1517, device='cuda:0')

tensor(1.5417, device='cuda:0')

tensor(5.6098, device='cuda:0')

tensor(4.0984, device='cuda:0')

tensor(0.1272, device='cuda:0')

tensor(0.1720, device='cuda:0')

tensor(2.5883, device='cuda:0')

tensor(1.7023, device='cuda:0')

tensor(1.1699, device='cuda:0')

tensor(3.1784, device='cuda:0')

tensor(5.7865, device='cuda:0')

tensor(6.3462, device='cuda:0')

tensor(0.3069, device='cuda:0')

tensor(3.3134, device='cuda:0')

tensor(0.4353, device='cuda:0')

tensor(5.7084, device='cuda:0')

tensor(0.4296, device='cuda:0')

tensor(8.1014, device='cuda:0')

  Batch   880  of    983.    Elapsed: 0:17:40.


tensor(3.7069, device='cuda:0')

tensor(0.3499, device='cuda:0')

tensor(1.0520, device='cuda:0')

tensor(2.0361, device='cuda:0')

tensor(0.5369, device='cuda:0')

tensor(0.3466, device='cuda:0')

tensor(3.4496, device='cuda:0')

tensor(4.7837, device='cuda:0')

tensor(0.0608, device='cuda:0')

tensor(3.3792, device='cuda:0')

tensor(1.1467, device='cuda:0')

tensor(2.2745, device='cuda:0')

tensor(3.1655, device='cuda:0')

tensor(6.7196, device='cuda:0')

tensor(7.0377, device='cuda:0')

tensor(0.0824, device='cuda:0')

tensor(5.8089, device='cuda:0')

tensor(0.1397, device='cuda:0')

tensor(3.8198, device='cuda:0')

tensor(7.8977, device='cuda:0')

tensor(4.3903, device='cuda:0')

tensor(0.3530, device='cuda:0')

tensor(7.9921, device='cuda:0')

tensor(1.9806, device='cuda:0')

tensor(0.0411, device='cuda:0')

tensor(0.0841, device='cuda:0')

tensor(6.8955, device='cuda:0')

tensor(0.6790, device='cuda:0')

tensor(11.7221, device='cuda:0')

tensor(6.3838, device='cuda:0')

tensor(2.3706, device='cuda:0')

tensor(5.0048, device='cuda:0')

tensor(0.2630, device='cuda:0')

tensor(0.1119, device='cuda:0')

tensor(0.0557, device='cuda:0')

tensor(6.1895, device='cuda:0')

tensor(6.6809, device='cuda:0')

tensor(1.7108, device='cuda:0')

tensor(0.2849, device='cuda:0')

tensor(1.3442, device='cuda:0')

  Batch   920  of    983.    Elapsed: 0:18:28.


tensor(1.6507, device='cuda:0')

tensor(1.6793, device='cuda:0')

tensor(0.1475, device='cuda:0')

tensor(1.8818, device='cuda:0')

tensor(0.6637, device='cuda:0')

tensor(0.5507, device='cuda:0')

tensor(6.0453, device='cuda:0')

tensor(0.9722, device='cuda:0')

tensor(5.1992, device='cuda:0')

tensor(7.9385, device='cuda:0')

tensor(0.0175, device='cuda:0')

tensor(2.2996, device='cuda:0')

tensor(3.2113, device='cuda:0')

tensor(9.4813, device='cuda:0')

tensor(0.1454, device='cuda:0')

tensor(6.0220, device='cuda:0')

tensor(0.1563, device='cuda:0')

tensor(0.2948, device='cuda:0')

tensor(0.3518, device='cuda:0')

tensor(2.8160, device='cuda:0')

tensor(3.5079, device='cuda:0')

tensor(3.1028, device='cuda:0')

tensor(3.0501, device='cuda:0')

tensor(0.4168, device='cuda:0')

tensor(4.9018, device='cuda:0')

tensor(0.0852, device='cuda:0')

tensor(0.4464, device='cuda:0')

tensor(5.9273, device='cuda:0')

tensor(7.2204, device='cuda:0')

tensor(0.3760, device='cuda:0')

tensor(0.2793, device='cuda:0')

tensor(0.2387, device='cuda:0')

tensor(1.7485, device='cuda:0')

tensor(0.2076, device='cuda:0')

tensor(2.1482, device='cuda:0')

tensor(0.2343, device='cuda:0')

tensor(5.7134, device='cuda:0')

tensor(0.8307, device='cuda:0')

tensor(3.8759, device='cuda:0')

tensor(2.7247, device='cuda:0')

  Batch   960  of    983.    Elapsed: 0:19:16.


tensor(2.8343, device='cuda:0')

tensor(1.0547, device='cuda:0')

tensor(4.3336, device='cuda:0')

tensor(10.6383, device='cuda:0')

tensor(2.5365, device='cuda:0')

tensor(0.2237, device='cuda:0')

tensor(6.0637, device='cuda:0')

tensor(11.3305, device='cuda:0')

tensor(1.5170, device='cuda:0')

tensor(0.0825, device='cuda:0')

tensor(1.2437, device='cuda:0')

tensor(0.0407, device='cuda:0')

tensor(6.5320, device='cuda:0')

tensor(6.6796, device='cuda:0')

tensor(0.1197, device='cuda:0')

tensor(0.1630, device='cuda:0')

tensor(0.0799, device='cuda:0')

tensor(5.5505, device='cuda:0')

tensor(0.3333, device='cuda:0')

tensor(0.0806, device='cuda:0')

tensor(1.2200, device='cuda:0')

tensor(10.9265, device='cuda:0')

tensor(0.0790, device='cuda:0')


  Average training loss: 0.09
  Training epcoh took: 0:19:43

Running Validation...


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

  Accuracy: 0.98
  Validation Loss: 0.08
  Validation took: 0:02:01

Training...


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

tensor(1.1244, device='cuda:0')

tensor(1.0392, device='cuda:0')

tensor(0.4022, device='cuda:0')

tensor(0.0964, device='cuda:0')

tensor(0.0270, device='cuda:0')

tensor(3.5772, device='cuda:0')

tensor(1.1578, device='cuda:0')

tensor(0.5659, device='cuda:0')

tensor(0.1574, device='cuda:0')

tensor(3.3395, device='cuda:0')

tensor(0.1910, device='cuda:0')

tensor(0.1118, device='cuda:0')

tensor(4.8130, device='cuda:0')

tensor(0.1233, device='cuda:0')

tensor(0.5381, device='cuda:0')

tensor(11.6534, device='cuda:0')

tensor(0.0784, device='cuda:0')

tensor(0.2167, device='cuda:0')

tensor(0.3432, device='cuda:0')

tensor(7.2863, device='cuda:0')

tensor(0.1720, device='cuda:0')

tensor(0.0897, device='cuda:0')

tensor(0.0570, device='cuda:0')

tensor(4.7987, device='cuda:0')

tensor(0.4349, device='cuda:0')

tensor(5.7593, device='cuda:0')

tensor(0.1023, device='cuda:0')

tensor(8.7628, device='cuda:0')

tensor(0.2434, device='cuda:0')

tensor(2.1179, device='cuda:0')

tensor(0.0193, device='cuda:0')

tensor(0.4178, device='cuda:0')

tensor(0.1624, device='cuda:0')

tensor(0.1302, device='cuda:0')

tensor(8.7883, device='cuda:0')

tensor(0.2878, device='cuda:0')

tensor(0.0243, device='cuda:0')

tensor(0.0735, device='cuda:0')

tensor(0.0813, device='cuda:0')

tensor(0.4193, device='cuda:0')

  Batch    40  of    983.    Elapsed: 0:00:48.


tensor(0.2800, device='cuda:0')

tensor(26.7245, device='cuda:0')

tensor(4.2747, device='cuda:0')

tensor(9.8862, device='cuda:0')

tensor(0.0516, device='cuda:0')

tensor(2.0924, device='cuda:0')

tensor(0.0341, device='cuda:0')

tensor(0.0127, device='cuda:0')

tensor(0.0801, device='cuda:0')

tensor(0.0104, device='cuda:0')

tensor(9.7203, device='cuda:0')

tensor(0.0091, device='cuda:0')

tensor(0.0051, device='cuda:0')

tensor(0.0978, device='cuda:0')

tensor(0.0409, device='cuda:0')

tensor(0.0075, device='cuda:0')

tensor(0.0124, device='cuda:0')

tensor(0.0553, device='cuda:0')

tensor(13.7058, device='cuda:0')

tensor(0.0053, device='cuda:0')

tensor(0.0031, device='cuda:0')

tensor(4.3009, device='cuda:0')

tensor(0.0137, device='cuda:0')

tensor(0.1101, device='cuda:0')

tensor(0.0485, device='cuda:0')

tensor(5.0604, device='cuda:0')

tensor(0.0052, device='cuda:0')

tensor(2.1737, device='cuda:0')

tensor(0.5003, device='cuda:0')

tensor(0.4478, device='cuda:0')

tensor(0.1391, device='cuda:0')

tensor(10.4848, device='cuda:0')

tensor(11.3390, device='cuda:0')

tensor(11.3980, device='cuda:0')

tensor(6.4511, device='cuda:0')

tensor(0.0116, device='cuda:0')

tensor(5.3200, device='cuda:0')

tensor(0.2183, device='cuda:0')

tensor(12.8905, device='cuda:0')

tensor(0.0579, device='cuda:0')

  Batch    80  of    983.    Elapsed: 0:01:37.


tensor(1.2945, device='cuda:0')

tensor(0.5117, device='cuda:0')

tensor(0.5821, device='cuda:0')

tensor(15.2766, device='cuda:0')

tensor(0.0892, device='cuda:0')

tensor(0.0136, device='cuda:0')

tensor(0.2968, device='cuda:0')

tensor(0.0315, device='cuda:0')

tensor(0.0177, device='cuda:0')

tensor(0.2715, device='cuda:0')

tensor(1.1018, device='cuda:0')

tensor(1.6251, device='cuda:0')

tensor(1.1579, device='cuda:0')

tensor(0.0098, device='cuda:0')

tensor(0.0828, device='cuda:0')

tensor(0.0271, device='cuda:0')

tensor(0.0132, device='cuda:0')

tensor(5.1396, device='cuda:0')

tensor(5.0098, device='cuda:0')

tensor(0.0625, device='cuda:0')

tensor(0.1645, device='cuda:0')

tensor(0.7330, device='cuda:0')

tensor(0.0917, device='cuda:0')

tensor(11.5996, device='cuda:0')

tensor(0.0395, device='cuda:0')

tensor(0.5335, device='cuda:0')

tensor(3.8528, device='cuda:0')

tensor(0.0590, device='cuda:0')

tensor(7.6534, device='cuda:0')

tensor(2.7187, device='cuda:0')

tensor(0.0175, device='cuda:0')

tensor(0.0168, device='cuda:0')

tensor(0.0368, device='cuda:0')

tensor(0.7661, device='cuda:0')

tensor(0.1397, device='cuda:0')

tensor(0.0824, device='cuda:0')

tensor(0.1795, device='cuda:0')

tensor(0.0442, device='cuda:0')

tensor(0.0783, device='cuda:0')

tensor(1.1947, device='cuda:0')

  Batch   120  of    983.    Elapsed: 0:02:25.


tensor(2.3868, device='cuda:0')

tensor(0.0276, device='cuda:0')

tensor(0.2612, device='cuda:0')

tensor(0.0237, device='cuda:0')

tensor(8.5573, device='cuda:0')

tensor(1.6135, device='cuda:0')

tensor(0.1584, device='cuda:0')

tensor(0.9630, device='cuda:0')

tensor(0.0575, device='cuda:0')

tensor(0.0370, device='cuda:0')

tensor(0.0276, device='cuda:0')

tensor(0.6796, device='cuda:0')

tensor(8.6011, device='cuda:0')

tensor(5.9460, device='cuda:0')

tensor(0.3255, device='cuda:0')

tensor(0.2425, device='cuda:0')

tensor(4.6479, device='cuda:0')

tensor(0.0518, device='cuda:0')

tensor(0.0623, device='cuda:0')

tensor(0.5555, device='cuda:0')

tensor(0.0282, device='cuda:0')

tensor(0.0218, device='cuda:0')

tensor(7.0686, device='cuda:0')

tensor(0.0679, device='cuda:0')

tensor(1.4516, device='cuda:0')

tensor(11.9089, device='cuda:0')

tensor(0.3941, device='cuda:0')

tensor(0.0279, device='cuda:0')

tensor(0.0394, device='cuda:0')

tensor(6.5213, device='cuda:0')

tensor(0.0216, device='cuda:0')

tensor(0.1151, device='cuda:0')

tensor(4.0946, device='cuda:0')

tensor(0.0324, device='cuda:0')

tensor(0.0068, device='cuda:0')

tensor(3.3809, device='cuda:0')

tensor(0.0241, device='cuda:0')

tensor(1.5853, device='cuda:0')

tensor(0.0045, device='cuda:0')

tensor(0.0131, device='cuda:0')

  Batch   160  of    983.    Elapsed: 0:03:13.


tensor(0.1149, device='cuda:0')

tensor(0.0858, device='cuda:0')

tensor(6.5731, device='cuda:0')

tensor(8.3347, device='cuda:0')

tensor(0.0643, device='cuda:0')

tensor(0.0772, device='cuda:0')

tensor(2.7140, device='cuda:0')

tensor(0.0829, device='cuda:0')

tensor(0.0416, device='cuda:0')

tensor(0.0290, device='cuda:0')

tensor(1.3901, device='cuda:0')

tensor(12.1922, device='cuda:0')

tensor(0.0136, device='cuda:0')

tensor(0.3607, device='cuda:0')

tensor(0.2415, device='cuda:0')

tensor(0.1391, device='cuda:0')

tensor(0.2332, device='cuda:0')

tensor(0.0189, device='cuda:0')

tensor(2.7546, device='cuda:0')

tensor(0.0628, device='cuda:0')

tensor(2.8861, device='cuda:0')

tensor(5.1606, device='cuda:0')

tensor(0.0639, device='cuda:0')

tensor(0.0620, device='cuda:0')

tensor(0.0371, device='cuda:0')

tensor(3.1220, device='cuda:0')

tensor(1.3590, device='cuda:0')

tensor(0.2746, device='cuda:0')

tensor(0.0300, device='cuda:0')

tensor(0.0140, device='cuda:0')

tensor(0.0593, device='cuda:0')

tensor(6.0675, device='cuda:0')

tensor(2.4217, device='cuda:0')

tensor(7.5637, device='cuda:0')

tensor(0.2169, device='cuda:0')

tensor(0.1585, device='cuda:0')

tensor(0.0426, device='cuda:0')

tensor(0.0277, device='cuda:0')

tensor(5.1288, device='cuda:0')

tensor(0.1417, device='cuda:0')

  Batch   200  of    983.    Elapsed: 0:04:01.


tensor(0.0275, device='cuda:0')

tensor(0.0213, device='cuda:0')

tensor(7.1180, device='cuda:0')

tensor(0.1741, device='cuda:0')

tensor(1.4256, device='cuda:0')

tensor(0.0301, device='cuda:0')

tensor(0.2383, device='cuda:0')

tensor(3.9050, device='cuda:0')

tensor(0.0655, device='cuda:0')

tensor(0.0182, device='cuda:0')

tensor(0.0455, device='cuda:0')

tensor(5.0726, device='cuda:0')

tensor(4.6326, device='cuda:0')

tensor(9.8699, device='cuda:0')

tensor(0.6538, device='cuda:0')

tensor(7.2051, device='cuda:0')

tensor(0.9419, device='cuda:0')

tensor(0.0384, device='cuda:0')

tensor(0.3912, device='cuda:0')

tensor(4.1044, device='cuda:0')

tensor(0.0485, device='cuda:0')

tensor(0.0159, device='cuda:0')

tensor(0.2202, device='cuda:0')

tensor(0.0222, device='cuda:0')

tensor(0.2768, device='cuda:0')

tensor(0.6592, device='cuda:0')

tensor(4.2099, device='cuda:0')

tensor(0.3204, device='cuda:0')

tensor(0.0643, device='cuda:0')

tensor(2.7671, device='cuda:0')

tensor(2.7456, device='cuda:0')

tensor(0.0897, device='cuda:0')

tensor(0.1881, device='cuda:0')

tensor(0.0612, device='cuda:0')

tensor(0.0986, device='cuda:0')

tensor(9.1017, device='cuda:0')

tensor(0.1146, device='cuda:0')

tensor(0.2725, device='cuda:0')

tensor(0.0958, device='cuda:0')

tensor(1.3746, device='cuda:0')

  Batch   240  of    983.    Elapsed: 0:04:49.


tensor(2.5645, device='cuda:0')

tensor(0.0236, device='cuda:0')

tensor(0.4367, device='cuda:0')

tensor(0.0707, device='cuda:0')

tensor(5.4578, device='cuda:0')

tensor(0.0475, device='cuda:0')

tensor(0.0470, device='cuda:0')

tensor(0.1172, device='cuda:0')

tensor(0.0426, device='cuda:0')

tensor(0.2378, device='cuda:0')

tensor(6.4390, device='cuda:0')

tensor(0.0586, device='cuda:0')

tensor(0.0138, device='cuda:0')

tensor(0.2851, device='cuda:0')

tensor(2.1640, device='cuda:0')

tensor(10.3994, device='cuda:0')

tensor(1.9911, device='cuda:0')

tensor(0.0622, device='cuda:0')

tensor(0.1132, device='cuda:0')

tensor(0.0170, device='cuda:0')

tensor(0.0549, device='cuda:0')

tensor(0.1017, device='cuda:0')

tensor(0.0314, device='cuda:0')

tensor(0.1364, device='cuda:0')

tensor(0.2136, device='cuda:0')

tensor(3.1066, device='cuda:0')

tensor(0.0121, device='cuda:0')

tensor(0.0627, device='cuda:0')

tensor(0.4320, device='cuda:0')

tensor(8.4589, device='cuda:0')

tensor(12.2032, device='cuda:0')

tensor(0.0264, device='cuda:0')

tensor(0.0415, device='cuda:0')

tensor(0.2282, device='cuda:0')

tensor(0.0356, device='cuda:0')

tensor(0.0511, device='cuda:0')

tensor(10.8378, device='cuda:0')

tensor(0.0482, device='cuda:0')

tensor(0.0044, device='cuda:0')

tensor(11.1369, device='cuda:0')

  Batch   280  of    983.    Elapsed: 0:05:37.


tensor(2.3721, device='cuda:0')

tensor(0.0688, device='cuda:0')

tensor(0.0609, device='cuda:0')

tensor(0.0067, device='cuda:0')

tensor(1.9274, device='cuda:0')

tensor(0.0584, device='cuda:0')

tensor(10.9862, device='cuda:0')

tensor(0.1770, device='cuda:0')

tensor(0.3863, device='cuda:0')

tensor(0.0172, device='cuda:0')

tensor(3.5339, device='cuda:0')

tensor(0.1122, device='cuda:0')

tensor(1.5838, device='cuda:0')

tensor(7.1125, device='cuda:0')

tensor(6.0181, device='cuda:0')

tensor(0.3254, device='cuda:0')

tensor(0.0708, device='cuda:0')

tensor(0.1189, device='cuda:0')

tensor(9.5069, device='cuda:0')

tensor(0.0496, device='cuda:0')

tensor(0.0105, device='cuda:0')

tensor(1.7476, device='cuda:0')

tensor(0.1117, device='cuda:0')

tensor(0.2097, device='cuda:0')

tensor(0.0270, device='cuda:0')

tensor(6.4130, device='cuda:0')

tensor(1.2739, device='cuda:0')

tensor(3.1006, device='cuda:0')

tensor(0.0458, device='cuda:0')

tensor(0.3111, device='cuda:0')

tensor(0.1516, device='cuda:0')

tensor(0.2614, device='cuda:0')

tensor(0.1328, device='cuda:0')

tensor(1.4185, device='cuda:0')

tensor(0.0573, device='cuda:0')

tensor(0.0309, device='cuda:0')

tensor(1.0305, device='cuda:0')

tensor(6.2482, device='cuda:0')

tensor(3.8125, device='cuda:0')

tensor(13.4014, device='cuda:0')

  Batch   320  of    983.    Elapsed: 0:06:26.


tensor(0.0696, device='cuda:0')

tensor(0.0234, device='cuda:0')

tensor(0.0171, device='cuda:0')

tensor(0.0179, device='cuda:0')

tensor(0.4973, device='cuda:0')

tensor(0.0710, device='cuda:0')

tensor(0.0312, device='cuda:0')

tensor(0.5915, device='cuda:0')

tensor(0.0356, device='cuda:0')

tensor(0.0100, device='cuda:0')

tensor(8.1064, device='cuda:0')

tensor(0.7393, device='cuda:0')

tensor(0.1659, device='cuda:0')

tensor(0.0239, device='cuda:0')

tensor(8.4441, device='cuda:0')

tensor(0.6052, device='cuda:0')

tensor(0.0242, device='cuda:0')

tensor(0.3062, device='cuda:0')

tensor(0.0183, device='cuda:0')

tensor(5.8499, device='cuda:0')

tensor(1.3567, device='cuda:0')

tensor(0.3680, device='cuda:0')

tensor(0.0933, device='cuda:0')

tensor(0.2500, device='cuda:0')

tensor(5.7498, device='cuda:0')

tensor(0.1011, device='cuda:0')

tensor(0.0034, device='cuda:0')

tensor(0.0597, device='cuda:0')

tensor(0.0505, device='cuda:0')

tensor(0.0335, device='cuda:0')

tensor(12.4578, device='cuda:0')

tensor(0.2944, device='cuda:0')

tensor(0.0895, device='cuda:0')

tensor(0.5065, device='cuda:0')

tensor(0.1356, device='cuda:0')

tensor(5.3438, device='cuda:0')

tensor(0.0538, device='cuda:0')

tensor(3.5331, device='cuda:0')

tensor(0.0327, device='cuda:0')

tensor(6.8450, device='cuda:0')

  Batch   360  of    983.    Elapsed: 0:07:14.


tensor(0.0336, device='cuda:0')

tensor(0.0630, device='cuda:0')

tensor(11.6972, device='cuda:0')

tensor(0.0701, device='cuda:0')

tensor(0.0064, device='cuda:0')

tensor(0.0420, device='cuda:0')

tensor(0.1462, device='cuda:0')

tensor(0.1302, device='cuda:0')

tensor(9.9198, device='cuda:0')

tensor(0.0104, device='cuda:0')

tensor(0.0131, device='cuda:0')

tensor(0.1219, device='cuda:0')

tensor(0.0440, device='cuda:0')

tensor(0.0118, device='cuda:0')

tensor(0.0376, device='cuda:0')

tensor(0.0236, device='cuda:0')

tensor(0.0327, device='cuda:0')

tensor(2.0419, device='cuda:0')

tensor(0.1041, device='cuda:0')

tensor(0.0486, device='cuda:0')

tensor(2.5303, device='cuda:0')

tensor(0.0107, device='cuda:0')

tensor(0.0114, device='cuda:0')

tensor(0.0496, device='cuda:0')

tensor(0.0587, device='cuda:0')

tensor(7.7718, device='cuda:0')

tensor(0.3108, device='cuda:0')

tensor(0.1343, device='cuda:0')

tensor(0.1258, device='cuda:0')

tensor(3.7707, device='cuda:0')

tensor(0.6122, device='cuda:0')

tensor(15.0906, device='cuda:0')

tensor(0.0222, device='cuda:0')

tensor(0.2077, device='cuda:0')

tensor(0.1824, device='cuda:0')

tensor(0.0323, device='cuda:0')

tensor(1.8757, device='cuda:0')

tensor(0.0120, device='cuda:0')

tensor(0.0253, device='cuda:0')

tensor(2.9258, device='cuda:0')

  Batch   400  of    983.    Elapsed: 0:08:02.


tensor(0.0843, device='cuda:0')

tensor(3.7193, device='cuda:0')

tensor(4.4504, device='cuda:0')

tensor(0.1236, device='cuda:0')

tensor(0.0977, device='cuda:0')

tensor(0.2142, device='cuda:0')

tensor(0.0105, device='cuda:0')

tensor(2.3177, device='cuda:0')

tensor(0.0344, device='cuda:0')

tensor(0.2694, device='cuda:0')

tensor(0.1481, device='cuda:0')

tensor(0.0560, device='cuda:0')

tensor(8.9376, device='cuda:0')

tensor(0.0464, device='cuda:0')

tensor(6.1628, device='cuda:0')

tensor(0.0087, device='cuda:0')

tensor(0.0175, device='cuda:0')

tensor(0.0396, device='cuda:0')

tensor(0.0157, device='cuda:0')

tensor(0.0324, device='cuda:0')

tensor(2.2216, device='cuda:0')

tensor(0.0920, device='cuda:0')

tensor(7.6657, device='cuda:0')

tensor(0.0405, device='cuda:0')

tensor(0.2226, device='cuda:0')

tensor(0.9155, device='cuda:0')

tensor(0.9358, device='cuda:0')

tensor(0.0542, device='cuda:0')

tensor(0.1201, device='cuda:0')

tensor(0.0088, device='cuda:0')

tensor(1.2684, device='cuda:0')

tensor(0.0490, device='cuda:0')

tensor(0.2021, device='cuda:0')

tensor(0.1030, device='cuda:0')

tensor(0.6171, device='cuda:0')

tensor(0.0708, device='cuda:0')

tensor(0.0522, device='cuda:0')

tensor(29.4678, device='cuda:0')

tensor(5.2693, device='cuda:0')

tensor(0.0877, device='cuda:0')

  Batch   440  of    983.    Elapsed: 0:08:50.


tensor(1.4626, device='cuda:0')

tensor(0.1481, device='cuda:0')

tensor(4.4358, device='cuda:0')

tensor(6.0420, device='cuda:0')

tensor(20.5042, device='cuda:0')

tensor(0.0315, device='cuda:0')

tensor(2.8397, device='cuda:0')

tensor(0.1395, device='cuda:0')

tensor(6.0534, device='cuda:0')

tensor(0.0128, device='cuda:0')

tensor(0.1185, device='cuda:0')

tensor(0.2566, device='cuda:0')

tensor(7.1436, device='cuda:0')

tensor(0.6633, device='cuda:0')

tensor(0.0536, device='cuda:0')

tensor(0.4155, device='cuda:0')

tensor(4.5951, device='cuda:0')

tensor(0.0055, device='cuda:0')

tensor(0.1433, device='cuda:0')

tensor(0.0770, device='cuda:0')

tensor(0.2586, device='cuda:0')

tensor(0.0262, device='cuda:0')

tensor(0.1085, device='cuda:0')

tensor(0.8562, device='cuda:0')

tensor(0.0157, device='cuda:0')

tensor(0.1976, device='cuda:0')

tensor(0.0215, device='cuda:0')

tensor(5.8218, device='cuda:0')

tensor(0.3498, device='cuda:0')

tensor(3.4766, device='cuda:0')

tensor(0.0906, device='cuda:0')

tensor(0.0478, device='cuda:0')

tensor(0.1394, device='cuda:0')

tensor(9.0695, device='cuda:0')

tensor(0.0052, device='cuda:0')

tensor(4.0946, device='cuda:0')

tensor(0.0433, device='cuda:0')

tensor(0.1043, device='cuda:0')

tensor(0.1971, device='cuda:0')

tensor(10.0524, device='cuda:0')

  Batch   480  of    983.    Elapsed: 0:09:38.


tensor(6.6622, device='cuda:0')

tensor(0.0929, device='cuda:0')

tensor(0.0619, device='cuda:0')

tensor(1.2695, device='cuda:0')

tensor(0.0861, device='cuda:0')

tensor(0.3010, device='cuda:0')

tensor(0.2870, device='cuda:0')

tensor(0.0186, device='cuda:0')

tensor(0.3821, device='cuda:0')

tensor(0.0320, device='cuda:0')

tensor(0.3069, device='cuda:0')

tensor(0.1758, device='cuda:0')

tensor(0.0928, device='cuda:0')

tensor(0.0396, device='cuda:0')

tensor(0.0733, device='cuda:0')

tensor(0.0936, device='cuda:0')

tensor(0.0185, device='cuda:0')

tensor(4.5108, device='cuda:0')

tensor(4.8893, device='cuda:0')

tensor(0.2485, device='cuda:0')

tensor(0.2875, device='cuda:0')

tensor(3.8803, device='cuda:0')

tensor(11.4536, device='cuda:0')

tensor(4.1770, device='cuda:0')

tensor(6.7572, device='cuda:0')

tensor(0.1022, device='cuda:0')

tensor(0.0279, device='cuda:0')

tensor(0.0378, device='cuda:0')

tensor(1.8500, device='cuda:0')

tensor(0.2756, device='cuda:0')

tensor(1.5106, device='cuda:0')

tensor(0.0372, device='cuda:0')

tensor(0.2634, device='cuda:0')

tensor(6.9432, device='cuda:0')

tensor(0.7942, device='cuda:0')

tensor(8.2064, device='cuda:0')

tensor(0.0148, device='cuda:0')

tensor(0.0582, device='cuda:0')

tensor(19.8312, device='cuda:0')

tensor(0.0248, device='cuda:0')

  Batch   520  of    983.    Elapsed: 0:10:26.


tensor(0.0630, device='cuda:0')

tensor(0.1282, device='cuda:0')

tensor(0.3322, device='cuda:0')

tensor(0.0225, device='cuda:0')

tensor(0.0669, device='cuda:0')

tensor(10.3453, device='cuda:0')

tensor(4.4888, device='cuda:0')

tensor(0.0307, device='cuda:0')

tensor(0.0572, device='cuda:0')

tensor(0.0247, device='cuda:0')

tensor(4.3332, device='cuda:0')

tensor(0.0381, device='cuda:0')

tensor(2.6448, device='cuda:0')

tensor(0.1059, device='cuda:0')

tensor(0.0382, device='cuda:0')

tensor(0.0174, device='cuda:0')

tensor(10.5026, device='cuda:0')

tensor(0.0577, device='cuda:0')

tensor(0.0101, device='cuda:0')

tensor(10.2613, device='cuda:0')

tensor(0.1327, device='cuda:0')

tensor(0.1458, device='cuda:0')

tensor(0.0288, device='cuda:0')

tensor(10.5729, device='cuda:0')

tensor(9.1993, device='cuda:0')

tensor(13.1935, device='cuda:0')

tensor(0.0245, device='cuda:0')

tensor(0.5391, device='cuda:0')

tensor(2.3970, device='cuda:0')

tensor(0.1163, device='cuda:0')

tensor(0.2172, device='cuda:0')

tensor(0.0235, device='cuda:0')

tensor(10.2812, device='cuda:0')

tensor(1.2750, device='cuda:0')

tensor(0.7326, device='cuda:0')

tensor(0.1254, device='cuda:0')

tensor(12.2786, device='cuda:0')

tensor(3.9045, device='cuda:0')

tensor(0.1023, device='cuda:0')

tensor(0.0164, device='cuda:0')

  Batch   560  of    983.    Elapsed: 0:11:15.


tensor(12.1131, device='cuda:0')

tensor(0.0916, device='cuda:0')

tensor(0.1366, device='cuda:0')

tensor(2.5508, device='cuda:0')

tensor(3.2032, device='cuda:0')

tensor(0.0349, device='cuda:0')

tensor(0.0852, device='cuda:0')

tensor(0.0174, device='cuda:0')

tensor(0.1162, device='cuda:0')

tensor(0.0348, device='cuda:0')

tensor(0.0920, device='cuda:0')

tensor(7.6784, device='cuda:0')

tensor(14.8990, device='cuda:0')

tensor(3.0393, device='cuda:0')

tensor(0.2434, device='cuda:0')

tensor(7.6271, device='cuda:0')

tensor(0.0376, device='cuda:0')

tensor(8.8051, device='cuda:0')

tensor(1.4992, device='cuda:0')

tensor(2.2257, device='cuda:0')

tensor(0.1042, device='cuda:0')

tensor(1.5356, device='cuda:0')

tensor(0.0247, device='cuda:0')

tensor(0.0499, device='cuda:0')

tensor(0.3781, device='cuda:0')

tensor(0.0302, device='cuda:0')

tensor(0.1249, device='cuda:0')

tensor(0.2024, device='cuda:0')

tensor(0.1542, device='cuda:0')

tensor(4.2644, device='cuda:0')

tensor(0.1347, device='cuda:0')

tensor(5.4881, device='cuda:0')

tensor(0.0487, device='cuda:0')

tensor(0.0482, device='cuda:0')

tensor(0.1211, device='cuda:0')

tensor(0.1235, device='cuda:0')

tensor(0.0890, device='cuda:0')

tensor(0.1270, device='cuda:0')

tensor(0.1388, device='cuda:0')

tensor(0.2502, device='cuda:0')

  Batch   600  of    983.    Elapsed: 0:12:03.


tensor(0.0497, device='cuda:0')

tensor(1.2398, device='cuda:0')

tensor(0.1661, device='cuda:0')

tensor(0.3161, device='cuda:0')

tensor(0.1066, device='cuda:0')

tensor(0.2458, device='cuda:0')

tensor(0.0420, device='cuda:0')

tensor(9.2708, device='cuda:0')

tensor(0.0743, device='cuda:0')

tensor(0.3801, device='cuda:0')

tensor(0.0981, device='cuda:0')

tensor(0.0758, device='cuda:0')

tensor(0.3657, device='cuda:0')

tensor(0.2273, device='cuda:0')

tensor(12.4772, device='cuda:0')

tensor(0.0558, device='cuda:0')

tensor(0.1512, device='cuda:0')

tensor(17.4604, device='cuda:0')

tensor(1.0494, device='cuda:0')

tensor(0.1163, device='cuda:0')

tensor(0.0555, device='cuda:0')

tensor(10.7337, device='cuda:0')

tensor(4.8967, device='cuda:0')

tensor(0.9731, device='cuda:0')

tensor(0.2741, device='cuda:0')

tensor(0.1396, device='cuda:0')

tensor(5.0262, device='cuda:0')

tensor(0.0324, device='cuda:0')

tensor(0.0125, device='cuda:0')

tensor(4.3465, device='cuda:0')

tensor(0.0463, device='cuda:0')

tensor(3.3855, device='cuda:0')

tensor(0.4408, device='cuda:0')

tensor(0.0442, device='cuda:0')

tensor(0.0620, device='cuda:0')

tensor(0.5051, device='cuda:0')

tensor(0.0297, device='cuda:0')

tensor(0.4885, device='cuda:0')

tensor(0.0081, device='cuda:0')

tensor(0.0628, device='cuda:0')

  Batch   640  of    983.    Elapsed: 0:12:51.


tensor(0.0541, device='cuda:0')

tensor(0.0314, device='cuda:0')

tensor(0.0253, device='cuda:0')

tensor(0.0129, device='cuda:0')

tensor(0.0533, device='cuda:0')

tensor(0.1025, device='cuda:0')

tensor(0.0625, device='cuda:0')

tensor(0.0106, device='cuda:0')

tensor(0.8008, device='cuda:0')

tensor(0.7629, device='cuda:0')

tensor(1.6508, device='cuda:0')

tensor(0.0592, device='cuda:0')

tensor(0.0559, device='cuda:0')

tensor(0.0588, device='cuda:0')

tensor(5.7592, device='cuda:0')

tensor(0.0571, device='cuda:0')

tensor(0.1959, device='cuda:0')

tensor(0.0823, device='cuda:0')

tensor(0.0045, device='cuda:0')

tensor(0.0102, device='cuda:0')

tensor(0.0198, device='cuda:0')

tensor(5.0783, device='cuda:0')

tensor(0.0146, device='cuda:0')

tensor(0.2498, device='cuda:0')

tensor(0.1165, device='cuda:0')

tensor(0.0138, device='cuda:0')

tensor(0.0491, device='cuda:0')

tensor(9.6081, device='cuda:0')

tensor(10.0221, device='cuda:0')

tensor(6.7293, device='cuda:0')

tensor(0.0238, device='cuda:0')

tensor(0.0630, device='cuda:0')

tensor(0.0256, device='cuda:0')

tensor(0.0782, device='cuda:0')

tensor(0.2151, device='cuda:0')

tensor(0.4844, device='cuda:0')

tensor(0.1921, device='cuda:0')

tensor(0.0086, device='cuda:0')

tensor(0.0441, device='cuda:0')

tensor(7.7801, device='cuda:0')

  Batch   680  of    983.    Elapsed: 0:13:39.


tensor(5.4468, device='cuda:0')

tensor(0.0788, device='cuda:0')

tensor(0.0488, device='cuda:0')

tensor(0.0336, device='cuda:0')

tensor(0.1479, device='cuda:0')

tensor(0.0575, device='cuda:0')

tensor(0.0100, device='cuda:0')

tensor(0.0215, device='cuda:0')

tensor(0.0792, device='cuda:0')

tensor(0.0132, device='cuda:0')

tensor(4.2009, device='cuda:0')

tensor(0.1188, device='cuda:0')

tensor(0.0168, device='cuda:0')

tensor(4.6901, device='cuda:0')

tensor(0.0598, device='cuda:0')

tensor(2.2893, device='cuda:0')

tensor(0.1303, device='cuda:0')

tensor(0.0570, device='cuda:0')

tensor(0.2357, device='cuda:0')

tensor(0.3886, device='cuda:0')

tensor(0.0304, device='cuda:0')

tensor(0.0161, device='cuda:0')

tensor(0.4149, device='cuda:0')

tensor(0.0140, device='cuda:0')

tensor(0.0251, device='cuda:0')

tensor(0.1078, device='cuda:0')

tensor(3.5029, device='cuda:0')

tensor(0.9851, device='cuda:0')

tensor(0.1220, device='cuda:0')

tensor(0.0849, device='cuda:0')

tensor(5.6643, device='cuda:0')

tensor(0.0219, device='cuda:0')

tensor(4.9330, device='cuda:0')

tensor(4.8123, device='cuda:0')

tensor(13.0051, device='cuda:0')

tensor(0.1520, device='cuda:0')

tensor(0.1039, device='cuda:0')

tensor(6.7085, device='cuda:0')

tensor(0.1185, device='cuda:0')

tensor(0.0409, device='cuda:0')

  Batch   720  of    983.    Elapsed: 0:14:27.


tensor(0.0134, device='cuda:0')

tensor(0.0115, device='cuda:0')

tensor(6.9230, device='cuda:0')

tensor(0.2916, device='cuda:0')

tensor(5.1284, device='cuda:0')

tensor(0.1036, device='cuda:0')

tensor(5.2736, device='cuda:0')

tensor(0.0203, device='cuda:0')

tensor(0.0834, device='cuda:0')

tensor(0.0271, device='cuda:0')

tensor(5.1099, device='cuda:0')

tensor(3.5030, device='cuda:0')

tensor(0.0114, device='cuda:0')

tensor(11.1214, device='cuda:0')

tensor(0.0685, device='cuda:0')

tensor(11.9144, device='cuda:0')

tensor(0.0897, device='cuda:0')

tensor(0.0183, device='cuda:0')

tensor(0.1481, device='cuda:0')

tensor(0.0264, device='cuda:0')

tensor(0.0081, device='cuda:0')

tensor(0.0635, device='cuda:0')

tensor(0.0395, device='cuda:0')

tensor(1.1868, device='cuda:0')

tensor(0.1096, device='cuda:0')

tensor(0.2476, device='cuda:0')

tensor(6.4859, device='cuda:0')

tensor(0.0040, device='cuda:0')

tensor(0.0528, device='cuda:0')

tensor(0.0365, device='cuda:0')

tensor(0.0874, device='cuda:0')

tensor(0.0442, device='cuda:0')

tensor(32.9775, device='cuda:0')

tensor(0.2858, device='cuda:0')

tensor(0.3618, device='cuda:0')

tensor(11.0683, device='cuda:0')

tensor(7.5232, device='cuda:0')

tensor(2.5059, device='cuda:0')

tensor(2.5574, device='cuda:0')

tensor(7.3587, device='cuda:0')

  Batch   760  of    983.    Elapsed: 0:15:16.


tensor(0.0077, device='cuda:0')

tensor(0.1053, device='cuda:0')

tensor(0.0565, device='cuda:0')

tensor(0.0114, device='cuda:0')

tensor(0.0764, device='cuda:0')

tensor(0.0061, device='cuda:0')

tensor(0.0950, device='cuda:0')

tensor(0.0084, device='cuda:0')

tensor(0.0303, device='cuda:0')

tensor(18.6337, device='cuda:0')

tensor(0.0317, device='cuda:0')

tensor(0.0083, device='cuda:0')

tensor(6.1891, device='cuda:0')

tensor(0.0037, device='cuda:0')

tensor(0.0944, device='cuda:0')

tensor(0.6881, device='cuda:0')

tensor(5.5349, device='cuda:0')

tensor(0.3993, device='cuda:0')

tensor(4.3705, device='cuda:0')

tensor(0.0188, device='cuda:0')

tensor(5.9993, device='cuda:0')

tensor(0.2101, device='cuda:0')

tensor(5.4999, device='cuda:0')

tensor(3.4014, device='cuda:0')

tensor(9.4717, device='cuda:0')

tensor(1.4336, device='cuda:0')

tensor(2.6207, device='cuda:0')

tensor(0.0303, device='cuda:0')

tensor(0.0380, device='cuda:0')

tensor(0.0468, device='cuda:0')

tensor(0.0272, device='cuda:0')

tensor(0.0253, device='cuda:0')

tensor(0.0107, device='cuda:0')

tensor(0.0149, device='cuda:0')

tensor(0.1638, device='cuda:0')

tensor(0.1406, device='cuda:0')

tensor(0.2804, device='cuda:0')

tensor(4.8563, device='cuda:0')

tensor(6.2766, device='cuda:0')

tensor(3.9155, device='cuda:0')

  Batch   800  of    983.    Elapsed: 0:16:04.


tensor(0.0124, device='cuda:0')

tensor(10.1255, device='cuda:0')

tensor(0.4201, device='cuda:0')

tensor(0.1261, device='cuda:0')

tensor(7.2207, device='cuda:0')

tensor(2.0140, device='cuda:0')

tensor(0.0385, device='cuda:0')

tensor(0.0046, device='cuda:0')

tensor(0.4422, device='cuda:0')

tensor(0.0260, device='cuda:0')

tensor(0.3263, device='cuda:0')

tensor(0.1102, device='cuda:0')

tensor(0.1465, device='cuda:0')

tensor(0.1500, device='cuda:0')

tensor(14.7336, device='cuda:0')

tensor(5.3441, device='cuda:0')

tensor(3.7614, device='cuda:0')

tensor(0.8521, device='cuda:0')

tensor(0.0333, device='cuda:0')

tensor(0.0041, device='cuda:0')

tensor(0.0153, device='cuda:0')

tensor(0.0532, device='cuda:0')

tensor(2.1012, device='cuda:0')

tensor(0.3115, device='cuda:0')

tensor(0.0157, device='cuda:0')

tensor(0.0343, device='cuda:0')

tensor(0.0611, device='cuda:0')

tensor(0.0797, device='cuda:0')

tensor(8.8606, device='cuda:0')

tensor(0.0074, device='cuda:0')

tensor(0.1098, device='cuda:0')

tensor(3.3905, device='cuda:0')

tensor(2.9226, device='cuda:0')

tensor(2.4448, device='cuda:0')

tensor(0.0309, device='cuda:0')

tensor(7.5969, device='cuda:0')

tensor(10.5638, device='cuda:0')

tensor(0.0288, device='cuda:0')

tensor(0.0262, device='cuda:0')

tensor(0.3583, device='cuda:0')

  Batch   840  of    983.    Elapsed: 0:16:52.


tensor(0.0347, device='cuda:0')

tensor(0.8574, device='cuda:0')

tensor(0.0119, device='cuda:0')

tensor(0.8329, device='cuda:0')

tensor(0.0154, device='cuda:0')

tensor(0.0346, device='cuda:0')

tensor(0.0931, device='cuda:0')

tensor(0.0368, device='cuda:0')

tensor(0.0467, device='cuda:0')

tensor(0.0270, device='cuda:0')

tensor(5.0393, device='cuda:0')

tensor(0.2114, device='cuda:0')

tensor(14.7312, device='cuda:0')

tensor(0.0189, device='cuda:0')

tensor(0.0131, device='cuda:0')

tensor(0.0384, device='cuda:0')

tensor(0.3231, device='cuda:0')

tensor(7.4320, device='cuda:0')

tensor(0.0480, device='cuda:0')

tensor(11.9362, device='cuda:0')

tensor(5.4209, device='cuda:0')

tensor(0.0717, device='cuda:0')

tensor(0.0340, device='cuda:0')

tensor(6.5375, device='cuda:0')

tensor(3.0932, device='cuda:0')

tensor(0.2091, device='cuda:0')

tensor(0.0300, device='cuda:0')

tensor(0.3041, device='cuda:0')

tensor(0.0053, device='cuda:0')

tensor(0.0130, device='cuda:0')

tensor(8.2609, device='cuda:0')

tensor(0.0471, device='cuda:0')

tensor(0.2427, device='cuda:0')

tensor(0.0944, device='cuda:0')

tensor(0.0594, device='cuda:0')

tensor(7.9633, device='cuda:0')

tensor(9.9301, device='cuda:0')

tensor(0.1266, device='cuda:0')

tensor(11.1870, device='cuda:0')

tensor(0.1810, device='cuda:0')

  Batch   880  of    983.    Elapsed: 0:17:40.


tensor(3.6352, device='cuda:0')

tensor(5.0664, device='cuda:0')

tensor(7.4669, device='cuda:0')

tensor(0.0059, device='cuda:0')

tensor(2.6005, device='cuda:0')

tensor(0.1085, device='cuda:0')

tensor(0.0278, device='cuda:0')

tensor(0.0292, device='cuda:0')

tensor(0.1691, device='cuda:0')

tensor(0.0259, device='cuda:0')

tensor(16.1090, device='cuda:0')

tensor(0.0387, device='cuda:0')

tensor(9.7420, device='cuda:0')

tensor(0.0527, device='cuda:0')

tensor(8.5617, device='cuda:0')

tensor(12.1644, device='cuda:0')

tensor(0.0598, device='cuda:0')

tensor(0.0087, device='cuda:0')

tensor(0.1053, device='cuda:0')

tensor(4.9268, device='cuda:0')

tensor(6.1823, device='cuda:0')

tensor(0.5207, device='cuda:0')

tensor(0.0769, device='cuda:0')

tensor(0.1896, device='cuda:0')

tensor(0.1268, device='cuda:0')

tensor(0.0091, device='cuda:0')

tensor(1.0778, device='cuda:0')

tensor(0.0800, device='cuda:0')

tensor(4.1686, device='cuda:0')

tensor(0.2041, device='cuda:0')

tensor(2.6071, device='cuda:0')

tensor(0.2300, device='cuda:0')

tensor(0.0104, device='cuda:0')

tensor(0.1055, device='cuda:0')

tensor(0.1967, device='cuda:0')

tensor(0.0117, device='cuda:0')

tensor(0.0036, device='cuda:0')

tensor(0.0323, device='cuda:0')

tensor(10.2380, device='cuda:0')

tensor(0.0356, device='cuda:0')

  Batch   920  of    983.    Elapsed: 0:18:28.


tensor(0.0243, device='cuda:0')

tensor(2.9946, device='cuda:0')

tensor(0.1257, device='cuda:0')

tensor(0.1486, device='cuda:0')

tensor(0.0344, device='cuda:0')

tensor(0.0104, device='cuda:0')

tensor(0.0108, device='cuda:0')

tensor(6.6618, device='cuda:0')

tensor(0.2322, device='cuda:0')

tensor(7.3170, device='cuda:0')

tensor(1.9290, device='cuda:0')

tensor(0.1473, device='cuda:0')

tensor(0.1069, device='cuda:0')

tensor(16.3981, device='cuda:0')

tensor(0.0308, device='cuda:0')

tensor(8.6226, device='cuda:0')

tensor(0.1880, device='cuda:0')

tensor(0.0999, device='cuda:0')

tensor(0.2932, device='cuda:0')

tensor(9.0904, device='cuda:0')

tensor(0.0638, device='cuda:0')

tensor(0.0624, device='cuda:0')

tensor(0.0396, device='cuda:0')

tensor(0.0526, device='cuda:0')

tensor(0.1312, device='cuda:0')

tensor(2.5078, device='cuda:0')

tensor(10.8958, device='cuda:0')

tensor(0.0498, device='cuda:0')

tensor(0.0210, device='cuda:0')

tensor(0.1073, device='cuda:0')

tensor(0.0404, device='cuda:0')

tensor(5.1637, device='cuda:0')

tensor(9.4148, device='cuda:0')

tensor(0.1225, device='cuda:0')

tensor(0.0348, device='cuda:0')

tensor(0.0541, device='cuda:0')

tensor(0.2502, device='cuda:0')

tensor(0.5473, device='cuda:0')

tensor(5.2722, device='cuda:0')

tensor(0.0510, device='cuda:0')

  Batch   960  of    983.    Elapsed: 0:19:16.


tensor(0.0103, device='cuda:0')

tensor(0.0813, device='cuda:0')

tensor(4.3818, device='cuda:0')

tensor(0.1677, device='cuda:0')

tensor(0.0253, device='cuda:0')

tensor(0.0585, device='cuda:0')

tensor(0.0440, device='cuda:0')

tensor(4.9140, device='cuda:0')

tensor(1.1813, device='cuda:0')

tensor(0.3989, device='cuda:0')

tensor(0.0112, device='cuda:0')

tensor(8.1808, device='cuda:0')

tensor(10.6079, device='cuda:0')

tensor(7.8810, device='cuda:0')

tensor(0.2018, device='cuda:0')

tensor(0.0432, device='cuda:0')

tensor(0.5060, device='cuda:0')

tensor(0.0546, device='cuda:0')

tensor(1.4409, device='cuda:0')

tensor(0.0340, device='cuda:0')

tensor(0.0597, device='cuda:0')

tensor(0.1921, device='cuda:0')

tensor(0.0297, device='cuda:0')


  Average training loss: 0.03
  Training epcoh took: 0:19:43

Running Validation...


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

  Accuracy: 0.98
  Validation Loss: 0.06
  Validation took: 0:02:01

Training complete!
Total training took 0:43:28 (h:mm:ss)
