In [1]:
! python -V

Python 3.9.7


In [6]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 626 kB/s 
Collecting filelock
  Downloading filelock-3.3.2-py3-none-any.whl (9.7 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (661 kB)
[K     |████████████████████████████████| 661 kB 9.8 MB/s 
[?25hCollecting regex!=2019.12.17
  Downloading regex-2021.11.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (762 kB)
[K     |████████████████████████████████| 762 kB 9.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.0-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 4.7 MB/s 
[?25hCollecting requests
  Downloading requests-2.26.0-py2.py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 712 kB/s 
[?25hCollecting tqdm>=4.27
  Downloading tqdm-4.62.3-py2.py3-none-a

In [1]:
import random 
import time
import copy

import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertModel, AdamW
from transformers import RobertaTokenizer, RobertaModel

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from torch.utils.data import Dataset, DataLoader
from word_aug import get_aug_dataset


[nltk_data] Downloading package punkt to /home/andrew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
torch.cuda.is_available()

True

In [3]:
#CHECKPOINT = "sberbank-ai/sbert_large_nlu_ru"
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-05
#tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)

def get_individual_labels(df):
    labels = pd.get_dummies(df.label).rename({
        "ДЖОУИ": "Joey", "МОНИКА": "Monica", "РЕЙЧЕЛ": "Rachel", "РОСС": "Ross", 
        "ФИБИ": "Phoebe", "ЧЕНДЛЕР": "Chandler"
    }, axis=1)
    return pd.concat([df, labels], axis=1)

In [4]:
%cd friends-classification/
!mkdir models
! ls

/home/andrew/ml/friends-classification
mkdir: cannot create directory ‘models’: File exists
english
example_eng_ru.csv
fb_model_translate_en_ru_2_friend_response_test.csv
fb_model_translate_en_ru_2_friend_response_train.csv
fb_model_translate_en_ru_2_friend_response_val.csv
fb_model_translate_en_ru_2_other_speaker_test.csv
fb_model_translate_en_ru_2_other_speaker_train.csv
fb_model_translate_en_ru_2_other_speaker_val.csv
fb_model_translate_ru_en_1_friend_response_test.csv
fb_model_translate_ru_en_1_friend_response_train.csv
fb_model_translate_ru_en_1_friend_response_val.csv
fb_model_translate_ru_en_1_other_speaker_test.csv
fb_model_translate_ru_en_1_other_speaker_train.csv
fb_model_translate_ru_en_1_other_speaker_val.csv
final_model.pt
final_submission.csv
helsinki_model_translate_ru_en_1_friend_response_test.csv
helsinki_model_translate_ru_en_1_friend_response_train.csv
helsinki_model_translate_ru_en_1_friend_response_val.csv
helsinki_model_translate_ru_en_1_other_speaker_test.csv
hel

In [5]:
df_train = pd.read_csv('english/df_train_eng.csv').rename({'Category': 'label'}, axis=1)
df_train.other_speaker.fillna('', inplace=True)
df_train.friend_response.fillna('', inplace=True)
df_val = pd.read_csv('english/df_val_eng.csv')
df_val.other_speaker.fillna('', inplace=True)
df_val.friend_response.fillna('', inplace=True)
df_test = pd.read_csv('english/df_test_eng.csv')
df_test.other_speaker.fillna('', inplace=True)
df_test.friend_response.fillna('', inplace=True)

df_train = get_individual_labels(df_train)
df_val = get_individual_labels(df_val)

# Encoding target variable
names_to_cats = LabelEncoder()
df_train['label_code'] = names_to_cats.fit_transform(df_train.label)
df_val['label_code'] = names_to_cats.transform(df_val.label)
df_fb_train = pd.read_csv('train_data_rus_fb_model.csv')
df_full = pd.concat([df_train, df_val])
print(df_train["label"].value_counts()/df_train.shape[0])
print()
print(df_val["label"].value_counts()/df_val.shape[0])

РОСС       0.176569
РЕЙЧЕЛ     0.176089
ЧЕНДЛЕР    0.170568
ДЖОУИ      0.166287
МОНИКА     0.160525
ФИБИ       0.149962
Name: label, dtype: float64

РОСС       0.176746
РЕЙЧЕЛ     0.176026
ЧЕНДЛЕР    0.170626
ДЖОУИ      0.166307
МОНИКА     0.160547
ФИБИ       0.149748
Name: label, dtype: float64


In [6]:
class FriendsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512, padding='max_length', 
                 with_labels=True):

        self.dataframe = dataframe  # pandas dataframe
        #Initialize the tokenizer
        self.tokenizer = tokenizer  
        self.padding = padding
        self.max_length = max_length
        
        self.with_labels = with_labels 
        if 'label' not in self.dataframe.columns:
          self.with_labels = False

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        sent1 = self.dataframe.other_speaker.iloc[index]
        sent2 = self.dataframe.friend_response.iloc[index]

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2, 
                                      padding=self.padding,  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.max_length,  
                                      return_tensors='pt')  # Return torch.Tensor objects
        
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.dataframe.label_code.iloc[index]
            #label = self.dataframe.Phoebe.iloc[index]
            return token_ids, attn_masks, token_type_ids, label  
        else:
            return token_ids, attn_masks, token_type_ids

In [7]:
class SentencePairClassifier(nn.Module):

    def __init__(self, model=CHECKPOINT, freeze_model=True):
        super(SentencePairClassifier, self).__init__()
        #  Instantiating BERT-based model object
        # self.pretrained_layer = AutoModel.from_pretrained(CHECKPOINT)
        self.pretrained_layer = BertModel.from_pretrained(CHECKPOINT)
        #self.pretrained_layer = RobertaModel.from_pretrained(CHECKPOINT)

        hidden_size = self.pretrained_layer.config.hidden_size

        # Freeze model layers and only train the classification layer weights
        if freeze_model:
            for p in self.pretrained_layer.parameters():
                p.requires_grad = False
            print('All parameters frozen')
        # Classification layer
        self.cls_layer = nn.Linear(hidden_size, 6)

        self.dropout = nn.Dropout(p=0.3)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations
        output = self.pretrained_layer(input_ids, attn_masks, token_type_ids)

        # Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
        # Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
        # objective during pre-training.
        logits = self.cls_layer(self.dropout(output.pooler_output))

        return logits

In [8]:
def set_seed(seed):
    """ Set all seeds to make results reproducible """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    #os.environ['PYTHONHASHSEED'] = str(seed)
    
@autocast()
def evaluate_loss(net, device, criterion, dataloader):
    net.eval()
    n_correct = 0
    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels).item()
            count += 1
            max_logits, argmax_idx = torch.max(logits.data, dim=1)
            n_correct += calcuate_accu(argmax_idx, labels)
    del logits
    return mean_loss / count, n_correct / len(dataloader.dataset)
  
# Function to calcuate the accuracy of the model
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [9]:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):

    best_loss = np.Inf
    best_acc = 0
    best_ep = 1
    n_iterations = len(train_loader)
    batch_size = train_loader.batch_size
    print_every = 1000 // batch_size  # print the training loss this many times per epoch
    print_eval_iters = 10000 // batch_size
    scaler = GradScaler()

    for ep in range(epochs):
        net.train()
        curr_loss = 0.0
        curr_n_correct = 0.
        trailing_loss = 0.
        trailing_n_correct = 0.
        curr_n_tr_examples = 0
        trainling_n_tr_examples = 0

        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
  
            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                # Obtaining the logits from the model
                pooled = net(seq, attn_masks, token_type_ids)

                # Computing loss
                loss = criterion(pooled.squeeze(-1), labels)
                #print(loss, type(loss))
                loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged
                # Computing accuracy
                #print(pooled.squeeze(-1), labels)
                curr_loss += loss.item() 
                big_val, big_idx = torch.max(pooled.data, dim=1)
                n_correct = calcuate_accu(big_idx, labels)
                curr_n_correct += n_correct

            trailing_loss += loss.item() 
            trailing_n_correct += n_correct
            curr_n_tr_examples += labels.size(0)
            trainling_n_tr_examples += labels.size(0)

            # Backpropagating the gradients
            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # Optimization step
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Batch {}/{} of epoch {} complete. Loss per last {} samples:: {} "
                      .format(it+1, n_iterations, ep+1, curr_n_tr_examples, curr_loss / print_every))
                accu_step = (curr_n_correct*100) / curr_n_tr_examples 
                #print(f"Training Loss per 5000 steps: {loss_step}")
                print(f"Training Accuracy per last {curr_n_tr_examples} samples: {accu_step}")
                curr_loss = 0.0
                curr_n_tr_examples = 0
                curr_n_correct = 0


            if (it + 1) % print_eval_iters == 0 or it ==  n_iterations - 1:
                del pooled, loss
                print("Epoch {}, batch {} complete! Training Loss : {}"
                .format(ep+1, it+1, trailing_loss / (it+1)))
                print("Epoch {}, batch {} complete! Training Accuracy : {}"
                .format(ep+1, it+1, trailing_n_correct / trainling_n_tr_examples))
                with autocast():
                    val_loss, val_accuracy = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
                #print()
                print("Epoch {}, batch {} complete! Validation Loss : {}".format(ep+1, it+1, val_loss))
                print("Epoch {}, batch {} complete! Validation Accuracy : {}".format(ep+1, it+1,val_accuracy))
                net.train()
                #if val_loss < best_loss:
                if val_accuracy > best_acc:
                    print("Validation loss changed from {} to {}".format(best_loss, val_loss))
                    print("Best validation accuracy improved from {} to {}".format(best_acc, val_accuracy))
                    print()
                    #net_copy = copy.deepcopy(net)  # save a copy of the model
                    best_loss = val_loss
                    best_acc = val_accuracy
                    best_ep = ep + 1
                    # Saving the model
                    path_to_model='models/{}_lr_{}_val_acc_{}_ep_{}.pt'.format(time.ctime(), lr, round(best_acc, 4), best_ep)
                    torch.save(net.state_dict(), path_to_model)
                    print("The model has been saved in {}".format(path_to_model))

    torch.cuda.empty_cache()

In [10]:
from transformers import get_linear_schedule_with_warmup
from transformers import get_constant_schedule
from sklearn.utils import compute_class_weight
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
#train_set = FriendsDataset(dataframe=df_train, tokenizer=tokenizer, max_length=MAX_LEN)
train_set = FriendsDataset(dataframe=df_full, tokenizer=tokenizer, max_length=MAX_LEN)

print("Reading validation data...")
val_set = FriendsDataset(dataframe=df_val, tokenizer=tokenizer, max_length=MAX_LEN)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_set, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=2)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(model=CHECKPOINT, freeze_model=False)
print(device)

if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

"""class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(df_train.Phoebe), y=df_train.Phoebe)
class_weights = torch.tensor(class_weights, dtype=torch.float)
class_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)"""
criterion = nn.CrossEntropyLoss()

opti = AdamW(net.parameters(), lr=LEARNING_RATE, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
iters_to_accumulate = 2
num_training_steps = EPOCHS * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * EPOCHS  # Necessary to take into account Gradient accumulation
#lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)
lr_scheduler = get_constant_schedule(optimizer=opti)


Reading training data...
Reading validation data...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cuda:0


In [11]:
# Training on all availible data
train_bert(net, criterion, opti, LEARNING_RATE, lr_scheduler, train_loader, val_loader, EPOCHS, iters_to_accumulate)

  3%|▎         | 57/1736 [00:39<19:24,  1.44it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  4%|▎         | 62/1736 [00:43<19:23,  1.44it/s]


Batch 62/1736 of epoch 1 complete. Loss per last 992 samples:: 0.9178887644121724 
Training Accuracy per last 992 samples: 17.842741935483872


  7%|▋         | 124/1736 [01:26<18:56,  1.42it/s]


Batch 124/1736 of epoch 1 complete. Loss per last 992 samples:: 0.9073033486643145 
Training Accuracy per last 992 samples: 16.129032258064516


 11%|█         | 186/1736 [02:09<18:00,  1.43it/s]


Batch 186/1736 of epoch 1 complete. Loss per last 992 samples:: 0.9004467379662299 
Training Accuracy per last 992 samples: 20.56451612903226


 14%|█▍        | 248/1736 [02:53<17:16,  1.44it/s]


Batch 248/1736 of epoch 1 complete. Loss per last 992 samples:: 0.9012170607043851 
Training Accuracy per last 992 samples: 19.052419354838708


 18%|█▊        | 310/1736 [03:36<16:49,  1.41it/s]


Batch 310/1736 of epoch 1 complete. Loss per last 992 samples:: 0.9055953487273185 
Training Accuracy per last 992 samples: 17.036290322580644


 21%|██▏       | 372/1736 [04:19<15:42,  1.45it/s]


Batch 372/1736 of epoch 1 complete. Loss per last 992 samples:: 0.9012470860635081 
Training Accuracy per last 992 samples: 18.14516129032258


 25%|██▌       | 434/1736 [05:03<15:12,  1.43it/s]


Batch 434/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8970258159022177 
Training Accuracy per last 992 samples: 19.556451612903224


 29%|██▊       | 496/1736 [05:46<14:25,  1.43it/s]


Batch 496/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8920076431766633 
Training Accuracy per last 992 samples: 19.858870967741936


 32%|███▏      | 558/1736 [06:30<13:38,  1.44it/s]


Batch 558/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8923851751512096 
Training Accuracy per last 992 samples: 18.548387096774192


 35%|███▍      | 599/1736 [06:58<13:04,  1.45it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 36%|███▌      | 620/1736 [07:13<13:05,  1.42it/s]


Batch 620/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8952678557365171 
Training Accuracy per last 992 samples: 19.052419354838708


 36%|███▌      | 624/1736 [07:16<13:02,  1.42it/s]

Epoch 1, batch 625 complete! Training Loss : 0.90090078125
Epoch 1, batch 625 complete! Training Accuracy : 0.186


100%|██████████| 174/174 [00:42<00:00,  4.12it/s]


Epoch 1, batch 625 complete! Validation Loss : 1.7678532744276112
Epoch 1, batch 625 complete! Validation Accuracy : 0.2361411087113031
Validation loss changed from inf to 1.7678532744276112
Best validation accuracy improved from 0 to 0.2361411087113031



 36%|███▌      | 625/1736 [07:59<4:11:52, 13.60s/it]

The model has been saved in models/Thu Nov 25 01:41:25 2021_lr_2e-05_val_acc_0.2361_ep_1.pt


 39%|███▉      | 682/1736 [08:39<11:48,  1.49it/s]


Batch 682/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8871952179939516 
Training Accuracy per last 992 samples: 19.95967741935484


 39%|███▉      | 683/1736 [08:40<11:54,  1.47it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 43%|████▎     | 744/1736 [09:19<10:26,  1.58it/s]


Batch 744/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8869520618069556 
Training Accuracy per last 992 samples: 21.27016129032258


 46%|████▋     | 806/1736 [10:00<10:13,  1.52it/s]


Batch 806/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8788038684475806 
Training Accuracy per last 992 samples: 24.39516129032258


 50%|█████     | 868/1736 [10:40<09:48,  1.47it/s]


Batch 868/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8805738879788306 
Training Accuracy per last 992 samples: 22.47983870967742


 54%|█████▎    | 930/1736 [11:20<08:27,  1.59it/s]


Batch 930/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8839281143680695 
Training Accuracy per last 992 samples: 21.975806451612904


 57%|█████▋    | 992/1736 [11:59<07:52,  1.57it/s]


Batch 992/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8799002862745716 
Training Accuracy per last 992 samples: 22.883064516129032


 61%|██████    | 1054/1736 [12:38<07:10,  1.58it/s]


Batch 1054/1736 of epoch 1 complete. Loss per last 992 samples:: 0.867401369156376 
Training Accuracy per last 992 samples: 23.891129032258064


 64%|██████▍   | 1116/1736 [13:19<06:30,  1.59it/s]


Batch 1116/1736 of epoch 1 complete. Loss per last 992 samples:: 0.861328371109501 
Training Accuracy per last 992 samples: 26.20967741935484


 68%|██████▊   | 1178/1736 [13:58<05:52,  1.58it/s]


Batch 1178/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8709022768082157 
Training Accuracy per last 992 samples: 24.798387096774192


 71%|███████▏  | 1240/1736 [14:39<05:45,  1.44it/s]


Batch 1240/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8571747810609879 
Training Accuracy per last 992 samples: 27.318548387096776


 72%|███████▏  | 1249/1736 [14:45<05:36,  1.45it/s]

Epoch 1, batch 1250 complete! Training Loss : 0.8880179809570312
Epoch 1, batch 1250 complete! Training Accuracy : 0.2109


100%|██████████| 174/174 [00:39<00:00,  4.41it/s]


Epoch 1, batch 1250 complete! Validation Loss : 1.6848887828574783
Epoch 1, batch 1250 complete! Validation Accuracy : 0.28725701943844495
Validation loss changed from 1.7678532744276112 to 1.6848887828574783
Best validation accuracy improved from 0.2361411087113031 to 0.28725701943844495



 72%|███████▏  | 1250/1736 [15:26<1:43:34, 12.79s/it]

The model has been saved in models/Thu Nov 25 01:48:52 2021_lr_2e-05_val_acc_0.2873_ep_1.pt


 75%|███████▌  | 1302/1736 [16:00<04:35,  1.57it/s]


Batch 1302/1736 of epoch 1 complete. Loss per last 992 samples:: 0.860778316374748 
Training Accuracy per last 992 samples: 27.116935483870968


 79%|███████▊  | 1364/1736 [16:39<03:54,  1.59it/s]


Batch 1364/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8420511061145414 
Training Accuracy per last 992 samples: 28.830645161290324


 80%|███████▉  | 1385/1736 [16:52<03:39,  1.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 82%|████████▏ | 1426/1736 [17:17<03:15,  1.59it/s]


Batch 1426/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8563557286416331 
Training Accuracy per last 992 samples: 27.116935483870968


 86%|████████▌ | 1488/1736 [17:56<02:36,  1.59it/s]


Batch 1488/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8507258507513231 
Training Accuracy per last 992 samples: 28.326612903225808


 89%|████████▉ | 1550/1736 [18:35<01:57,  1.59it/s]


Batch 1550/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8475580523090978 
Training Accuracy per last 992 samples: 29.737903225806452


 93%|█████████▎| 1612/1736 [19:14<01:17,  1.59it/s]


Batch 1612/1736 of epoch 1 complete. Loss per last 992 samples:: 0.8457604685137349 
Training Accuracy per last 992 samples: 32.15725806451613


 96%|█████████▋| 1674/1736 [19:53<00:39,  1.59it/s]


Batch 1674/1736 of epoch 1 complete. Loss per last 992 samples:: 0.831741456062563 
Training Accuracy per last 992 samples: 30.342741935483872


100%|█████████▉| 1735/1736 [20:32<00:00,  1.60it/s]


Batch 1736/1736 of epoch 1 complete. Loss per last 987 samples:: 0.8162143739961809 
Training Accuracy per last 987 samples: 32.21884498480243
Epoch 1, batch 1736 complete! Training Loss : 0.875561800866907
Epoch 1, batch 1736 complete! Training Accuracy : 0.23456123294083756


100%|██████████| 174/174 [00:38<00:00,  4.56it/s]


Epoch 1, batch 1736 complete! Validation Loss : 1.5834934279836457
Epoch 1, batch 1736 complete! Validation Accuracy : 0.3624910007199424
Validation loss changed from 1.6848887828574783 to 1.5834934279836457
Best validation accuracy improved from 0.28725701943844495 to 0.3624910007199424



100%|██████████| 1736/1736 [21:11<00:00,  1.37it/s]


The model has been saved in models/Thu Nov 25 01:54:36 2021_lr_2e-05_val_acc_0.3625_ep_1.pt


  4%|▎         | 62/1736 [00:39<17:34,  1.59it/s]


Batch 62/1736 of epoch 2 complete. Loss per last 992 samples:: 0.8000272935436618 
Training Accuracy per last 992 samples: 34.274193548387096


  7%|▋         | 124/1736 [01:17<16:59,  1.58it/s]


Batch 124/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7992842274327432 
Training Accuracy per last 992 samples: 36.59274193548387


 11%|█         | 186/1736 [01:56<16:15,  1.59it/s]


Batch 186/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7878429658951298 
Training Accuracy per last 992 samples: 34.778225806451616


 14%|█▍        | 248/1736 [02:35<15:35,  1.59it/s]


Batch 248/1736 of epoch 2 complete. Loss per last 992 samples:: 0.8054039862848097 
Training Accuracy per last 992 samples: 34.57661290322581


 18%|█▊        | 310/1736 [03:14<14:56,  1.59it/s]


Batch 310/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7902487477948589 
Training Accuracy per last 992 samples: 34.57661290322581


 18%|█▊        | 321/1736 [03:21<14:45,  1.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 21%|██        | 368/1736 [03:51<14:21,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 21%|██▏       | 372/1736 [03:53<14:20,  1.59it/s]


Batch 372/1736 of epoch 2 complete. Loss per last 992 samples:: 0.8074448493219191 
Training Accuracy per last 992 samples: 35.28225806451613


 25%|██▌       | 434/1736 [04:32<13:39,  1.59it/s]


Batch 434/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7980702307916456 
Training Accuracy per last 992 samples: 36.189516129032256


 28%|██▊       | 478/1736 [05:00<13:12,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 29%|██▊       | 496/1736 [05:12<13:21,  1.55it/s]


Batch 496/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7881771825974987 
Training Accuracy per last 992 samples: 34.475806451612904


 32%|███▏      | 558/1736 [05:51<12:21,  1.59it/s]


Batch 558/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7779814812444872 
Training Accuracy per last 992 samples: 36.895161290322584


 36%|███▌      | 620/1736 [06:30<11:42,  1.59it/s]


Batch 620/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7796370290940807 
Training Accuracy per last 992 samples: 36.49193548387097


 36%|███▌      | 624/1736 [06:32<11:38,  1.59it/s]

Epoch 2, batch 625 complete! Training Loss : 0.7937084930419922
Epoch 2, batch 625 complete! Training Accuracy : 0.3538


100%|██████████| 174/174 [00:38<00:00,  4.57it/s]


Epoch 2, batch 625 complete! Validation Loss : 1.4725390816556996
Epoch 2, batch 625 complete! Validation Accuracy : 0.4272858171346292
Validation loss changed from 1.5834934279836457 to 1.4725390816556996
Best validation accuracy improved from 0.3624910007199424 to 0.4272858171346292



 36%|███▌      | 625/1736 [07:12<3:47:23, 12.28s/it]

The model has been saved in models/Thu Nov 25 02:01:49 2021_lr_2e-05_val_acc_0.4273_ep_2.pt


 39%|███▉      | 682/1736 [07:48<11:03,  1.59it/s]


Batch 682/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7766850379205519 
Training Accuracy per last 992 samples: 36.99596774193548


 43%|████▎     | 744/1736 [08:27<10:25,  1.59it/s]


Batch 744/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7861016796481225 
Training Accuracy per last 992 samples: 34.87903225806452


 46%|████▋     | 806/1736 [09:06<10:11,  1.52it/s]


Batch 806/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7747120395783456 
Training Accuracy per last 992 samples: 38.104838709677416


 50%|█████     | 868/1736 [09:45<09:07,  1.59it/s]


Batch 868/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7909242568477508 
Training Accuracy per last 992 samples: 34.67741935483871


 54%|█████▎    | 930/1736 [10:24<08:27,  1.59it/s]


Batch 930/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7803074313748267 
Training Accuracy per last 992 samples: 36.895161290322584


 57%|█████▋    | 992/1736 [11:03<07:48,  1.59it/s]


Batch 992/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7687013995262885 
Training Accuracy per last 992 samples: 38.70967741935484


 61%|██████    | 1054/1736 [11:42<07:09,  1.59it/s]


Batch 1054/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7762295199978736 
Training Accuracy per last 992 samples: 36.49193548387097


 64%|██████▍   | 1116/1736 [12:21<06:30,  1.59it/s]


Batch 1116/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7791134926580614 
Training Accuracy per last 992 samples: 37.903225806451616


 68%|██████▊   | 1178/1736 [13:00<05:51,  1.59it/s]


Batch 1178/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7670601875551285 
Training Accuracy per last 992 samples: 39.11290322580645


 71%|███████▏  | 1240/1736 [13:39<05:12,  1.59it/s]


Batch 1240/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7768972766014838 
Training Accuracy per last 992 samples: 37.903225806451616


 72%|███████▏  | 1249/1736 [13:45<05:36,  1.45it/s]

Epoch 2, batch 1250 complete! Training Loss : 0.7850242980957032
Epoch 2, batch 1250 complete! Training Accuracy : 0.3634


100%|██████████| 174/174 [00:39<00:00,  4.41it/s]


Epoch 2, batch 1250 complete! Validation Loss : 1.3773712159573346
Epoch 2, batch 1250 complete! Validation Accuracy : 0.46184305255579555
Validation loss changed from 1.4725390816556996 to 1.3773712159573346
Best validation accuracy improved from 0.4272858171346292 to 0.46184305255579555



 72%|███████▏  | 1250/1736 [14:26<1:43:25, 12.77s/it]

The model has been saved in models/Thu Nov 25 02:09:03 2021_lr_2e-05_val_acc_0.4618_ep_2.pt


 75%|███████▌  | 1302/1736 [15:01<04:33,  1.59it/s]


Batch 1302/1736 of epoch 2 complete. Loss per last 992 samples:: 0.748767206745763 
Training Accuracy per last 992 samples: 40.32258064516129


 79%|███████▊  | 1364/1736 [15:41<04:09,  1.49it/s]


Batch 1364/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7647725382158833 
Training Accuracy per last 992 samples: 36.189516129032256


 82%|████████▏ | 1426/1736 [16:22<03:26,  1.50it/s]


Batch 1426/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7705930586784117 
Training Accuracy per last 992 samples: 37.5


 86%|████████▌ | 1488/1736 [17:02<02:38,  1.57it/s]


Batch 1488/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7679192327683971 
Training Accuracy per last 992 samples: 37.096774193548384


 86%|████████▌ | 1495/1736 [17:06<02:33,  1.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 89%|████████▉ | 1550/1736 [17:41<01:58,  1.57it/s]


Batch 1550/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7555833631946195 
Training Accuracy per last 992 samples: 39.11290322580645


 93%|█████████▎| 1612/1736 [18:21<01:19,  1.55it/s]


Batch 1612/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7504806826191563 
Training Accuracy per last 992 samples: 39.818548387096776


 96%|█████████▋| 1674/1736 [19:01<00:39,  1.56it/s]


Batch 1674/1736 of epoch 2 complete. Loss per last 992 samples:: 0.7405143860847719 
Training Accuracy per last 992 samples: 39.516129032258064


100%|█████████▉| 1735/1736 [19:40<00:00,  1.58it/s]


Batch 1736/1736 of epoch 2 complete. Loss per last 987 samples:: 0.7672035963304581 
Training Accuracy per last 987 samples: 38.90577507598784
Epoch 2, batch 1736 complete! Training Loss : 0.777738727869526
Epoch 2, batch 1736 complete! Training Accuracy : 0.36937812826329625


100%|██████████| 174/174 [00:38<00:00,  4.55it/s]


Epoch 2, batch 1736 complete! Validation Loss : 1.3173391428487053
Epoch 2, batch 1736 complete! Validation Accuracy : 0.49388048956083513
Validation loss changed from 1.3773712159573346 to 1.3173391428487053
Best validation accuracy improved from 0.46184305255579555 to 0.49388048956083513



100%|██████████| 1736/1736 [20:20<00:00,  1.42it/s]


The model has been saved in models/Thu Nov 25 02:14:56 2021_lr_2e-05_val_acc_0.4939_ep_2.pt


  4%|▎         | 62/1736 [00:39<17:36,  1.58it/s]


Batch 62/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6879654699756254 
Training Accuracy per last 992 samples: 45.96774193548387


  6%|▋         | 110/1736 [01:09<17:07,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  7%|▋         | 124/1736 [01:18<17:00,  1.58it/s]


Batch 124/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6506985695131363 
Training Accuracy per last 992 samples: 49.899193548387096


 11%|█         | 186/1736 [01:57<16:23,  1.58it/s]


Batch 186/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6641638817325715 
Training Accuracy per last 992 samples: 49.193548387096776


 14%|█▍        | 248/1736 [02:36<15:39,  1.58it/s]


Batch 248/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6668615956460276 
Training Accuracy per last 992 samples: 48.185483870967744


 18%|█▊        | 310/1736 [03:15<15:06,  1.57it/s]


Batch 310/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6688145668275894 
Training Accuracy per last 992 samples: 48.891129032258064


 21%|██▏       | 372/1736 [03:55<14:22,  1.58it/s]


Batch 372/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6854231203756025 
Training Accuracy per last 992 samples: 47.17741935483871


 25%|██▍       | 428/1736 [04:31<13:52,  1.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▌       | 434/1736 [04:34<13:47,  1.57it/s]


Batch 434/1736 of epoch 3 complete. Loss per last 992 samples:: 0.682131951855075 
Training Accuracy per last 992 samples: 47.58064516129032


 29%|██▊       | 496/1736 [05:14<13:04,  1.58it/s]


Batch 496/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6876486809022965 
Training Accuracy per last 992 samples: 47.07661290322581


 32%|███▏      | 558/1736 [05:53<12:26,  1.58it/s]


Batch 558/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6989043297306183 
Training Accuracy per last 992 samples: 44.354838709677416


 36%|███▌      | 620/1736 [06:32<11:46,  1.58it/s]


Batch 620/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6689984875340615 
Training Accuracy per last 992 samples: 48.48790322580645


 36%|███▌      | 624/1736 [06:35<11:47,  1.57it/s]

Epoch 3, batch 625 complete! Training Loss : 0.675451163482666
Epoch 3, batch 625 complete! Training Accuracy : 0.4777


100%|██████████| 174/174 [00:38<00:00,  4.55it/s]


Epoch 3, batch 625 complete! Validation Loss : 1.1724395519015434
Epoch 3, batch 625 complete! Validation Accuracy : 0.5521958243340532
Validation loss changed from 1.3173391428487053 to 1.1724395519015434
Best validation accuracy improved from 0.49388048956083513 to 0.5521958243340532



 36%|███▌      | 625/1736 [07:14<3:48:22, 12.33s/it]

The model has been saved in models/Thu Nov 25 02:22:11 2021_lr_2e-05_val_acc_0.5522_ep_3.pt


 39%|███▊      | 670/1736 [07:43<11:36,  1.53it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 39%|███▉      | 682/1736 [07:51<11:20,  1.55it/s]


Batch 682/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6605496022009081 
Training Accuracy per last 992 samples: 47.278225806451616


 43%|████▎     | 744/1736 [08:31<10:27,  1.58it/s]


Batch 744/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6772006942379859 
Training Accuracy per last 992 samples: 48.185483870967744


 46%|████▋     | 806/1736 [09:10<09:46,  1.59it/s]


Batch 806/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6913964363836473 
Training Accuracy per last 992 samples: 45.060483870967744


 50%|█████     | 868/1736 [09:49<09:07,  1.59it/s]


Batch 868/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6821748825811571 
Training Accuracy per last 992 samples: 47.681451612903224


 54%|█████▎    | 930/1736 [10:28<08:27,  1.59it/s]


Batch 930/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6853434962611045 
Training Accuracy per last 992 samples: 45.96774193548387


 56%|█████▌    | 976/1736 [10:56<07:58,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 57%|█████▋    | 992/1736 [11:06<07:48,  1.59it/s]


Batch 992/1736 of epoch 3 complete. Loss per last 992 samples:: 0.675171475256643 
Training Accuracy per last 992 samples: 45.766129032258064


 61%|██████    | 1054/1736 [11:47<07:11,  1.58it/s]


Batch 1054/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6856210923963978 
Training Accuracy per last 992 samples: 45.66532258064516


 64%|██████▍   | 1116/1736 [12:27<06:31,  1.58it/s]


Batch 1116/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6488228228784376 
Training Accuracy per last 992 samples: 49.29435483870968


 68%|██████▊   | 1178/1736 [13:06<05:52,  1.58it/s]


Batch 1178/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6858382071218183 
Training Accuracy per last 992 samples: 45.96774193548387


 71%|███████▏  | 1240/1736 [13:45<05:13,  1.58it/s]


Batch 1240/1736 of epoch 3 complete. Loss per last 992 samples:: 0.7008220764898485 
Training Accuracy per last 992 samples: 45.26209677419355


 72%|███████▏  | 1249/1736 [13:51<05:06,  1.59it/s]

Epoch 3, batch 1250 complete! Training Loss : 0.6775103603363037
Epoch 3, batch 1250 complete! Training Accuracy : 0.4718


100%|██████████| 174/174 [00:38<00:00,  4.51it/s]


Epoch 3, batch 1250 complete! Validation Loss : 1.0959558904856102
Epoch 3, batch 1250 complete! Validation Accuracy : 0.6069114470842333
Validation loss changed from 1.1724395519015434 to 1.0959558904856102
Best validation accuracy improved from 0.5521958243340532 to 0.6069114470842333



 72%|███████▏  | 1250/1736 [14:31<1:40:46, 12.44s/it]

The model has been saved in models/Thu Nov 25 02:29:28 2021_lr_2e-05_val_acc_0.6069_ep_3.pt


 75%|███████▌  | 1302/1736 [15:04<04:34,  1.58it/s]


Batch 1302/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6675734519958496 
Training Accuracy per last 992 samples: 46.774193548387096


 79%|███████▊  | 1364/1736 [15:43<03:55,  1.58it/s]


Batch 1364/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6623791263949487 
Training Accuracy per last 992 samples: 49.69758064516129


 82%|████████▏ | 1426/1736 [16:23<03:16,  1.58it/s]


Batch 1426/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6969256939426545 
Training Accuracy per last 992 samples: 45.86693548387097


 86%|████████▌ | 1488/1736 [17:02<02:36,  1.58it/s]


Batch 1488/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6636506588228287 
Training Accuracy per last 992 samples: 47.58064516129032


 89%|████████▉ | 1550/1736 [17:41<01:57,  1.58it/s]


Batch 1550/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6677527273854902 
Training Accuracy per last 992 samples: 46.471774193548384


 93%|█████████▎| 1612/1736 [18:20<01:19,  1.56it/s]


Batch 1612/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6687367346978956 
Training Accuracy per last 992 samples: 45.96774193548387


 96%|█████████▋| 1674/1736 [19:00<00:39,  1.58it/s]


Batch 1674/1736 of epoch 3 complete. Loss per last 992 samples:: 0.6845279585930609 
Training Accuracy per last 992 samples: 46.471774193548384


100%|█████████▉| 1735/1736 [19:38<00:00,  1.59it/s]


Batch 1736/1736 of epoch 3 complete. Loss per last 987 samples:: 0.667423673214451 
Training Accuracy per last 987 samples: 48.02431610942249
Epoch 3, batch 1736 complete! Training Loss : 0.6761971951767047
Epoch 3, batch 1736 complete! Training Accuracy : 0.4713550106225919


100%|██████████| 174/174 [00:38<00:00,  4.54it/s]


Epoch 3, batch 1736 complete! Validation Loss : 0.9934600447786266
Epoch 3, batch 1736 complete! Validation Accuracy : 0.6587473002159827
Validation loss changed from 1.0959558904856102 to 0.9934600447786266
Best validation accuracy improved from 0.6069114470842333 to 0.6587473002159827



100%|██████████| 1736/1736 [20:18<00:00,  1.43it/s]

The model has been saved in models/Thu Nov 25 02:35:15 2021_lr_2e-05_val_acc_0.6587_ep_3.pt





In [12]:
torch.save(net.state_dict(), 'models/orig_final_model.pt')

In [16]:
#net.load_state_dict(torch.load('models/orig_final_model.pt'))

<All keys matched successfully>

In [14]:
test_set = FriendsDataset(dataframe=df_test, tokenizer=tokenizer, max_length=MAX_LEN)
test_loader = DataLoader(test_set, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=1)
def predict(net, device, dataloader):
    net.eval()
    predictions = []

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            max_logits, argmax_idx = torch.max(logits.data, dim=1)
            predictions.extend(argmax_idx.tolist())
    del logits
    return predictions
preds = predict(net, device, test_loader)

100%|██████████| 193/193 [00:46<00:00,  4.18it/s]


In [15]:
df_test

Unnamed: 0.1,Unnamed: 0,Id,other_speaker,friend_response
0,0,0,"But I'm unemployed, my music is all I really h...","All right, I'm gonna do it! I'm gonna get shot..."
1,1,1,Check this out. Five hundred and seventeen boxes!,"Oh my God, how did you do that?"
2,2,2,"Okay. Okay. Would, would it help if I went ove...",Yeah! Yeah! That would be very helpful! Yeah.
3,3,3,"Ross, what is taking you so long?","I'm sorry, it's almost as if this wasn't built..."
4,4,4,So who are you?,"Well, our names really are Monica and Chandler..."
...,...,...,...,...
3081,3081,3081,It's not just the drum noise. Every five minut...,"Yes, thank you. You see, this is how normal pe..."
3082,3082,3082,I think I accidentally used Monica’s boxes to ...,Oh no. Dad! Dad! What. Oh God everything’s rui...
3083,3083,3083,"so y'know, that’s why, within a few years, tha...","Oh, this is so great."
3084,3084,3084,He slept with you and then never called you.,And I just wanted a new daddy for Davy and Becky.


In [35]:
df_test

Unnamed: 0,Id,other_speaker,friend_response
0,0,"Но я безработный, моя музыка - это все, что у ...",Меня застрелят. Любой совет?
1,1,Посмотри. Пятьсот семнадцать коробок!,"Боже мой, как ты это сделал?"
2,2,"Хорошо. Хорошо. Помогло бы, если бы я подошел ...",Это было бы очень полезно!
3,3,"Росс, чего ты так долго?","Простите, это как будто не для быстрого отдыха!"
4,4,Так кто ты?,"Ну, наши имена действительно Моника и Чендлер...."
...,...,...,...
3081,3081,Дело не только в барабанах. Каждые пять минут ...,"Понимаете, именно так нормальные люди должны р..."
3082,3082,"Кажется, я случайно использовал коробки Моники...","Боже, все испорчено! Папа, она будет раздавлена!"
3083,3083,"ну знаете, вот почему через несколько лет расп...","Ой, это так здорово."
3084,3084,"Он переспал с тобой, а потом никогда тебе не з...",А я просто хотела нового папу для Дэви и Бекки.


In [16]:
answers = pd.DataFrame(
    names_to_cats.inverse_transform(preds), 
    index=df_test.Id, columns=["Category"])
answers.to_csv('submission1_orig.csv')
answers

Unnamed: 0_level_0,Category
Id,Unnamed: 1_level_1
0,ДЖОУИ
1,МОНИКА
2,РЕЙЧЕЛ
3,РОСС
4,МОНИКА
...,...
3081,ЧЕНДЛЕР
3082,РОСС
3083,РЕЙЧЕЛ
3084,ЧЕНДЛЕР


In [38]:
answers = pd.DataFrame(
    names_to_cats.inverse_transform(preds), 
    index=df_test.Id, columns=["Category"])
answers.to_csv('submission1_orig.csv')
answers

Unnamed: 0_level_0,Category
Id,Unnamed: 1_level_1
0,ФИБИ
1,МОНИКА
2,РЕЙЧЕЛ
3,РОСС
4,ЧЕНДЛЕР
...,...
3081,ФИБИ
3082,МОНИКА
3083,РЕЙЧЕЛ
3084,МОНИКА


In [None]:
from transformers import get_constant_schedule
opti = AdamW(net.parameters(), lr=LEARNING_RATE, weight_decay=1e-2)
lr_scheduler = get_constant_schedule(optimizer=opti)
train_set = FriendsDataset(dataframe=df_train.iloc[5000:], tokenizer=tokenizer, max_length=MAX_LEN)

val_set = FriendsDataset(dataframe=df_val, tokenizer=tokenizer, max_length=MAX_LEN)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_set, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=2)

train_bert(net, criterion, opti, LEARNING_RATE, lr_scheduler, train_loader, val_loader, EPOCHS, iters_to_accumulate)

In [22]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [22]:
torch.cuda.get_device_properties(0).total_memory / 1e6

8506.769408

In [12]:
# Training perfomance during usual train-eval on hold-out loop
train_bert(net, criterion, opti, LEARNING_RATE, lr_scheduler, train_loader, val_loader, EPOCHS, iters_to_accumulate)

  4%|▍         | 62/1563 [00:43<17:32,  1.43it/s]


Batch 62/1563 of epoch 1 complete. Loss per last 992 samples:: 0.913238279281124 
Training Accuracy per last 992 samples: 19.556451612903224


  8%|▊         | 124/1563 [01:25<16:41,  1.44it/s]


Batch 124/1563 of epoch 1 complete. Loss per last 992 samples:: 0.9004580590032762 
Training Accuracy per last 992 samples: 17.641129032258064


 12%|█▏        | 186/1563 [02:09<16:10,  1.42it/s]


Batch 186/1563 of epoch 1 complete. Loss per last 992 samples:: 0.899505615234375 
Training Accuracy per last 992 samples: 17.338709677419356


 16%|█▌        | 248/1563 [02:51<15:13,  1.44it/s]


Batch 248/1563 of epoch 1 complete. Loss per last 992 samples:: 0.9008006434286794 
Training Accuracy per last 992 samples: 18.75


 16%|█▋        | 256/1563 [02:57<14:23,  1.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 19%|█▉        | 301/1563 [03:25<13:10,  1.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 20%|█▉        | 310/1563 [03:31<13:11,  1.58it/s]


Batch 310/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8999879898563508 
Training Accuracy per last 992 samples: 16.93548387096774


 24%|██▍       | 372/1563 [04:10<12:32,  1.58it/s]


Batch 372/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8955442367061492 
Training Accuracy per last 992 samples: 19.35483870967742


 28%|██▊       | 434/1563 [04:49<11:53,  1.58it/s]


Batch 434/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8951455393145161 
Training Accuracy per last 992 samples: 18.548387096774192


 32%|███▏      | 496/1563 [05:28<11:12,  1.59it/s]


Batch 496/1563 of epoch 1 complete. Loss per last 992 samples:: 0.895750476467994 
Training Accuracy per last 992 samples: 19.455645161290324


 36%|███▌      | 558/1563 [06:07<10:34,  1.58it/s]


Batch 558/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8903459118258569 
Training Accuracy per last 992 samples: 20.866935483870968


 40%|███▉      | 620/1563 [06:46<09:55,  1.58it/s]


Batch 620/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8890636813256049 
Training Accuracy per last 992 samples: 21.370967741935484


 40%|███▉      | 624/1563 [06:48<09:54,  1.58it/s]

Epoch 1, batch 625 complete! Training Loss : 0.8979001708984375
Epoch 1, batch 625 complete! Training Accuracy : 0.1898


100%|██████████| 174/174 [00:38<00:00,  4.56it/s]


Epoch 1, batch 625 complete! Validation Loss : 1.7738734456314438
Epoch 1, batch 625 complete! Validation Accuracy : 0.21526277897768178
Validation loss changed from inf to 1.7738734456314438
Best validation accuracy improved from 0 to 0.21526277897768178



 40%|███▉      | 625/1563 [07:28<3:12:19, 12.30s/it]

The model has been saved in models/Wed Nov 24 23:29:33 2021_lr_2e-05_val_acc_0.2153_ep_1.pt


 44%|████▎     | 682/1563 [08:04<09:16,  1.58it/s]


Batch 682/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8921690910093246 
Training Accuracy per last 992 samples: 20.766129032258064


 48%|████▊     | 744/1563 [08:43<08:37,  1.58it/s]


Batch 744/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8860958468529486 
Training Accuracy per last 992 samples: 21.975806451612904


 52%|█████▏    | 806/1563 [09:22<07:58,  1.58it/s]


Batch 806/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8756868916173135 
Training Accuracy per last 992 samples: 23.68951612903226


 56%|█████▌    | 868/1563 [10:01<07:31,  1.54it/s]


Batch 868/1563 of epoch 1 complete. Loss per last 992 samples:: 0.873778066327495 
Training Accuracy per last 992 samples: 25.302419354838708


 60%|█████▉    | 930/1563 [10:42<06:41,  1.58it/s]


Batch 930/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8720855712890625 
Training Accuracy per last 992 samples: 25.100806451612904


 63%|██████▎   | 992/1563 [11:22<06:00,  1.58it/s]


Batch 992/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8710652012978831 
Training Accuracy per last 992 samples: 23.487903225806452


 67%|██████▋   | 1054/1563 [12:01<05:22,  1.58it/s]


Batch 1054/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8707061275359123 
Training Accuracy per last 992 samples: 24.495967741935484


 71%|███████▏  | 1116/1563 [12:40<04:42,  1.58it/s]


Batch 1116/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8523943501134073 
Training Accuracy per last 992 samples: 29.33467741935484


 75%|███████▌  | 1178/1563 [13:19<04:03,  1.58it/s]


Batch 1178/1563 of epoch 1 complete. Loss per last 992 samples:: 0.86222900882844 
Training Accuracy per last 992 samples: 26.108870967741936


 79%|███████▉  | 1240/1563 [13:58<03:24,  1.58it/s]


Batch 1240/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8570504957629789 
Training Accuracy per last 992 samples: 26.108870967741936


 80%|███████▉  | 1249/1563 [14:04<03:17,  1.59it/s]

Epoch 1, batch 1250 complete! Training Loss : 0.8843230895996094
Epoch 1, batch 1250 complete! Training Accuracy : 0.2186


100%|██████████| 174/174 [00:38<00:00,  4.56it/s]


Epoch 1, batch 1250 complete! Validation Loss : 1.6809274958468032
Epoch 1, batch 1250 complete! Validation Accuracy : 0.30417566594672424
Validation loss changed from 1.7738734456314438 to 1.6809274958468032
Best validation accuracy improved from 0.21526277897768178 to 0.30417566594672424



 80%|███████▉  | 1250/1563 [14:43<1:04:10, 12.30s/it]

The model has been saved in models/Wed Nov 24 23:36:48 2021_lr_2e-05_val_acc_0.3042_ep_1.pt


 83%|████████▎ | 1290/1563 [15:09<02:53,  1.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 83%|████████▎ | 1302/1563 [15:16<02:45,  1.58it/s]


Batch 1302/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8526414440524194 
Training Accuracy per last 992 samples: 28.225806451612904


 85%|████████▌ | 1333/1563 [15:36<02:24,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 87%|████████▋ | 1364/1563 [15:55<02:06,  1.58it/s]


Batch 1364/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8346786499023438 
Training Accuracy per last 992 samples: 29.838709677419356


 91%|█████████ | 1426/1563 [16:35<01:26,  1.58it/s]


Batch 1426/1563 of epoch 1 complete. Loss per last 992 samples:: 0.8506345441264491 
Training Accuracy per last 992 samples: 26.91532258064516


 95%|█████████▌| 1488/1563 [17:14<00:47,  1.58it/s]


Batch 1488/1563 of epoch 1 complete. Loss per last 992 samples:: 0.847123299875567 
Training Accuracy per last 992 samples: 26.20967741935484


 99%|█████████▉| 1550/1563 [17:53<00:08,  1.58it/s]


Batch 1550/1563 of epoch 1 complete. Loss per last 992 samples:: 0.813750482374622 
Training Accuracy per last 992 samples: 32.358870967741936


100%|█████████▉| 1562/1563 [18:00<00:00,  1.58it/s]

Epoch 1, batch 1563 complete! Training Loss : 0.8752698355276312
Epoch 1, batch 1563 complete! Training Accuracy : 0.2324250790221262


100%|██████████| 174/174 [00:38<00:00,  4.55it/s]


Epoch 1, batch 1563 complete! Validation Loss : 1.6280372005769577
Epoch 1, batch 1563 complete! Validation Accuracy : 0.3369330453563715
Validation loss changed from 1.6809274958468032 to 1.6280372005769577
Best validation accuracy improved from 0.30417566594672424 to 0.3369330453563715



100%|██████████| 1563/1563 [18:39<00:00,  1.40it/s]


The model has been saved in models/Wed Nov 24 23:40:44 2021_lr_2e-05_val_acc_0.3369_ep_1.pt


  4%|▍         | 62/1563 [00:39<15:49,  1.58it/s]


Batch 62/1563 of epoch 2 complete. Loss per last 992 samples:: 0.8078534526209677 
Training Accuracy per last 992 samples: 33.971774193548384


  8%|▊         | 124/1563 [01:18<15:10,  1.58it/s]


Batch 124/1563 of epoch 2 complete. Loss per last 992 samples:: 0.8096378695580267 
Training Accuracy per last 992 samples: 34.17338709677419


  9%|▉         | 138/1563 [01:27<14:59,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 12%|█▏        | 186/1563 [01:57<14:32,  1.58it/s]


Batch 186/1563 of epoch 2 complete. Loss per last 992 samples:: 0.8061800310688634 
Training Accuracy per last 992 samples: 34.778225806451616


 13%|█▎        | 201/1563 [02:07<14:18,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 248/1563 [02:36<13:53,  1.58it/s]


Batch 248/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7881897957094254 
Training Accuracy per last 992 samples: 35.181451612903224


 20%|█▉        | 310/1563 [03:15<13:11,  1.58it/s]


Batch 310/1563 of epoch 2 complete. Loss per last 992 samples:: 0.8017726713611234 
Training Accuracy per last 992 samples: 33.064516129032256


 24%|██▍       | 372/1563 [03:54<12:33,  1.58it/s]


Batch 372/1563 of epoch 2 complete. Loss per last 992 samples:: 0.8028790873865927 
Training Accuracy per last 992 samples: 33.770161290322584


 28%|██▊       | 434/1563 [04:34<11:51,  1.59it/s]


Batch 434/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7818398014191659 
Training Accuracy per last 992 samples: 35.98790322580645


 32%|███▏      | 496/1563 [05:13<11:20,  1.57it/s]


Batch 496/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7809099689606698 
Training Accuracy per last 992 samples: 35.483870967741936


 36%|███▌      | 558/1563 [05:52<10:34,  1.58it/s]


Batch 558/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7891100606610698 
Training Accuracy per last 992 samples: 34.67741935483871


 40%|███▉      | 620/1563 [06:31<09:57,  1.58it/s]


Batch 620/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7921706168882309 
Training Accuracy per last 992 samples: 37.903225806451616


 40%|███▉      | 624/1563 [06:34<09:59,  1.57it/s]

Epoch 2, batch 625 complete! Training Loss : 0.7959734130859375
Epoch 2, batch 625 complete! Training Accuracy : 0.3487


100%|██████████| 174/174 [00:38<00:00,  4.55it/s]


Epoch 2, batch 625 complete! Validation Loss : 1.57643024400733
Epoch 2, batch 625 complete! Validation Accuracy : 0.3545716342692585
Validation loss changed from 1.6280372005769577 to 1.57643024400733
Best validation accuracy improved from 0.3369330453563715 to 0.3545716342692585



 40%|███▉      | 625/1563 [07:14<3:12:36, 12.32s/it]

The model has been saved in models/Wed Nov 24 23:47:58 2021_lr_2e-05_val_acc_0.3546_ep_2.pt


 44%|████▎     | 682/1563 [07:50<09:18,  1.58it/s]


Batch 682/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7881764750326833 
Training Accuracy per last 992 samples: 35.78629032258065


 48%|████▊     | 744/1563 [08:29<08:36,  1.58it/s]


Batch 744/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7537336041850429 
Training Accuracy per last 992 samples: 40.725806451612904


 49%|████▉     | 765/1563 [08:42<08:22,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 52%|█████▏    | 806/1563 [09:08<08:00,  1.57it/s]


Batch 806/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7786685574439264 
Training Accuracy per last 992 samples: 35.38306451612903


 56%|█████▌    | 868/1563 [09:47<07:19,  1.58it/s]


Batch 868/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7805727066532258 
Training Accuracy per last 992 samples: 36.99596774193548


 60%|█████▉    | 930/1563 [10:26<06:40,  1.58it/s]


Batch 930/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7670226250925372 
Training Accuracy per last 992 samples: 39.516129032258064


 63%|██████▎   | 992/1563 [11:05<06:00,  1.58it/s]


Batch 992/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7665022265526557 
Training Accuracy per last 992 samples: 38.40725806451613


 67%|██████▋   | 1054/1563 [11:44<05:22,  1.58it/s]


Batch 1054/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7538205731299615 
Training Accuracy per last 992 samples: 39.314516129032256


 71%|███████▏  | 1116/1563 [12:24<04:42,  1.58it/s]


Batch 1116/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7677207454558341 
Training Accuracy per last 992 samples: 38.104838709677416


 75%|███████▌  | 1178/1563 [13:03<04:03,  1.58it/s]


Batch 1178/1563 of epoch 2 complete. Loss per last 992 samples:: 0.779358156265751 
Training Accuracy per last 992 samples: 36.49193548387097


 79%|███████▉  | 1240/1563 [13:42<03:24,  1.58it/s]


Batch 1240/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7556150344110304 
Training Accuracy per last 992 samples: 39.01209677419355


 80%|███████▉  | 1249/1563 [13:47<03:18,  1.59it/s]

Epoch 2, batch 1250 complete! Training Loss : 0.7824455329895019
Epoch 2, batch 1250 complete! Training Accuracy : 0.36435


100%|██████████| 174/174 [00:38<00:00,  4.55it/s]


Epoch 2, batch 1250 complete! Validation Loss : 1.5400352217685218
Epoch 2, batch 1250 complete! Validation Accuracy : 0.36285097192224625
Validation loss changed from 1.57643024400733 to 1.5400352217685218
Best validation accuracy improved from 0.3545716342692585 to 0.36285097192224625



 80%|███████▉  | 1250/1563 [14:27<1:04:18, 12.33s/it]

The model has been saved in models/Wed Nov 24 23:55:12 2021_lr_2e-05_val_acc_0.3629_ep_2.pt


 83%|████████▎ | 1302/1563 [15:00<02:45,  1.58it/s]


Batch 1302/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7614023454727665 
Training Accuracy per last 992 samples: 40.221774193548384


 87%|████████▋ | 1364/1563 [15:39<02:05,  1.58it/s]


Batch 1364/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7488531297253024 
Training Accuracy per last 992 samples: 40.42338709677419


 91%|█████████ | 1426/1563 [16:18<01:26,  1.58it/s]


Batch 1426/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7741169775685957 
Training Accuracy per last 992 samples: 37.19758064516129


 93%|█████████▎| 1460/1563 [16:40<01:04,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 95%|█████████▌| 1488/1563 [16:57<00:47,  1.58it/s]


Batch 1488/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7606095652426442 
Training Accuracy per last 992 samples: 38.306451612903224


 99%|█████████▉| 1550/1563 [17:37<00:08,  1.58it/s]


Batch 1550/1563 of epoch 2 complete. Loss per last 992 samples:: 0.7647923346488706 
Training Accuracy per last 992 samples: 37.80241935483871


100%|█████████▉| 1562/1563 [17:44<00:00,  1.58it/s]

Epoch 2, batch 1563 complete! Training Loss : 0.7780847128430621
Epoch 2, batch 1563 complete! Training Accuracy : 0.36938342735966073


100%|██████████| 174/174 [00:38<00:00,  4.55it/s]


Epoch 2, batch 1563 complete! Validation Loss : 1.5354859431584675
Epoch 2, batch 1563 complete! Validation Accuracy : 0.37329013678905687
Validation loss changed from 1.5400352217685218 to 1.5354859431584675
Best validation accuracy improved from 0.36285097192224625 to 0.37329013678905687



100%|██████████| 1563/1563 [18:23<00:00,  1.42it/s]


The model has been saved in models/Wed Nov 24 23:59:08 2021_lr_2e-05_val_acc_0.3733_ep_2.pt


  4%|▍         | 62/1563 [00:39<15:48,  1.58it/s]


Batch 62/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6825685039643319 
Training Accuracy per last 992 samples: 45.46370967741935


  8%|▊         | 124/1563 [01:18<15:09,  1.58it/s]


Batch 124/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6759223937988281 
Training Accuracy per last 992 samples: 46.37096774193548


 12%|█▏        | 186/1563 [01:57<14:29,  1.58it/s]


Batch 186/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6751911563258017 
Training Accuracy per last 992 samples: 48.79032258064516


 16%|█▌        | 248/1563 [02:36<13:51,  1.58it/s]


Batch 248/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6516752089223554 
Training Accuracy per last 992 samples: 51.310483870967744


 20%|█▉        | 310/1563 [03:15<13:11,  1.58it/s]


Batch 310/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6781766491551553 
Training Accuracy per last 992 samples: 47.58064516129032


 24%|██▍       | 372/1563 [03:54<12:33,  1.58it/s]


Batch 372/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6727600405293126 
Training Accuracy per last 992 samples: 47.681451612903224


 27%|██▋       | 427/1563 [04:29<11:55,  1.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 28%|██▊       | 434/1563 [04:33<11:56,  1.58it/s]


Batch 434/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6803347218421197 
Training Accuracy per last 992 samples: 47.479838709677416


 32%|███▏      | 496/1563 [05:13<11:16,  1.58it/s]


Batch 496/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6703915288371425 
Training Accuracy per last 992 samples: 48.79032258064516


 36%|███▌      | 558/1563 [05:52<10:35,  1.58it/s]


Batch 558/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6720982674629458 
Training Accuracy per last 992 samples: 46.67338709677419


 40%|███▉      | 620/1563 [06:31<09:55,  1.58it/s]


Batch 620/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6618139205440399 
Training Accuracy per last 992 samples: 48.58870967741935


 40%|███▉      | 624/1563 [06:33<09:55,  1.58it/s]

Epoch 3, batch 625 complete! Training Loss : 0.6721667770385742
Epoch 3, batch 625 complete! Training Accuracy : 0.4783


100%|██████████| 174/174 [00:38<00:00,  4.56it/s]


Epoch 3, batch 625 complete! Validation Loss : 1.5608694676695198
Epoch 3, batch 625 complete! Validation Accuracy : 0.3804895608351332
Validation loss changed from 1.5354859431584675 to 1.5608694676695198
Best validation accuracy improved from 0.37329013678905687 to 0.3804895608351332



 40%|███▉      | 625/1563 [07:13<3:12:44, 12.33s/it]

The model has been saved in models/Thu Nov 25 00:06:21 2021_lr_2e-05_val_acc_0.3805_ep_3.pt


 44%|████▎     | 682/1563 [07:49<09:16,  1.58it/s]


Batch 682/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6685508989518688 
Training Accuracy per last 992 samples: 48.185483870967744


 48%|████▊     | 744/1563 [08:28<08:38,  1.58it/s]


Batch 744/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6534895896911621 
Training Accuracy per last 992 samples: 47.983870967741936


 52%|█████▏    | 806/1563 [09:07<07:58,  1.58it/s]


Batch 806/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6842802416893744 
Training Accuracy per last 992 samples: 46.471774193548384


 56%|█████▌    | 868/1563 [09:47<07:20,  1.58it/s]


Batch 868/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6765977259605161 
Training Accuracy per last 992 samples: 45.46370967741935


 60%|█████▉    | 930/1563 [10:26<06:42,  1.57it/s]


Batch 930/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6835372217239872 
Training Accuracy per last 992 samples: 45.766129032258064


 60%|█████▉    | 931/1563 [10:26<06:41,  1.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 63%|██████▎   | 992/1563 [11:05<06:02,  1.57it/s]


Batch 992/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6742876498929916 
Training Accuracy per last 992 samples: 48.99193548387097


 67%|██████▋   | 1054/1563 [11:45<05:23,  1.57it/s]


Batch 1054/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6710429806863109 
Training Accuracy per last 992 samples: 46.37096774193548


 71%|███████▏  | 1116/1563 [12:24<04:47,  1.55it/s]


Batch 1116/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6879153866921702 
Training Accuracy per last 992 samples: 47.07661290322581


 75%|███████▌  | 1178/1563 [13:04<04:10,  1.54it/s]


Batch 1178/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6649544392862627 
Training Accuracy per last 992 samples: 50.403225806451616


 77%|███████▋  | 1203/1563 [13:20<03:48,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 79%|███████▉  | 1240/1563 [13:43<03:25,  1.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Batch 1240/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6748255837348199 
Training Accuracy per last 992 samples: 47.78225806451613


 80%|███████▉  | 1249/1563 [13:49<03:19,  1.58it/s]

Epoch 3, batch 1250 complete! Training Loss : 0.6733337162017822
Epoch 3, batch 1250 complete! Training Accuracy : 0.47655


100%|██████████| 174/174 [00:38<00:00,  4.52it/s]
 80%|███████▉  | 1250/1563 [14:28<1:03:37, 12.20s/it]

Epoch 3, batch 1250 complete! Validation Loss : 1.5664778817659137
Epoch 3, batch 1250 complete! Validation Accuracy : 0.3696904247660187


 83%|████████▎ | 1302/1563 [15:01<02:46,  1.57it/s]


Batch 1302/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6740102921762774 
Training Accuracy per last 992 samples: 47.479838709677416


 87%|████████▋ | 1364/1563 [15:41<02:06,  1.57it/s]


Batch 1364/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6539135440703361 
Training Accuracy per last 992 samples: 49.596774193548384


 91%|█████████ | 1426/1563 [16:20<01:26,  1.58it/s]


Batch 1426/1563 of epoch 3 complete. Loss per last 992 samples:: 0.6623226288826235 
Training Accuracy per last 992 samples: 49.193548387096776


 95%|█████████▌| 1488/1563 [16:59<00:47,  1.59it/s]


Batch 1488/1563 of epoch 3 complete. Loss per last 992 samples:: 0.682469775599818 
Training Accuracy per last 992 samples: 46.068548387096776


 99%|█████████▉| 1550/1563 [17:39<00:08,  1.58it/s]


Batch 1550/1563 of epoch 3 complete. Loss per last 992 samples:: 0.695514602045859 
Training Accuracy per last 992 samples: 43.44758064516129


100%|█████████▉| 1562/1563 [17:46<00:00,  1.58it/s]

Epoch 3, batch 1563 complete! Training Loss : 0.6729513828524091
Epoch 3, batch 1563 complete! Training Accuracy : 0.4755331492817989


100%|██████████| 174/174 [00:38<00:00,  4.54it/s]


Epoch 3, batch 1563 complete! Validation Loss : 1.5438008157686256
Epoch 3, batch 1563 complete! Validation Accuracy : 0.3930885529157667
Validation loss changed from 1.5608694676695198 to 1.5438008157686256
Best validation accuracy improved from 0.3804895608351332 to 0.3930885529157667



100%|██████████| 1563/1563 [18:25<00:00,  1.41it/s]


The model has been saved in models/Thu Nov 25 00:17:34 2021_lr_2e-05_val_acc_0.3931_ep_3.pt


  0%|          | 1/1563 [00:00<18:52,  1.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  3%|▎         | 45/1563 [00:28<16:24,  1.54it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  4%|▍         | 62/1563 [00:39<16:00,  1.56it/s]


Batch 62/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5344237204520933 
Training Accuracy per last 992 samples: 61.08870967741935


  8%|▊         | 124/1563 [01:18<15:16,  1.57it/s]


Batch 124/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5134004008385443 
Training Accuracy per last 992 samples: 60.98790322580645


 12%|█▏        | 186/1563 [01:58<14:38,  1.57it/s]


Batch 186/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5185778448658604 
Training Accuracy per last 992 samples: 62.29838709677419


 16%|█▌        | 248/1563 [02:37<13:57,  1.57it/s]


Batch 248/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5098814195202243 
Training Accuracy per last 992 samples: 61.59274193548387


 17%|█▋        | 263/1563 [02:47<13:43,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 20%|█▉        | 310/1563 [03:17<13:18,  1.57it/s]


Batch 310/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5058845935329315 
Training Accuracy per last 992 samples: 61.391129032258064


 24%|██▍       | 372/1563 [03:57<12:40,  1.57it/s]


Batch 372/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5206207229245093 
Training Accuracy per last 992 samples: 61.391129032258064


 28%|██▊       | 434/1563 [04:36<12:01,  1.56it/s]


Batch 434/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5093132757371471 
Training Accuracy per last 992 samples: 62.70161290322581


 32%|███▏      | 496/1563 [05:15<11:19,  1.57it/s]


Batch 496/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5088711323276642 
Training Accuracy per last 992 samples: 62.399193548387096


 36%|███▌      | 558/1563 [05:55<10:39,  1.57it/s]


Batch 558/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5183439639306837 
Training Accuracy per last 992 samples: 60.38306451612903


 40%|███▉      | 620/1563 [06:34<10:01,  1.57it/s]


Batch 620/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5141992415151289 
Training Accuracy per last 992 samples: 61.99596774193548


 40%|███▉      | 624/1563 [06:37<10:03,  1.56it/s]

Epoch 4, batch 625 complete! Training Loss : 0.5151514812469482
Epoch 4, batch 625 complete! Training Accuracy : 0.6167


100%|██████████| 174/174 [00:38<00:00,  4.48it/s]
 40%|███▉      | 625/1563 [07:16<3:12:00, 12.28s/it]

Epoch 4, batch 625 complete! Validation Loss : 1.7097248928300266
Epoch 4, batch 625 complete! Validation Accuracy : 0.37688984881209503


 44%|████▎     | 682/1563 [07:53<09:20,  1.57it/s]


Batch 682/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5164726357306203 
Training Accuracy per last 992 samples: 61.29032258064516


 45%|████▍     | 702/1563 [08:06<09:05,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 48%|████▊     | 744/1563 [08:32<08:38,  1.58it/s]


Batch 744/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5200393469102921 
Training Accuracy per last 992 samples: 62.19758064516129


 52%|█████▏    | 806/1563 [09:11<07:59,  1.58it/s]


Batch 806/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5438617044879545 
Training Accuracy per last 992 samples: 59.475806451612904


 56%|█████▌    | 868/1563 [09:51<07:22,  1.57it/s]


Batch 868/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5232942219703428 
Training Accuracy per last 992 samples: 60.58467741935484


 60%|█████▉    | 930/1563 [10:30<06:43,  1.57it/s]


Batch 930/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5032825316152265 
Training Accuracy per last 992 samples: 61.99596774193548


 63%|██████▎   | 992/1563 [11:09<06:03,  1.57it/s]


Batch 992/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5462102543923163 
Training Accuracy per last 992 samples: 58.770161290322584


 67%|██████▋   | 1054/1563 [11:49<05:24,  1.57it/s]


Batch 1054/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5545949243730114 
Training Accuracy per last 992 samples: 58.568548387096776


 71%|███████▏  | 1116/1563 [12:28<04:44,  1.57it/s]


Batch 1116/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5180700171378351 
Training Accuracy per last 992 samples: 61.59274193548387


 75%|███████▌  | 1178/1563 [13:08<04:07,  1.56it/s]


Batch 1178/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5270636812333138 
Training Accuracy per last 992 samples: 59.778225806451616


 79%|███████▉  | 1240/1563 [13:47<03:25,  1.57it/s]


Batch 1240/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5302376631767519 
Training Accuracy per last 992 samples: 59.07258064516129


 80%|███████▉  | 1249/1563 [13:53<03:18,  1.58it/s]

Epoch 4, batch 1250 complete! Training Loss : 0.5215600643157959
Epoch 4, batch 1250 complete! Training Accuracy : 0.61


100%|██████████| 174/174 [00:38<00:00,  4.52it/s]
 80%|███████▉  | 1250/1563 [14:32<1:03:36, 12.19s/it]

Epoch 4, batch 1250 complete! Validation Loss : 1.70805256257112
Epoch 4, batch 1250 complete! Validation Accuracy : 0.3804895608351332


 83%|████████▎ | 1302/1563 [15:05<02:46,  1.57it/s]


Batch 1302/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5342865951599614 
Training Accuracy per last 992 samples: 61.391129032258064


 87%|████████▋ | 1364/1563 [15:45<02:06,  1.57it/s]


Batch 1364/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5321467076578448 
Training Accuracy per last 992 samples: 59.67741935483871


 91%|█████████ | 1426/1563 [16:24<01:27,  1.57it/s]


Batch 1426/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5034539141962605 
Training Accuracy per last 992 samples: 61.79435483870968


 95%|█████████▌| 1488/1563 [17:03<00:47,  1.57it/s]


Batch 1488/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5417321055166183 
Training Accuracy per last 992 samples: 60.483870967741936


 99%|█████████▉| 1550/1563 [17:43<00:08,  1.56it/s]


Batch 1550/1563 of epoch 4 complete. Loss per last 992 samples:: 0.5453714843719236 
Training Accuracy per last 992 samples: 58.66935483870968


100%|█████████▉| 1562/1563 [17:50<00:00,  1.57it/s]

Epoch 4, batch 1563 complete! Training Loss : 0.5236004481160023
Epoch 4, batch 1563 complete! Training Accuracy : 0.60869043332133


100%|██████████| 174/174 [00:38<00:00,  4.52it/s]
100%|██████████| 1563/1563 [18:29<00:00,  1.41it/s]


Epoch 4, batch 1563 complete! Validation Loss : 1.7014721782728173
Epoch 4, batch 1563 complete! Validation Accuracy : 0.3804895608351332


  4%|▍         | 62/1563 [00:40<15:57,  1.57it/s]


Batch 62/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3376353036972784 
Training Accuracy per last 992 samples: 77.31854838709677


  8%|▊         | 124/1563 [01:19<15:14,  1.57it/s]


Batch 124/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3257069472343691 
Training Accuracy per last 992 samples: 77.31854838709677


 12%|█▏        | 186/1563 [01:58<14:33,  1.58it/s]


Batch 186/1563 of epoch 5 complete. Loss per last 992 samples:: 0.36724968302634453 
Training Accuracy per last 992 samples: 73.79032258064517


 15%|█▍        | 231/1563 [02:27<14:04,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 248/1563 [02:38<13:55,  1.57it/s]


Batch 248/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3524282017061787 
Training Accuracy per last 992 samples: 74.8991935483871


 20%|█▉        | 310/1563 [03:17<13:20,  1.57it/s]


Batch 310/1563 of epoch 5 complete. Loss per last 992 samples:: 0.33469848382857537 
Training Accuracy per last 992 samples: 78.125


 24%|██▍       | 372/1563 [03:56<12:32,  1.58it/s]


Batch 372/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3605522955617597 
Training Accuracy per last 992 samples: 74.79838709677419


 28%|██▊       | 434/1563 [04:36<11:55,  1.58it/s]


Batch 434/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3437589003193763 
Training Accuracy per last 992 samples: 75.80645161290323


 32%|███▏      | 496/1563 [05:15<11:15,  1.58it/s]


Batch 496/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3379732899127468 
Training Accuracy per last 992 samples: 74.49596774193549


 36%|███▌      | 558/1563 [05:54<10:40,  1.57it/s]


Batch 558/1563 of epoch 5 complete. Loss per last 992 samples:: 0.34491656672570015 
Training Accuracy per last 992 samples: 74.49596774193549


 40%|███▉      | 620/1563 [06:33<10:00,  1.57it/s]


Batch 620/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3490165404735073 
Training Accuracy per last 992 samples: 75.60483870967742


 40%|███▉      | 624/1563 [06:36<10:00,  1.56it/s]

Epoch 5, batch 625 complete! Training Loss : 0.34588309478759766
Epoch 5, batch 625 complete! Training Accuracy : 0.7561


100%|██████████| 174/174 [00:38<00:00,  4.52it/s]
 40%|███▉      | 625/1563 [07:15<3:10:26, 12.18s/it]

Epoch 5, batch 625 complete! Validation Loss : 1.9962554471246128
Epoch 5, batch 625 complete! Validation Accuracy : 0.37329013678905687


 44%|████▎     | 682/1563 [07:51<09:21,  1.57it/s]


Batch 682/1563 of epoch 5 complete. Loss per last 992 samples:: 0.36432146929925485 
Training Accuracy per last 992 samples: 73.58870967741936


 48%|████▊     | 744/1563 [08:31<08:41,  1.57it/s]


Batch 744/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3534245010345213 
Training Accuracy per last 992 samples: 73.99193548387096


 52%|█████▏    | 806/1563 [09:10<08:02,  1.57it/s]


Batch 806/1563 of epoch 5 complete. Loss per last 992 samples:: 0.36217169800112325 
Training Accuracy per last 992 samples: 73.58870967741936


 56%|█████▌    | 868/1563 [09:50<07:22,  1.57it/s]


Batch 868/1563 of epoch 5 complete. Loss per last 992 samples:: 0.33570841050917105 
Training Accuracy per last 992 samples: 74.6975806451613


 60%|█████▉    | 930/1563 [10:29<06:43,  1.57it/s]


Batch 930/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3696897077944971 
Training Accuracy per last 992 samples: 72.68145161290323


 63%|██████▎   | 992/1563 [11:08<06:04,  1.57it/s]


Batch 992/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3597254954999493 
Training Accuracy per last 992 samples: 74.59677419354838


 67%|██████▋   | 1054/1563 [11:48<05:23,  1.57it/s]


Batch 1054/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3668857514858246 
Training Accuracy per last 992 samples: 74.19354838709677


 71%|███████▏  | 1114/1563 [12:26<04:45,  1.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 71%|███████▏  | 1116/1563 [12:27<04:47,  1.55it/s]


Batch 1116/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3576075598116844 
Training Accuracy per last 992 samples: 73.99193548387096


 73%|███████▎  | 1141/1563 [12:43<04:27,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 75%|███████▌  | 1178/1563 [13:07<04:05,  1.57it/s]


Batch 1178/1563 of epoch 5 complete. Loss per last 992 samples:: 0.35787985113359266 
Training Accuracy per last 992 samples: 73.48790322580645


 79%|███████▉  | 1240/1563 [13:46<03:25,  1.57it/s]


Batch 1240/1563 of epoch 5 complete. Loss per last 992 samples:: 0.36496986256491754 
Training Accuracy per last 992 samples: 72.98387096774194


 80%|███████▉  | 1249/1563 [13:52<03:19,  1.57it/s]

Epoch 5, batch 1250 complete! Training Loss : 0.3523771286249161
Epoch 5, batch 1250 complete! Training Accuracy : 0.7472


100%|██████████| 174/174 [00:38<00:00,  4.53it/s]
 80%|███████▉  | 1250/1563 [14:31<1:03:30, 12.17s/it]

Epoch 5, batch 1250 complete! Validation Loss : 2.040891580883114
Epoch 5, batch 1250 complete! Validation Accuracy : 0.3675305975521958


 83%|████████▎ | 1302/1563 [15:04<02:46,  1.57it/s]


Batch 1302/1563 of epoch 5 complete. Loss per last 992 samples:: 0.35971549730147084 
Training Accuracy per last 992 samples: 72.68145161290323


 87%|████████▋ | 1364/1563 [15:43<02:07,  1.57it/s]


Batch 1364/1563 of epoch 5 complete. Loss per last 992 samples:: 0.34919326439980536 
Training Accuracy per last 992 samples: 74.19354838709677


 91%|█████████ | 1426/1563 [16:23<01:27,  1.56it/s]


Batch 1426/1563 of epoch 5 complete. Loss per last 992 samples:: 0.3601541898904308 
Training Accuracy per last 992 samples: 73.58870967741936


 95%|█████████▍| 1481/1563 [16:58<00:51,  1.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 95%|█████████▌| 1488/1563 [17:02<00:47,  1.57it/s]


Batch 1488/1563 of epoch 5 complete. Loss per last 992 samples:: 0.33938891753073663 
Training Accuracy per last 992 samples: 74.6975806451613


 99%|█████████▉| 1550/1563 [17:42<00:08,  1.56it/s]


Batch 1550/1563 of epoch 5 complete. Loss per last 992 samples:: 0.35248026636339 
Training Accuracy per last 992 samples: 73.99193548387096


100%|█████████▉| 1562/1563 [17:49<00:00,  1.57it/s]

Epoch 5, batch 1563 complete! Training Loss : 0.35191390614286877
Epoch 5, batch 1563 complete! Training Accuracy : 0.7455287480494538


100%|██████████| 174/174 [00:38<00:00,  4.52it/s]
100%|██████████| 1563/1563 [18:28<00:00,  1.41it/s]

Epoch 5, batch 1563 complete! Validation Loss : 2.06698082712875
Epoch 5, batch 1563 complete! Validation Accuracy : 0.3675305975521958





In [None]:
# Check that we are using 100% of GPU memory footprint support libraries/code
# from https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip -q install gputil
!pip -q install psutil
!pip -q install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()
 #!kill -9 -1

Gen RAM Free: 11.1 GB  | Proc size: 5.1 GB
GPU RAM Free: 10137MB | Used: 1304MB | Util  11% | Total 11441MB


In [None]:
printm()


Gen RAM Free: 11.1 GB  | Proc size: 5.1 GB
GPU RAM Free: 10137MB | Used: 1304MB | Util  11% | Total 11441MB
