# Description of the final project


This project is devoted to the question-answering task. You are going to work with the **DaNetQA** dataset.

DaNetQA is a question answering dataset for yes/no.

In [1]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl

In [35]:
! pip install mlflow --quiet
! nohup mlflow ui --port 5000 &

nohup: appending output to 'nohup.out'


In [3]:
import os
import sys
import warnings
import pprint

import json
import re
import string

from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
from pymorphy2 import MorphAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             ConfusionMatrixDisplay, precision_score,
                             recall_score)
import torch
from torch.utils.data import (DataLoader, RandomSampler, random_split,
                              SequentialSampler, TensorDataset)
import tqdm
from transformers import (AdamW, BertForSequenceClassification,
                          BertModel, BertTokenizer, get_linear_schedule_with_warmup)

import nltk
nltk.download('punkt')

import mlflow
import mlflow.sklearn
import mlflow.pytorch

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.8.2 optuna-3.5.0


In [5]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [36]:
MLFLOW_SERVER_URL = 'http://127.0.0.1:5000/'

warnings.filterwarnings("ignore")
np.random.seed(40)


# Data load and preprocess

In [7]:
! wget --no-check-certificate https://russiansuperglue.com/tasks/download/DaNetQA

! mv DaNetQA DaNetQA.zip

! unzip DaNetQA.zip

--2024-03-09 18:28:03--  https://russiansuperglue.com/tasks/download/DaNetQA
Resolving russiansuperglue.com (russiansuperglue.com)... 149.100.138.62
Connecting to russiansuperglue.com (russiansuperglue.com)|149.100.138.62|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 1293761 (1.2M) [application/zip]
Saving to: ‘DaNetQA’


2024-03-09 18:28:03 (11.1 MB/s) - ‘DaNetQA’ saved [1293761/1293761]

Archive:  DaNetQA.zip
   creating: DaNetQA/
  inflating: DaNetQA/train.jsonl     
   creating: __MACOSX/
   creating: __MACOSX/DaNetQA/
  inflating: __MACOSX/DaNetQA/._train.jsonl  
  inflating: DaNetQA/.DS_Store       
  inflating: __MACOSX/DaNetQA/._.DS_Store  
  inflating: DaNetQA/test.jsonl      
  inflating: __MACOSX/DaNetQA/._test.jsonl  
  inflating: DaNetQA/val.jsonl       
  inflating: __MACOSX/DaNetQA/._val.jsonl  
  inflating: __MACOSX/._DaNetQA      


In [8]:
def load_data(path):
    rows = []
    with open(path) as f:
        for line in f:
            rows.append(json.loads(line))

    df = pd.DataFrame(rows, index=[row.pop("idx") for row in rows])

    return df

In [9]:
train = load_data("DaNetQA/train.jsonl")
val = load_data("DaNetQA/val.jsonl")

In [10]:
train.describe()

Unnamed: 0,question,passage,label
count,1749,1749,1749
unique,1399,1552,2
top,Были ли в австралии аборигены?,"НА́ТО, Организа́ция Североатланти́ческого дого...",True
freq,6,10,1061


In [11]:
val.describe()

Unnamed: 0,question,passage,label
count,821,821,821
unique,572,781,2
top,Есть ли такая рыба лосось?,"«Лу́нный за́говор» — ряд теорий заговора, цент...",True
freq,5,3,412


In [12]:
len((train["question"] + train["passage"]).unique())

1749

In [13]:
len((val["question"] + val["passage"]).unique())

819

In [14]:
question_passage = val["question"] + val["passage"] + val["label"].astype(str)

question_passage[question_passage.duplicated(keep=False)]

14     Едят ли яйца веганы?Вега́нство  — наиболее стр...
185    Едят ли яйца веганы?Вега́нство  — наиболее стр...
446    Была ли в россии чума?Эпидемия чумы 1654—1655 ...
552    Была ли в россии чума?Эпидемия чумы 1654—1655 ...
dtype: object

In [15]:
val = val.drop_duplicates(ignore_index = True)

In [16]:
train["label"].value_counts()

True     1061
False     688
Name: label, dtype: int64

In [17]:
val["label"].value_counts()

True     411
False    408
Name: label, dtype: int64

In [18]:
train = train.groupby("label").sample(n=688)

In [19]:
train["text"] = train["question"] + " " + train["passage"]

val["text"] = val["question"] + " " + val["passage"]


In [20]:
train_y = train["label"]
val_y = val["label"]

In [21]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [24]:
bert_tokens = bert_tokenizer(train["text"].to_list(), padding=True,truncation=True, return_tensors="pt")
bert_tokens_test = bert_tokenizer(val["text"].to_list(), padding=True,truncation=True, return_tensors="pt")

In [26]:
bert_dataset = TensorDataset(
    bert_tokens["input_ids"],
    bert_tokens["attention_mask"],
    torch.tensor(train_y.to_numpy(), dtype=torch.long)
)
bert_dataset_test = TensorDataset(
    bert_tokens_test["input_ids"],
    bert_tokens_test["attention_mask"],
    torch.tensor(val_y.to_numpy(), dtype=torch.long)
)

In [27]:
bert_dataset_train, bert_dataset_eval = random_split(bert_dataset, [0.9, 0.1])


Since there are no labels for test data in the data, we will consider the validation data to be a test data, and the training data will be divided into validation and training data.

In [29]:
BATCH_SIZE = 8
bert_dataloader_train = DataLoader(bert_dataset_train,
                                   sampler=RandomSampler(bert_dataset_train),
                                   batch_size=BATCH_SIZE)
bert_dataloader_eval = DataLoader(bert_dataset_eval,
                                   sampler=SequentialSampler(bert_dataset_eval),
                                   batch_size=BATCH_SIZE)
bert_dataloader_test = DataLoader(bert_dataset_test,
                                   sampler=SequentialSampler(bert_dataset_test),
                                   batch_size=BATCH_SIZE)

# Training, logging in Mlflow, selection of hyperparameters

As you can see, metrics for test values ​​are also calculated in train, but this is not used in any way in the selection of hyperparameters and training. I just need it to quickly view test metrics later

In [30]:
def train_eval(train_dataset, eval_dataset, lr=3e-5, eps=1e-8, epoch=3):
  DEVICE = torch.device("cuda")
  bert_classifier = BertForSequenceClassification.from_pretrained(
                          "bert-base-multilingual-cased",
                          num_labels=2,
                          output_attentions=False,
                          output_hidden_states=False
                        )
  bert_classifier.to(DEVICE)

  EPOCHS = epoch

  optimizer = torch.optim.AdamW(bert_classifier.parameters(), lr=lr, eps=eps)

  total_steps = len(bert_dataloader_train) * EPOCHS
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)

  torch.cuda.empty_cache()

  train_stats = []

  for epoch in range(EPOCHS):
      print(f"Epoch {epoch}")

      # Training
      print("Training")
      bert_classifier.train()

      train_losses = []
      for batch in tqdm.tqdm(bert_dataloader_train):
          input_ids = batch[0].to(DEVICE)
          attention_mask = batch[1].to(DEVICE)
          labels = batch[2].to(DEVICE)

          bert_classifier.zero_grad()

          output = bert_classifier(input_ids, attention_mask, labels=labels)
          loss = output.loss

          loss.backward()
          torch.nn.utils.clip_grad_norm_(bert_classifier.parameters(), 1.0)
          optimizer.step()
          scheduler.step()

          train_losses.append(loss.item())

      train_loss = np.array(train_losses).mean()
      print(f"Mean loss = {train_loss}")

      # Evaluation
      print("Evaluation")
      bert_classifier.eval()

      eval_losses = []
      eval_accuracies = []

      for batch in tqdm.tqdm(bert_dataloader_eval):
          input_ids = batch[0].to(DEVICE)
          attention_mask = batch[1].to(DEVICE)
          labels = batch[2].to(DEVICE)

          with torch.no_grad():
              output = bert_classifier(input_ids, attention_mask, labels=labels)

          eval_losses.append(loss.item())

          predictions = np.argmax(output.logits.detach().cpu().numpy(), axis=1).flatten()
          accuracy = accuracy_score(y_true=labels.detach().cpu().numpy(), y_pred=predictions)
          eval_accuracies.append(accuracy)

      eval_loss = np.array(eval_losses).mean()
      eval_accuracy = np.array(eval_accuracies).mean()

      print(f"Mean loss = {eval_loss}, mean accuracy = {eval_accuracy}")

      test_losses = []
      test_accuracies = []

      for batch in tqdm.tqdm(bert_dataloader_test):
          input_ids = batch[0].to(DEVICE)
          attention_mask = batch[1].to(DEVICE)
          labels = batch[2].to(DEVICE)

          with torch.no_grad():
              output = bert_classifier(input_ids, attention_mask, labels=labels)

          test_losses.append(loss.item())

          predictions = np.argmax(output.logits.detach().cpu().numpy(), axis=1).flatten()
          accuracy = accuracy_score(y_true=labels.detach().cpu().numpy(), y_pred=predictions)
          test_accuracies.append(accuracy)

      test_loss = np.array(test_losses).mean()
      test_accuracy = np.array(test_accuracies).mean()

      train_stats.append({
          "epoch": epoch,
          "train loss": train_loss,
          "eval loss": eval_loss,
          "eval accuracy": eval_accuracy,
      })
  return eval_accuracy, test_accuracy, bert_classifier

In [31]:
def objective(trial):
  with mlflow.start_run():
    epoch = trial.suggest_int("epoch", 2, 5)
    eps = trial.suggest_float('eps', 1e-8, 1e-4, log=True)
    lr = trial.suggest_float('lr', 1e-6, 1e-4, log=True)
    eval_accuracy, test_accuracy, bert_classifier = train_eval(bert_dataloader_train, bert_dataloader_eval, lr=lr, eps=eps, epoch=epoch)
    mlflow.log_param("epoch", epoch)
    mlflow.log_param("eps", eps)
    mlflow.log_param("lr", lr)
    mlflow.log_metric("eval accuracy", eval_accuracy)
    mlflow.log_metric("test accuracy", test_accuracy)
    components = {
        "model": bert_classifier,
        "tokenizer": bert_tokenizer,
        }
    mlflow.transformers.log_model(
        transformers_model=components,
        artifact_path='model',
        pip_requirements = ['torch==2.1.0', 'torchvision==0.16.0', 'jaxlib==0.4.23'],
        task="text-classification",
    )
  return eval_accuracy

In [37]:
mlflow.set_tracking_uri(MLFLOW_SERVER_URL)

experiment_name = 'experiment'
mlflow.set_experiment(experiment_name)

2024/03/09 18:35:01 INFO mlflow.tracking.fluent: Experiment with name 'experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/872536664353035649', creation_time=1710009301648, experiment_id='872536664353035649', last_update_time=1710009301648, lifecycle_stage='active', name='experiment', tags={}>

In [38]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5 ,show_progress_bar=True)

  0%|          | 0/5 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:02<05:34,  2.17s/it][A
  1%|▏         | 2/155 [00:02<03:17,  1.29s/it][A
  2%|▏         | 3/155 [00:03<02:35,  1.02s/it][A
  3%|▎         | 4/155 [00:04<02:14,  1.12it/s][A
  3%|▎         | 5/155 [00:04<02:02,  1.22it/s][A
  4%|▍         | 6/155 [00:05<01:55,  1.29it/s][A
  5%|▍         | 7/155 [00:06<01:50,  1.34it/s][A
  5%|▌         | 8/155 [00:07<01:47,  1.37it/s][A
  6%|▌         | 9/155 [00:07<01:45,  1.38it/s][A
  6%|▋         | 10/155 [00:08<01:43,  1.40it/s][A
  7%|▋         | 11/155 [00:09<01:41,  1.42it/s][A
  8%|▊         | 12/155 [00:09<01:40,  1.42it/s][A
  8%|▊         | 13/155 [00:10<01:39,  1.42it/s][A
  9%|▉         | 14/155 [00:11<01:38,  1.43it/s][A
 10%|▉         | 15/155 [00:11<01:37,  1.43it/s][A
 10%|█         | 16/155 [00:12<01:36,  1.44it/s][A
 11%|█         | 17/155 [00:13<01:35,  1.44it/s][A
 12%|█▏        | 18/155 [00:13<01:35,  1.43it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6491556638671506
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.12it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.18it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  4.15it/s][A
 22%|██▏       | 4/18 [00:00<00:03,  4.07it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  4.12it/s][A
 33%|███▎      | 6/18 [00:01<00:02,  4.15it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  4.14it/s][A
 44%|████▍     | 8/18 [00:01<00:02,  4.10it/s][A
 50%|█████     | 9/18 [00:02<00:02,  4.12it/s][A
 56%|█████▌    | 10/18 [00:02<00:01,  4.13it/s][A
 61%|██████    | 11/18 [00:02<00:01,  4.13it/s][A
 67%|██████▋   | 12/18 [00:02<00:01,  4.10it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  4.11it/s][A
 78%|███████▊  | 14/18 [00:03<00:00,  4.13it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  4.14it/s][A
 89%|████████▉ | 16/18 [00:03<00:00,  4.10it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.31it/s]


Mean loss = 0.5390511751174927, mean accuracy = 0.6805555555555556



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:18,  5.57it/s][A
  2%|▏         | 2/103 [00:00<00:18,  5.38it/s][A
  3%|▎         | 3/103 [00:00<00:18,  5.35it/s][A
  4%|▍         | 4/103 [00:00<00:18,  5.23it/s][A
  5%|▍         | 5/103 [00:00<00:19,  5.11it/s][A
  6%|▌         | 6/103 [00:01<00:18,  5.16it/s][A
  7%|▋         | 7/103 [00:01<00:18,  5.16it/s][A
  8%|▊         | 8/103 [00:01<00:18,  5.18it/s][A
  9%|▊         | 9/103 [00:01<00:18,  5.15it/s][A
 10%|▉         | 10/103 [00:01<00:18,  5.02it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.94it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.89it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.82it/s][A
 14%|█▎        | 14/103 [00:02<00:17,  4.96it/s][A
 15%|█▍        | 15/103 [00:02<00:17,  4.96it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  5.00it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.96it/s][A
 17%|█▋        | 18/103 [00:03<00:16,  5.03it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 1
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:51,  1.38it/s][A
  1%|▏         | 2/155 [00:01<01:51,  1.37it/s][A
  2%|▏         | 3/155 [00:02<01:52,  1.35it/s][A
  3%|▎         | 4/155 [00:02<01:52,  1.35it/s][A
  3%|▎         | 5/155 [00:03<01:51,  1.35it/s][A
  4%|▍         | 6/155 [00:04<01:50,  1.35it/s][A
  5%|▍         | 7/155 [00:05<01:50,  1.35it/s][A
  5%|▌         | 8/155 [00:05<01:49,  1.35it/s][A
  6%|▌         | 9/155 [00:06<01:48,  1.34it/s][A
  6%|▋         | 10/155 [00:07<01:47,  1.34it/s][A
  7%|▋         | 11/155 [00:08<01:47,  1.34it/s][A
  8%|▊         | 12/155 [00:08<01:46,  1.34it/s][A
  8%|▊         | 13/155 [00:09<01:45,  1.34it/s][A
  9%|▉         | 14/155 [00:10<01:44,  1.35it/s][A
 10%|▉         | 15/155 [00:11<01:44,  1.34it/s][A
 10%|█         | 16/155 [00:11<01:43,  1.35it/s][A
 11%|█         | 17/155 [00:12<01:42,  1.34it/s][A
 12%|█▏        | 18/155 [00:13<01:42,  1.34it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5909934734144519
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.12it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.05it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  4.03it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.94it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.99it/s][A
 33%|███▎      | 6/18 [00:01<00:02,  4.03it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  4.04it/s][A
 44%|████▍     | 8/18 [00:01<00:02,  4.00it/s][A
 50%|█████     | 9/18 [00:02<00:02,  4.01it/s][A
 56%|█████▌    | 10/18 [00:02<00:01,  4.03it/s][A
 61%|██████    | 11/18 [00:02<00:01,  4.05it/s][A
 67%|██████▋   | 12/18 [00:02<00:01,  4.02it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  4.00it/s][A
 78%|███████▊  | 14/18 [00:03<00:00,  4.04it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  4.03it/s][A
 89%|████████▉ | 16/18 [00:03<00:00,  3.99it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.21it/s]


Mean loss = 0.6009760499000549, mean accuracy = 0.6944444444444444



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.17it/s][A
  2%|▏         | 2/103 [00:00<00:19,  5.15it/s][A
  3%|▎         | 3/103 [00:00<00:19,  5.07it/s][A
  4%|▍         | 4/103 [00:00<00:19,  5.02it/s][A
  5%|▍         | 5/103 [00:00<00:19,  4.96it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.98it/s][A
  7%|▋         | 7/103 [00:01<00:19,  5.01it/s][A
  8%|▊         | 8/103 [00:01<00:18,  5.01it/s][A
  9%|▊         | 9/103 [00:01<00:18,  4.97it/s][A
 10%|▉         | 10/103 [00:01<00:18,  4.97it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.98it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  5.01it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  5.00it/s][A
 14%|█▎        | 14/103 [00:02<00:17,  4.97it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.95it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.95it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.98it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.98it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 2
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:53,  1.36it/s][A
  1%|▏         | 2/155 [00:01<01:55,  1.32it/s][A
  2%|▏         | 3/155 [00:02<01:55,  1.32it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.31it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.31it/s][A
  5%|▌         | 8/155 [00:06<01:52,  1.31it/s][A
  6%|▌         | 9/155 [00:06<01:51,  1.31it/s][A
  6%|▋         | 10/155 [00:07<01:50,  1.31it/s][A
  7%|▋         | 11/155 [00:08<01:49,  1.31it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.31it/s][A
  8%|▊         | 13/155 [00:09<01:48,  1.31it/s][A
  9%|▉         | 14/155 [00:10<01:47,  1.31it/s][A
 10%|▉         | 15/155 [00:11<01:46,  1.31it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.31it/s][A
 11%|█         | 17/155 [00:12<01:45,  1.31it/s][A
 12%|█▏        | 18/155 [00:13<01:44,  1.31it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5254269302852692
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.98it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.02it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.99it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.93it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.93it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.94it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.93it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.92it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.92it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.93it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.93it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.93it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.93it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.95it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.93it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.93it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.13it/s]


Mean loss = 0.4293779730796814, mean accuracy = 0.7013888888888888



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.27it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.05it/s][A
  3%|▎         | 3/103 [00:00<00:19,  5.02it/s][A
  4%|▍         | 4/103 [00:00<00:19,  5.01it/s][A
  5%|▍         | 5/103 [00:01<00:19,  4.95it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.97it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.94it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.95it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.95it/s][A
 10%|▉         | 10/103 [00:02<00:18,  4.93it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.94it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.94it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.94it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.92it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.91it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.89it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.89it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.89it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 3
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:53,  1.36it/s][A
  1%|▏         | 2/155 [00:01<01:55,  1.33it/s][A
  2%|▏         | 3/155 [00:02<01:55,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:54,  1.31it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:54,  1.29it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.29it/s][A
  6%|▋         | 10/155 [00:07<01:52,  1.29it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:50,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.4846644564021018
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.95it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.91it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.92it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.88it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.94it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.93it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.93it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.91it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.93it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.93it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.92it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.91it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.93it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.90it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.89it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.11it/s]


Mean loss = 0.18778732419013977, mean accuracy = 0.7083333333333334



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.17it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.02it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.99it/s][A
  4%|▍         | 4/103 [00:00<00:19,  4.99it/s][A
  5%|▍         | 5/103 [00:01<00:19,  4.92it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.94it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.94it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.95it/s][A
  9%|▊         | 9/103 [00:01<00:18,  4.95it/s][A
 10%|▉         | 10/103 [00:02<00:18,  4.94it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.94it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.93it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.92it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.89it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.88it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.89it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.90it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.92it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 4
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:57,  1.32it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:52,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:47,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.29it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.442835055531994
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.00it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.90it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.95it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.78it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.79it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.85it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.86it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.81it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.83it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.86it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.84it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.81it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.83it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.84it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.85it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.83it/s][A
100%|██████████| 18/18 [00:04<00:00,  3.98it/s]


Mean loss = 0.5861777067184448, mean accuracy = 0.7152777777777778



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:21,  4.83it/s][A
  2%|▏         | 2/103 [00:00<00:21,  4.73it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.80it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.88it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.76it/s][A
  6%|▌         | 6/103 [00:01<00:20,  4.77it/s][A
  7%|▋         | 7/103 [00:01<00:20,  4.77it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.81it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.77it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.76it/s][A
 11%|█         | 11/103 [00:02<00:19,  4.75it/s][A
 12%|█▏        | 12/103 [00:02<00:19,  4.77it/s][A
 13%|█▎        | 13/103 [00:02<00:19,  4.67it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.73it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.73it/s][A
 16%|█▌        | 16/103 [00:03<00:18,  4.77it/s][A
 17%|█▋        | 17/103 [00:03<00:18,  4.74it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.77it/s][A
 18%|█▊        | 19/103 [00:0

README.md:   0%|          | 0.00/7.10k [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:53,  1.35it/s][A
  1%|▏         | 2/155 [00:01<01:54,  1.34it/s][A
  2%|▏         | 3/155 [00:02<01:54,  1.33it/s][A
  3%|▎         | 4/155 [00:03<01:53,  1.33it/s][A
  3%|▎         | 5/155 [00:03<01:52,  1.33it/s][A
  4%|▍         | 6/155 [00:04<01:51,  1.33it/s][A
  5%|▍         | 7/155 [00:05<01:51,  1.33it/s][A
  5%|▌         | 8/155 [00:06<01:50,  1.32it/s][A
  6%|▌         | 9/155 [00:06<01:49,  1.33it/s][A
  6%|▋         | 10/155 [00:07<01:49,  1.32it/s][A
  7%|▋         | 11/155 [00:08<01:48,  1.33it/s][A
  8%|▊         | 12/155 [00:09<01:48,  1.32it/s][A
  8%|▊         | 13/155 [00:09<01:47,  1.32it/s][A
  9%|▉         | 14/155 [00:10<01:46,  1.32it/s][A
 10%|▉         | 15/155 [00:11<01:46,  1.32it/s][A
 10%|█         | 16/155 [00:12<01:44,  1.33it/s][A
 11%|█         | 17/155 [00:12<01:44,  1.32it/s][A
 12%|█▏        | 18/155 [00:13<01:43,  1.32it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6589079385803592
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.83it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.90it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.99it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.81it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.86it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.88it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.89it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.85it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.87it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.88it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.93it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.91it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.87it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.88it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.92it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.92it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.06it/s]


Mean loss = 0.7203549146652222, mean accuracy = 0.7222222222222222



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.20it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.05it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.90it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.93it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.89it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.93it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.94it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.86it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.85it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.84it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.90it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.90it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.87it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.83it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.87it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.91it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.94it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.94it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 1
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:54,  1.34it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.30it/s][A
  3%|▎         | 4/155 [00:03<01:56,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:54,  1.31it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:52,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.31it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:48,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6010093950456189
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.97it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.95it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.95it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.89it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.91it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.90it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.93it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.91it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.92it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.92it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.90it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.90it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.93it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.93it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.92it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.90it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.11it/s]


Mean loss = 0.7443450689315796, mean accuracy = 0.7291666666666666



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.17it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.01it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.99it/s][A
  4%|▍         | 4/103 [00:00<00:19,  4.98it/s][A
  5%|▍         | 5/103 [00:01<00:19,  4.92it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.94it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.92it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.91it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.89it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.85it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.87it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.88it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.90it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.89it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.87it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.85it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.85it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.89it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 2
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:57,  1.31it/s][A
  1%|▏         | 2/155 [00:01<01:57,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.30it/s][A
  3%|▎         | 4/155 [00:03<01:56,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:55,  1.29it/s][A
  5%|▍         | 7/155 [00:05<01:54,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:50,  1.30it/s][A
  8%|▊         | 13/155 [00:10<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:48,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:47,  1.29it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.29it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.29it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5486995531666663
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.97it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.95it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.94it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.88it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.91it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.93it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.92it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.90it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.93it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.93it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.92it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.89it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.90it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.89it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.88it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.10it/s]


Mean loss = 0.3537146747112274, mean accuracy = 0.7569444444444444



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.14it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.00it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.96it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.94it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.86it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.88it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.88it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.90it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.88it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.89it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.91it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.87it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.87it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.86it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.88it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.88it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.88it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.85it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 0
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:58,  1.30it/s][A
  1%|▏         | 2/155 [00:01<01:55,  1.32it/s][A
  2%|▏         | 3/155 [00:02<01:54,  1.33it/s][A
  3%|▎         | 4/155 [00:03<01:53,  1.34it/s][A
  3%|▎         | 5/155 [00:03<01:52,  1.33it/s][A
  4%|▍         | 6/155 [00:04<01:51,  1.33it/s][A
  5%|▍         | 7/155 [00:05<01:51,  1.33it/s][A
  5%|▌         | 8/155 [00:06<01:50,  1.33it/s][A
  6%|▌         | 9/155 [00:06<01:49,  1.33it/s][A
  6%|▋         | 10/155 [00:07<01:48,  1.33it/s][A
  7%|▋         | 11/155 [00:08<01:47,  1.33it/s][A
  8%|▊         | 12/155 [00:09<01:47,  1.33it/s][A
  8%|▊         | 13/155 [00:09<01:46,  1.33it/s][A
  9%|▉         | 14/155 [00:10<01:45,  1.33it/s][A
 10%|▉         | 15/155 [00:11<01:45,  1.33it/s][A
 10%|█         | 16/155 [00:12<01:44,  1.33it/s][A
 11%|█         | 17/155 [00:12<01:43,  1.33it/s][A
 12%|█▏        | 18/155 [00:13<01:43,  1.33it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6814365571545017
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.01it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.02it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  4.00it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.91it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.94it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.95it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.96it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.93it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.94it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.96it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.95it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.92it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.94it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.95it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.94it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.92it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.14it/s]


Mean loss = 0.7351433634757996, mean accuracy = 0.6458333333333334



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.19it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.99it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.95it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.95it/s][A
  5%|▍         | 5/103 [00:01<00:19,  4.93it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.96it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.95it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.93it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.92it/s][A
 10%|▉         | 10/103 [00:02<00:18,  4.93it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.96it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.96it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.91it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.92it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.93it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.95it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.95it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.93it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 1
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:54,  1.35it/s][A
  1%|▏         | 2/155 [00:01<01:55,  1.32it/s][A
  2%|▏         | 3/155 [00:02<01:55,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.31it/s][A
  3%|▎         | 5/155 [00:03<01:54,  1.31it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.31it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6331237581468397
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.00it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.96it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.95it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.90it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.92it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.91it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.90it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.89it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.92it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.92it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.91it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.89it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.90it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.92it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.90it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.88it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.10it/s]


Mean loss = 0.665511965751648, mean accuracy = 0.6527777777777778



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:20,  5.09it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.93it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.91it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.94it/s][A
  5%|▍         | 5/103 [00:01<00:19,  4.93it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.94it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.92it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.90it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.91it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.89it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.87it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.86it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.86it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.90it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.91it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.91it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.89it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.86it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 2
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:54,  1.34it/s][A
  1%|▏         | 2/155 [00:01<01:55,  1.32it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.31it/s][A
  3%|▎         | 5/155 [00:03<01:54,  1.31it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:47,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.29it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5941859416423305
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.91it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.90it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.89it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.87it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.88it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.88it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.87it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.87it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.90it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.90it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.88it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.88it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.92it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.88it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.88it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.09it/s]


Mean loss = 0.638290286064148, mean accuracy = 0.7013888888888888



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:20,  5.10it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.94it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.87it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.90it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.88it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.88it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.89it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.87it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.86it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.86it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.85it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.87it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.88it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.85it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.84it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.86it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.88it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.89it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 3
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:56,  1.32it/s][A
  1%|▏         | 2/155 [00:01<01:57,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.30it/s][A
  3%|▎         | 4/155 [00:03<01:56,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5724419843765998
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.98it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.94it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.96it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.89it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.90it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.89it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.90it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.89it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.89it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.91it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.90it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.88it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.88it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.92it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.90it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.88it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.09it/s]


Mean loss = 0.44306132197380066, mean accuracy = 0.6666666666666666



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:20,  5.10it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.93it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.86it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.86it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.85it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.86it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.86it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.86it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.85it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.84it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.86it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.88it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.89it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.88it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.85it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.84it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.86it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.87it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 4
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:55,  1.34it/s][A
  1%|▏         | 2/155 [00:01<01:57,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:57,  1.29it/s][A
  3%|▎         | 4/155 [00:03<01:56,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:56,  1.29it/s][A
  4%|▍         | 6/155 [00:04<01:55,  1.29it/s][A
  5%|▍         | 7/155 [00:05<01:54,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.29it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:10<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5669480400700723
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.02it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.01it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.97it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.90it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.89it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.91it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.90it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.86it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.90it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.90it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.88it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.88it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.89it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.88it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.87it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.10it/s]


Mean loss = 0.472719669342041, mean accuracy = 0.6666666666666666



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.16it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.98it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.88it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.91it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.89it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.90it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.90it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.88it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.85it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.86it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.89it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.90it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.89it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.89it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.87it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.87it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.88it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.87it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 0
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:56,  1.32it/s][A
  1%|▏         | 2/155 [00:01<01:55,  1.32it/s][A
  2%|▏         | 3/155 [00:02<01:54,  1.32it/s][A
  3%|▎         | 4/155 [00:03<01:53,  1.33it/s][A
  3%|▎         | 5/155 [00:03<01:52,  1.33it/s][A
  4%|▍         | 6/155 [00:04<01:51,  1.33it/s][A
  5%|▍         | 7/155 [00:05<01:51,  1.33it/s][A
  5%|▌         | 8/155 [00:06<01:50,  1.33it/s][A
  6%|▌         | 9/155 [00:06<01:49,  1.33it/s][A
  6%|▋         | 10/155 [00:07<01:48,  1.33it/s][A
  7%|▋         | 11/155 [00:08<01:48,  1.33it/s][A
  8%|▊         | 12/155 [00:09<01:47,  1.33it/s][A
  8%|▊         | 13/155 [00:09<01:46,  1.33it/s][A
  9%|▉         | 14/155 [00:10<01:46,  1.33it/s][A
 10%|▉         | 15/155 [00:11<01:45,  1.33it/s][A
 10%|█         | 16/155 [00:12<01:44,  1.33it/s][A
 11%|█         | 17/155 [00:12<01:44,  1.32it/s][A
 12%|█▏        | 18/155 [00:13<01:43,  1.32it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6790659396879135
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.03it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.01it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  4.01it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.93it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.96it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.96it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.97it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.93it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.99it/s][A
 56%|█████▌    | 10/18 [00:02<00:01,  4.01it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.98it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.95it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.96it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.99it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.96it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.94it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.16it/s]


Mean loss = 0.7119530439376831, mean accuracy = 0.6666666666666666



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.25it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.05it/s][A
  3%|▎         | 3/103 [00:00<00:19,  5.00it/s][A
  4%|▍         | 4/103 [00:00<00:19,  5.00it/s][A
  5%|▍         | 5/103 [00:01<00:19,  4.96it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.98it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.96it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.96it/s][A
  9%|▊         | 9/103 [00:01<00:18,  4.96it/s][A
 10%|▉         | 10/103 [00:02<00:18,  4.95it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.95it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.96it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.98it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.94it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.94it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.94it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.96it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.95it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 1
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:55,  1.33it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.32it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.31it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:52,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:51,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:50,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:48,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6258625638100409
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.97it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.93it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.94it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.89it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.90it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.93it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.91it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.89it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.94it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.92it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.92it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.90it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.90it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.92it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.91it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.11it/s]


Mean loss = 0.5730161666870117, mean accuracy = 0.7222222222222222



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.12it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.93it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.92it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.92it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.89it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.94it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.93it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.93it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.93it/s][A
 10%|▉         | 10/103 [00:02<00:18,  4.92it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.93it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.89it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.87it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.84it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.86it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.88it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.89it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.87it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 2
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:54,  1.34it/s][A
  1%|▏         | 2/155 [00:01<01:55,  1.32it/s][A
  2%|▏         | 3/155 [00:02<01:55,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.29it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.29it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5392707660313576
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.02it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.01it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.98it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.88it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.90it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.90it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.91it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.89it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.92it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.92it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.92it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.90it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.91it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.92it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.91it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.90it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.11it/s]


Mean loss = 0.6098636984825134, mean accuracy = 0.7291666666666666



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:20,  5.10it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.94it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.89it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.90it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.87it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.89it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.91it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.89it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.91it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.89it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.87it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.87it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.88it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.90it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.90it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.89it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.87it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.85it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 3
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:57,  1.31it/s][A
  1%|▏         | 2/155 [00:01<01:57,  1.30it/s][A
  2%|▏         | 3/155 [00:02<01:57,  1.30it/s][A
  3%|▎         | 4/155 [00:03<01:56,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.29it/s][A
  4%|▍         | 6/155 [00:04<01:55,  1.29it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.29it/s][A
  7%|▋         | 11/155 [00:08<01:51,  1.29it/s][A
  8%|▊         | 12/155 [00:09<01:50,  1.29it/s][A
  8%|▊         | 13/155 [00:10<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:49,  1.29it/s][A
 10%|▉         | 15/155 [00:11<01:48,  1.29it/s][A
 10%|█         | 16/155 [00:12<01:47,  1.29it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.29it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.4668250419439808
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.98it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.95it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.94it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.89it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.92it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.91it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.93it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.91it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.93it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.92it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.91it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.90it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.91it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.90it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.89it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.11it/s]


Mean loss = 0.13651679456233978, mean accuracy = 0.75



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.13it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.00it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.92it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.91it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.87it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.89it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.91it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.91it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.92it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.89it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.87it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.86it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.86it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.88it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.91it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.91it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.89it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.87it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 4
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:56,  1.33it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.31it/s][A
  3%|▎         | 5/155 [00:03<01:54,  1.31it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:53,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.29it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:50,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.29it/s][A
 10%|▉         | 15/155 [00:11<01:48,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:47,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.29it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.36493218041235403
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.90it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.90it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.90it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.87it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.90it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.89it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.87it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.88it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.92it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.91it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.89it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.89it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.91it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.89it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.89it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.08it/s]


Mean loss = 0.11713530868291855, mean accuracy = 0.7361111111111112



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.16it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.96it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.87it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.90it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.88it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.90it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.88it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.85it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.85it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.87it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.86it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.89it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.87it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.87it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.85it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.87it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.88it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.88it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 0
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:57,  1.31it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:54,  1.32it/s][A
  3%|▎         | 4/155 [00:03<01:53,  1.33it/s][A
  3%|▎         | 5/155 [00:03<01:52,  1.33it/s][A
  4%|▍         | 6/155 [00:04<01:51,  1.33it/s][A
  5%|▍         | 7/155 [00:05<01:51,  1.33it/s][A
  5%|▌         | 8/155 [00:06<01:50,  1.33it/s][A
  6%|▌         | 9/155 [00:06<01:49,  1.33it/s][A
  6%|▋         | 10/155 [00:07<01:49,  1.33it/s][A
  7%|▋         | 11/155 [00:08<01:48,  1.33it/s][A
  8%|▊         | 12/155 [00:09<01:47,  1.33it/s][A
  8%|▊         | 13/155 [00:09<01:46,  1.33it/s][A
  9%|▉         | 14/155 [00:10<01:45,  1.33it/s][A
 10%|▉         | 15/155 [00:11<01:45,  1.33it/s][A
 10%|█         | 16/155 [00:12<01:44,  1.33it/s][A
 11%|█         | 17/155 [00:12<01:44,  1.32it/s][A
 12%|█▏        | 18/155 [00:13<01:43,  1.32it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.6612190227354726
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  4.06it/s][A
 11%|█         | 2/18 [00:00<00:03,  4.03it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  4.00it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.93it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.95it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.98it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.97it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.95it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.96it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.96it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.97it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.95it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.96it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.95it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.96it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.93it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.15it/s]


Mean loss = 0.43166637420654297, mean accuracy = 0.7013888888888888



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.21it/s][A
  2%|▏         | 2/103 [00:00<00:19,  5.10it/s][A
  3%|▎         | 3/103 [00:00<00:19,  5.01it/s][A
  4%|▍         | 4/103 [00:00<00:19,  4.98it/s][A
  5%|▍         | 5/103 [00:01<00:19,  4.93it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.97it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.99it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.94it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.94it/s][A
 10%|▉         | 10/103 [00:02<00:18,  4.91it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.94it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.96it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.95it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.93it/s][A
 15%|█▍        | 15/103 [00:03<00:17,  4.91it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.92it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.95it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.94it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 1
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:56,  1.32it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:55,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:54,  1.31it/s][A
  4%|▍         | 6/155 [00:04<01:53,  1.31it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.31it/s][A
  5%|▌         | 8/155 [00:06<01:52,  1.31it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.31it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:48,  1.31it/s][A
  9%|▉         | 14/155 [00:10<01:47,  1.31it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.31it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.31it/s][A
 11%|█         | 17/155 [00:13<01:45,  1.31it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5847603361452779
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.96it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.95it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.97it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.91it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.93it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.94it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.93it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.91it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.92it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.91it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.93it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.91it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.92it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.93it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.92it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.91it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.11it/s]


Mean loss = 0.6008778214454651, mean accuracy = 0.7222222222222222



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:20,  5.09it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.97it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.90it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.90it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.88it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.91it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.94it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.92it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.92it/s][A
 10%|▉         | 10/103 [00:02<00:18,  4.91it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.91it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.91it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.87it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.86it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.87it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.90it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.93it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.92it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 2
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:57,  1.32it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.31it/s][A
  3%|▎         | 4/155 [00:03<01:56,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:52,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:50,  1.30it/s][A
  8%|▊         | 13/155 [00:10<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:49,  1.29it/s][A
 10%|▉         | 15/155 [00:11<01:48,  1.29it/s][A
 10%|█         | 16/155 [00:12<01:47,  1.29it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.29it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.5087394648982633
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.94it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.94it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.94it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.89it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.93it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.93it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.95it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.92it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.92it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.92it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.91it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.89it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.93it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.92it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.93it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.89it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.11it/s]


Mean loss = 0.4410969913005829, mean accuracy = 0.7152777777777778



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.23it/s][A
  2%|▏         | 2/103 [00:00<00:20,  5.03it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.97it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.95it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.89it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.90it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.90it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.89it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.89it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.88it/s][A
 11%|█         | 11/103 [00:02<00:18,  4.88it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.87it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.86it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.87it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.89it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.92it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.93it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.92it/s][A
 18%|█▊        | 19/103 [00:0

Epoch 3
Training



  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:56,  1.32it/s][A
  1%|▏         | 2/155 [00:01<01:56,  1.31it/s][A
  2%|▏         | 3/155 [00:02<01:56,  1.30it/s][A
  3%|▎         | 4/155 [00:03<01:56,  1.30it/s][A
  3%|▎         | 5/155 [00:03<01:55,  1.30it/s][A
  4%|▍         | 6/155 [00:04<01:54,  1.30it/s][A
  5%|▍         | 7/155 [00:05<01:53,  1.30it/s][A
  5%|▌         | 8/155 [00:06<01:52,  1.30it/s][A
  6%|▌         | 9/155 [00:06<01:52,  1.30it/s][A
  6%|▋         | 10/155 [00:07<01:51,  1.30it/s][A
  7%|▋         | 11/155 [00:08<01:50,  1.30it/s][A
  8%|▊         | 12/155 [00:09<01:49,  1.30it/s][A
  8%|▊         | 13/155 [00:09<01:49,  1.30it/s][A
  9%|▉         | 14/155 [00:10<01:48,  1.30it/s][A
 10%|▉         | 15/155 [00:11<01:47,  1.30it/s][A
 10%|█         | 16/155 [00:12<01:46,  1.30it/s][A
 11%|█         | 17/155 [00:13<01:46,  1.30it/s][A
 12%|█▏        | 18/155 [00:13<01:45,  1.30it/s][A
 12%|█▏        | 19/155 [00:1

Mean loss = 0.4662489212328388
Evaluation



  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:00<00:04,  3.88it/s][A
 11%|█         | 2/18 [00:00<00:04,  3.93it/s][A
 17%|█▋        | 3/18 [00:00<00:03,  3.91it/s][A
 22%|██▏       | 4/18 [00:01<00:03,  3.87it/s][A
 28%|██▊       | 5/18 [00:01<00:03,  3.91it/s][A
 33%|███▎      | 6/18 [00:01<00:03,  3.90it/s][A
 39%|███▉      | 7/18 [00:01<00:02,  3.89it/s][A
 44%|████▍     | 8/18 [00:02<00:02,  3.88it/s][A
 50%|█████     | 9/18 [00:02<00:02,  3.91it/s][A
 56%|█████▌    | 10/18 [00:02<00:02,  3.90it/s][A
 61%|██████    | 11/18 [00:02<00:01,  3.89it/s][A
 67%|██████▋   | 12/18 [00:03<00:01,  3.90it/s][A
 72%|███████▏  | 13/18 [00:03<00:01,  3.90it/s][A
 78%|███████▊  | 14/18 [00:03<00:01,  3.90it/s][A
 83%|████████▎ | 15/18 [00:03<00:00,  3.89it/s][A
 89%|████████▉ | 16/18 [00:04<00:00,  3.89it/s][A
100%|██████████| 18/18 [00:04<00:00,  4.09it/s]


Mean loss = 0.3864991366863251, mean accuracy = 0.7291666666666666



  0%|          | 0/103 [00:00<?, ?it/s][A
  1%|          | 1/103 [00:00<00:19,  5.15it/s][A
  2%|▏         | 2/103 [00:00<00:20,  4.99it/s][A
  3%|▎         | 3/103 [00:00<00:20,  4.95it/s][A
  4%|▍         | 4/103 [00:00<00:20,  4.94it/s][A
  5%|▍         | 5/103 [00:01<00:20,  4.86it/s][A
  6%|▌         | 6/103 [00:01<00:19,  4.88it/s][A
  7%|▋         | 7/103 [00:01<00:19,  4.86it/s][A
  8%|▊         | 8/103 [00:01<00:19,  4.88it/s][A
  9%|▊         | 9/103 [00:01<00:19,  4.89it/s][A
 10%|▉         | 10/103 [00:02<00:19,  4.85it/s][A
 11%|█         | 11/103 [00:02<00:19,  4.84it/s][A
 12%|█▏        | 12/103 [00:02<00:18,  4.86it/s][A
 13%|█▎        | 13/103 [00:02<00:18,  4.90it/s][A
 14%|█▎        | 14/103 [00:02<00:18,  4.91it/s][A
 15%|█▍        | 15/103 [00:03<00:18,  4.88it/s][A
 16%|█▌        | 16/103 [00:03<00:17,  4.87it/s][A
 17%|█▋        | 17/103 [00:03<00:17,  4.83it/s][A
 17%|█▋        | 18/103 [00:03<00:17,  4.87it/s][A
 18%|█▊        | 19/103 [00:0

# Results

In [39]:
from mlflow.tracking import MlflowClient
from pprint import pprint

In [40]:
client = MlflowClient(MLFLOW_SERVER_URL)
experiment = client.get_experiment_by_name(experiment_name)
run_info = client.search_runs(experiment.experiment_id)[0]

print(experiment)
print(run_info)

<Experiment: artifact_location='mlflow-artifacts:/872536664353035649', creation_time=1710009301648, experiment_id='872536664353035649', last_update_time=1710009301648, lifecycle_stage='active', name='experiment', tags={}>
<Run: data=<RunData: metrics={'eval accuracy': 0.7291666666666666, 'test accuracy': 0.5768608414239482}, params={'epoch': '4', 'eps': '1.2068824117439967e-07', 'lr': '6.74463305911642e-06'}, tags={'mlflow.log-model.history': '[{"run_id": "11b030207eee423dbe205d4675520ee3", '
                             '"artifact_path": "model", "utc_time_created": '
                             '"2024-03-09 19:29:46.925828", "flavors": '
                             '{"transformers": {"transformers_version": '
                             '"4.38.2", "code": null, "task": '
                             '"text-classification", "instance_type": '
                             '"TextClassificationPipeline", "framework": "pt", '
                             '"pipeline_model_type": '
     

In [48]:
reg_model_name = "bert-fine-tune"

client.create_registered_model(reg_model_name)

result = client.create_model_version(
    name=reg_model_name,
    source=f"{run_info.info.artifact_uri}/model",
    run_id=run_info.info.run_id
)

print(result)

2024/03/09 19:33:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bert-fine-tune, version 1


<ModelVersion: aliases=[], creation_timestamp=1710012780021, current_stage='None', description='', last_updated_timestamp=1710012780021, name='bert-fine-tune', run_id='11b030207eee423dbe205d4675520ee3', run_link='', source='mlflow-artifacts:/872536664353035649/11b030207eee423dbe205d4675520ee3/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>


Let's choose the best model based on the best validation sample metrics

In [50]:
run_infos = client.search_runs(experiment.experiment_id)
best_metric = 0
best_run_id = None
best_run_artifact_uri = None
for i in run_infos:
  staging_metrics = client.get_run(i.info.run_id).data.metrics['eval accuracy']
  if staging_metrics > best_metric:
    best_metric = staging_metrics
    best_run_id = i.info.run_id
    best_run_artifact_uri = i.info.artifact_uri

We put the best model in Production

In [53]:
result = client.create_model_version(
    name=reg_model_name,
    source=f"{best_run_artifact_uri}/model",
    run_id=best_run_id
)
client.transition_model_version_stage(
    name=reg_model_name,
    version=result.version,
    stage="Production"
)

2024/03/09 19:39:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bert-fine-tune, version 2


<ModelVersion: aliases=[], creation_timestamp=1710013182557, current_stage='None', description='', last_updated_timestamp=1710013182557, name='bert-fine-tune', run_id='1a78d29dd3c84b578a106e7a05193a8a', run_link='', source='mlflow-artifacts:/872536664353035649/1a78d29dd3c84b578a106e7a05193a8a/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>


In [54]:
current_prod = [v for v in client.search_model_versions(f"name='{reg_model_name}'") if v.current_stage == 'Production'][-1]

In [55]:
current_prod

<ModelVersion: aliases=[], creation_timestamp=1710013182557, current_stage='Production', description='', last_updated_timestamp=1710013182576, name='bert-fine-tune', run_id='1a78d29dd3c84b578a106e7a05193a8a', run_link='', source='mlflow-artifacts:/872536664353035649/1a78d29dd3c84b578a106e7a05193a8a/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>


As you can see from the test sample, the model turned out to be overtrained. It was necessary to take a larger validation sample.

In [56]:
test_metric = client.get_run(best_run_id).data.metrics['test accuracy']

In [57]:
test_metric

0.5635113268608414


Save and check loading the model

In [59]:
import pickle

loaded_pipeline = mlflow.transformers.load_model(f"{best_run_artifact_uri}/model", return_type="pipeline")


with open('model.pickle', 'wb') as f:
  pickle.dump(loaded_pipeline, f)

with open('model.pickle', 'rb') as f:
  loaded_pipeline_loaded = pickle.load(f)

Downloading artifacts:   0%|          | 0/14 [00:00<?, ?it/s]

2024/03/09 19:44:27 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Just checking, text have no senese

In [60]:
loaded_pipeline_loaded.predict(["Все будет хорошо"])

[{'label': 'LABEL_0', 'score': 0.6430174112319946}]