In [1]:
!pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.0.4.tar.gz (407 kB)
[K     |████████████████████████████████| 407 kB 8.0 MB/s 
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.0.4-py3-none-any.whl size=616603 sha256=e7f95147b4904d44b3ee9cf88f971cb1b8798c08fae30614006a36196f025eee
  Stored in directory: /root/.cache/pip/wheels/18/a6/a4/ca18c3051fcead866fe7b85700ee2240d883562a1bc70ce421
Successfully built torch-geometric
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.0.4


In [2]:
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.11.0+cu113
[K     |████████████████████████████████| 7.9 MB 8.9 MB/s 
[K     |████████████████████████████████| 3.5 MB 8.2 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [3]:
import os
import torch
import matplotlib.pyplot as plt

from torch_geometric.nn import GCNConv
from torch_geometric.datasets import TUDataset
from torch_geometric.transforms import NormalizeFeatures
import torch_geometric.transforms as T

transform = T.Compose([T.GCNNorm(), T.NormalizeFeatures()])

dataset = TUDataset(root="data/MUTAG", name="MUTAG", transform=None)
print(dataset)
data = dataset[0]
print(data)

Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip


MUTAG(188)
Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])


Extracting data/MUTAG/MUTAG/MUTAG.zip
Processing...
Done!


In [4]:
print(f'Number of classes: {dataset.num_classes}')

Number of classes: 2


In [5]:
import numpy as np
torch.manual_seed(12345)
dataset = dataset.shuffle()

split = int(0.8 * len(dataset))
train_dataset, test_dataset = dataset[:split], dataset[split:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 150
Number of test graphs: 38


In [6]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2636], x=[1188, 7], edge_attr=[2636, 4], y=[64], batch=[1188], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2506], x=[1139, 7], edge_attr=[2506, 4], y=[64], batch=[1139], ptr=[65])

Step 3:
Number of graphs in the current batch: 22
DataBatch(edge_index=[2, 852], x=[387, 7], edge_attr=[852, 4], y=[22], batch=[387], ptr=[23])



In [7]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(7, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [8]:
from IPython.display import Javascript
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

model = GCN(hidden_channels=64)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 31):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 002, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 003, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 004, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 005, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 006, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 007, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 008, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 009, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 010, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 011, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 012, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 013, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 014, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 015, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 016, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 017, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 018, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 019, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 020, Train Acc: 0.6467, Test Acc: 0.7368
Epoch: 021, Train Acc: 0.6467, Test Acc:

In [9]:
for data in test_loader:
  out = model(data.x, data.edge_index, data.batch) 
  print(F.softmax(out[0], dim=0)[0] > 2) 
  pred = out.argmax(dim=1) 
  print(pred[0].item()) 
  print(len(pred))
  break

tensor(False)
1
38


In [10]:
import os.path as osp
from re import sub
from sklearn import neighbors
import tqdm
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.datasets import TUDataset
from torch_geometric.nn import GCNConv, GNNExplainer

from torch_geometric.loader import DataLoader

from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.utils import subgraph
import numpy as np

import networkx as nx
from torch_geometric import utils

def expand(starts, ends, target, max_depth=1, prev=[]):
    node_neighbors = np.array([ends[idx] for idx, node in enumerate(starts) if node == target and ends[idx] != target])
    prev.append(target)
    if max_depth > 1:
        for n in node_neighbors:
            node_neighbors = np.concatenate((node_neighbors, expand(starts, ends, target=n, max_depth=max_depth-1, prev=prev)), axis=0)
    indices = np.unique(node_neighbors, return_index=True)[1]
    return np.array([node_neighbors[i] for i in sorted(indices)])


def process_one_graph(data):
    used = []
    num_nodes = data.x.shape[0]
    subgraph_sizes = [int(num_nodes/n) for n in range(2, 5)]
    start_nodes, end_nodes = np.array(data.edge_index)
    sub_graphs = []
    # make a grow from each node
    for target_idx in range(num_nodes):
        nodes_to_keep = expand(starts=start_nodes, ends=end_nodes, target=target_idx, max_depth=3, prev=[])
        if nodes_to_keep.shape[0] == 0:
            continue
        for size in subgraph_sizes:
            # select the grows based on size
            _subset = nodes_to_keep[:size]
            # remove repetitive stuff
            if set(_subset) in used:
                continue
            else:
                used.append(set(_subset))
            _subset = torch.from_numpy(np.array(_subset))
            sub_graphs.append((target_idx, data.subgraph(subset=_subset)))
    return sub_graphs



dataset = 'MUTAG'
path = osp.join(osp.dirname(osp.realpath("__file__")), '..', 'data', 'TUDataset')
transform = T.Compose([T.GCNNorm(), T.NormalizeFeatures()])
dataset = TUDataset(path, dataset, transform=None)

pred_list = []
all_subgraphs = []
for idx, data in enumerate(tqdm.tqdm(dataset)):
    out = model(data.x, data.edge_index, data.batch)  
    pred = out.argmax(dim=1) 
    pred_list.append(pred)

    subgraphs = process_one_graph(data)
    all_subgraphs.append(subgraphs)
    

# for target_idx, graph in tqdm.tqdm(all_subgraphs):
#     # nx.draw_networkx(utils.to_networkx(graph, remove_self_loops=True))
#     # plt.show()


Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Extracting /data/TUDataset/MUTAG/MUTAG.zip
Processing...
Done!
100%|██████████| 188/188 [00:09<00:00, 20.72it/s]


In [11]:
print(type(all_subgraphs))
for data in all_subgraphs:
  print(data[1])
  break



<class 'list'>
(0, Data(edge_index=[2, 8], x=[5, 7], edge_attr=[8, 4], y=[1]))


In [12]:
print(type(test_dataset))
for test_data in test_dataset:
  print(test_data)
  break

<class 'torch_geometric.datasets.tu_dataset.TUDataset'>
Data(edge_index=[2, 50], x=[22, 7], edge_attr=[50, 4], y=[1])


In [21]:
min_sufficient_explanation = [] #list of lists, each inner list correspond to one full graph in TUDataset Protein,
#containing each full graph's MSE's
threshold = 0.58
for idx, data_collection in enumerate(tqdm.tqdm(all_subgraphs)):
  sub_mse = [] #sub_mse for this collection, correspond to one original full graph
  for data in data_collection:
    out = model(data[1].x, data[1].edge_index, data[1].batch) 
    #use confidence 
    
    confidence = F.softmax(out[0], dim=0)
    pred = out.argmax(dim=1)
    print(confidence)
    #print((confidence[pred.item()] >= threshold).item())
    
      
    # if the subgraph's predicted label matches the original full graph's predicted, add to its min_suff_expl list
    if torch.eq(pred, pred_list[idx]).item() and (confidence[pred.item()] >= threshold).item():
      sub_mse.append(data)
  
  min_sufficient_explanation.append(sub_mse)

  

  1%|          | 1/188 [00:00<00:23,  8.04it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

  2%|▏         | 3/188 [00:00<00:18, 10.06it/s]

tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

  2%|▏         | 4/188 [00:00<00:21,  8.70it/s]

tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4214, 0.5786], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackw

  4%|▎         | 7/188 [00:00<00:21,  8.44it/s]

tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4214, 0.5786], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4213, 0.5787], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4212, 0.5788], grad_fn=<SoftmaxBackward0>)
tensor([0.4195, 0.5805], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackw

  4%|▍         | 8/188 [00:00<00:22,  8.13it/s]

tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

  5%|▌         | 10/188 [00:01<00:20,  8.56it/s]

tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4172, 0.5828], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

  6%|▌         | 11/188 [00:01<00:21,  8.30it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

  7%|▋         | 13/188 [00:01<00:24,  7.10it/s]

tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4181, 0.5819], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

  8%|▊         | 15/188 [00:01<00:22,  7.66it/s]

tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4215, 0.5785], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackw

  9%|▊         | 16/188 [00:02<00:23,  7.39it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4181, 0.5819], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 10%|▉         | 18/188 [00:02<00:20,  8.15it/s]

tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4246, 0.5754], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 11%|█         | 20/188 [00:02<00:21,  7.90it/s]

tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 11%|█         | 21/188 [00:02<00:20,  7.99it/s]

tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 12%|█▏        | 22/188 [00:02<00:21,  7.81it/s]


tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4201, 0.5799], grad_fn=<SoftmaxBackward0>)
tensor([0.4227, 0.5773], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBack

 12%|█▏        | 23/188 [00:02<00:23,  6.96it/s]

tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4213, 0.5787], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4225, 0.5775], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 13%|█▎        | 25/188 [00:03<00:25,  6.36it/s]

tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 15%|█▍        | 28/188 [00:03<00:23,  6.91it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackw

 15%|█▌        | 29/188 [00:03<00:23,  6.68it/s]

tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4225, 0.5775], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4225, 0.5775], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackw

 16%|█▌        | 30/188 [00:04<00:24,  6.38it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 17%|█▋        | 32/188 [00:04<00:23,  6.72it/s]

tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4212, 0.5788], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4239, 0.5761], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackw

 18%|█▊        | 33/188 [00:04<00:24,  6.35it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4253, 0.5747], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4290, 0.5710], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4222, 0.5778], grad_fn=<SoftmaxBackward0>)
tensor([0.4317, 0.5683], grad_fn=<SoftmaxBackward0>)
tensor([0.4297, 0.5703], grad_fn=<SoftmaxBackward0>)
tensor([0.4378, 0.5622], grad_fn=<SoftmaxBackward0>)
tensor([0.4384, 0.5616], grad_fn=<SoftmaxBackward0>)
tensor([0.4366, 0.5634], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackw

 19%|█▊        | 35/188 [00:04<00:21,  7.10it/s]

tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackw

 20%|█▉        | 37/188 [00:04<00:19,  7.69it/s]

tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackw

 20%|██        | 38/188 [00:05<00:19,  7.87it/s]

tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 21%|██▏       | 40/188 [00:05<00:17,  8.56it/s]

tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 22%|██▏       | 42/188 [00:05<00:17,  8.26it/s]

tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4205, 0.5795], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4210, 0.5790], grad_fn=<SoftmaxBackward0>)
tensor([0.4253, 0.5747], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4236, 0.5764], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackw

 23%|██▎       | 43/188 [00:05<00:19,  7.63it/s]

tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 23%|██▎       | 44/188 [00:05<00:18,  7.61it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackw

 24%|██▍       | 46/188 [00:06<00:18,  7.53it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackw

 26%|██▌       | 48/188 [00:06<00:18,  7.46it/s]

tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 26%|██▌       | 49/188 [00:06<00:19,  7.24it/s]

tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4222, 0.5778], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackw

 27%|██▋       | 51/188 [00:06<00:19,  6.87it/s]

tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4202, 0.5798], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4202, 0.5798], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 28%|██▊       | 52/188 [00:07<00:21,  6.27it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4209, 0.5791], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4215, 0.5785], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackw

 28%|██▊       | 53/188 [00:07<00:20,  6.69it/s]

tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4195, 0.5805], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackw

 29%|██▉       | 55/188 [00:07<00:18,  7.31it/s]

tensor([0.4240, 0.5760], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackward0>)
tensor([0.4227, 0.5773], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4172, 0.5828], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 30%|███       | 57/188 [00:07<00:17,  7.46it/s]

tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4207, 0.5793], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackw

 31%|███       | 58/188 [00:07<00:19,  6.64it/s]

tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 31%|███▏      | 59/188 [00:08<00:20,  6.32it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 32%|███▏      | 61/188 [00:08<00:18,  6.97it/s]

tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackw

 34%|███▎      | 63/188 [00:08<00:16,  7.79it/s]

tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4246, 0.5754], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4205, 0.5795], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackw

 34%|███▍      | 64/188 [00:08<00:16,  7.41it/s]

tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4209, 0.5791], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 35%|███▌      | 66/188 [00:08<00:14,  8.14it/s]

tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 36%|███▌      | 67/188 [00:09<00:15,  7.98it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4222, 0.5778], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 37%|███▋      | 69/188 [00:09<00:16,  7.38it/s]

tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4166, 0.5834], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackw

 37%|███▋      | 70/188 [00:09<00:15,  7.73it/s]

tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4209, 0.5791], grad_fn=<SoftmaxBackward0>)
tensor([0.4212, 0.5788], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4234, 0.5766], grad_fn=<SoftmaxBackward0>)
tensor([0.4212, 0.5788], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackw

 38%|███▊      | 72/188 [00:09<00:16,  6.95it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackw

 39%|███▉      | 73/188 [00:09<00:15,  7.26it/s]

tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 40%|███▉      | 75/188 [00:10<00:16,  6.97it/s]

tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4207, 0.5793], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackw

 41%|████      | 77/188 [00:10<00:13,  8.29it/s]

tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4246, 0.5754], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 42%|████▏     | 79/188 [00:10<00:13,  8.29it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4201, 0.5799], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 43%|████▎     | 80/188 [00:10<00:14,  7.64it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4166, 0.5834], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackw

 44%|████▎     | 82/188 [00:11<00:12,  8.17it/s]

tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackw

 44%|████▍     | 83/188 [00:11<00:12,  8.25it/s]

tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4207, 0.5793], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackw

 45%|████▌     | 85/188 [00:11<00:12,  8.38it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4172, 0.5828], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 46%|████▋     | 87/188 [00:11<00:12,  7.86it/s]


tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBack

 47%|████▋     | 89/188 [00:11<00:11,  8.89it/s]

tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackw

 48%|████▊     | 90/188 [00:12<00:12,  7.92it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4209, 0.5791], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackw

 48%|████▊     | 91/188 [00:12<00:13,  7.18it/s]

tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4195, 0.5805], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4207, 0.5793], grad_fn=<SoftmaxBackw

 49%|████▉     | 93/188 [00:12<00:14,  6.61it/s]

tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 50%|█████     | 94/188 [00:12<00:14,  6.48it/s]

tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 51%|█████     | 95/188 [00:12<00:13,  6.83it/s]

tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4214, 0.5786], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackw

 52%|█████▏    | 97/188 [00:13<00:13,  6.74it/s]

tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4210, 0.5790], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4202, 0.5798], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 53%|█████▎    | 99/188 [00:13<00:11,  7.58it/s]

tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackw

 53%|█████▎    | 100/188 [00:13<00:11,  7.73it/s]

tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackw

 54%|█████▍    | 102/188 [00:13<00:12,  6.99it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4210, 0.5790], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackw

 55%|█████▍    | 103/188 [00:13<00:12,  6.98it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackw

 55%|█████▌    | 104/188 [00:14<00:13,  6.24it/s]

tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4209, 0.5791], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackw

 56%|█████▌    | 105/188 [00:14<00:13,  6.35it/s]

tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackw

 57%|█████▋    | 107/188 [00:14<00:12,  6.25it/s]

tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 57%|█████▋    | 108/188 [00:14<00:12,  6.36it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 58%|█████▊    | 109/188 [00:14<00:13,  5.75it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4180, 0.5820], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 59%|█████▊    | 110/188 [00:15<00:12,  6.11it/s]

tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4215, 0.5785], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackw

 60%|█████▉    | 112/188 [00:15<00:10,  7.22it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackw

 61%|██████    | 114/188 [00:15<00:09,  8.07it/s]

tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4201, 0.5799], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 62%|██████▏   | 117/188 [00:15<00:08,  8.54it/s]

tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4246, 0.5754], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackw

 63%|██████▎   | 118/188 [00:16<00:09,  7.44it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4172, 0.5828], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackw

 63%|██████▎   | 119/188 [00:16<00:09,  7.49it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4215, 0.5785], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4225, 0.5775], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackw

 64%|██████▍   | 121/188 [00:16<00:08,  7.75it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackw

 65%|██████▌   | 123/188 [00:16<00:08,  7.55it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4172, 0.5828], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 66%|██████▋   | 125/188 [00:16<00:08,  7.71it/s]

tensor([0.4262, 0.5738], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4232, 0.5768], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4232, 0.5768], grad_fn=<SoftmaxBackward0>)
tensor([0.4262, 0.5738], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4246, 0.5754], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackw

 67%|██████▋   | 126/188 [00:17<00:08,  7.44it/s]

tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4227, 0.5773], grad_fn=<SoftmaxBackward0>)
tensor([0.4233, 0.5767], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4213, 0.5787], grad_fn=<SoftmaxBackw

 68%|██████▊   | 127/188 [00:17<00:08,  7.12it/s]

tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4207, 0.5793], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4224, 0.5776], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 69%|██████▊   | 129/188 [00:17<00:07,  7.79it/s]

tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4210, 0.5790], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackw

 70%|██████▉   | 131/188 [00:17<00:06,  8.29it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackw

 71%|███████   | 133/188 [00:17<00:06,  8.47it/s]

tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4209, 0.5791], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4235, 0.5765], grad_fn=<SoftmaxBackward0>)
tensor([0.4215, 0.5785], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4234, 0.5766], grad_fn=<SoftmaxBackward0>)
tensor([0.4214, 0.5786], grad_fn=<SoftmaxBackward0>)
tensor([0.4239, 0.5761], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackw

 71%|███████▏  | 134/188 [00:18<00:07,  7.33it/s]

tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackw

 72%|███████▏  | 136/188 [00:18<00:06,  7.59it/s]

tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4166, 0.5834], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackw

 73%|███████▎  | 137/188 [00:18<00:07,  7.17it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 73%|███████▎  | 138/188 [00:18<00:06,  7.55it/s]

tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4244, 0.5756], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackw

 74%|███████▍  | 140/188 [00:18<00:05,  8.36it/s]

tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackw

 76%|███████▌  | 142/188 [00:19<00:05,  8.36it/s]

tensor([0.4214, 0.5786], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4237, 0.5763], grad_fn=<SoftmaxBackward0>)
tensor([0.4240, 0.5760], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4202, 0.5798], grad_fn=<SoftmaxBackward0>)
tensor([0.4212, 0.5788], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackw

 77%|███████▋  | 145/188 [00:19<00:04,  9.12it/s]

tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackw

 78%|███████▊  | 147/188 [00:19<00:04,  8.91it/s]

tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4227, 0.5773], grad_fn=<SoftmaxBackward0>)
tensor([0.4274, 0.5726], grad_fn=<SoftmaxBackw

 79%|███████▊  | 148/188 [00:19<00:04,  8.52it/s]

tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackw

 80%|████████  | 151/188 [00:20<00:04,  8.95it/s]

tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4243, 0.5757], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4274, 0.5726], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4245, 0.5755], grad_fn=<SoftmaxBackward0>)
tensor([0.4286, 0.5714], grad_fn=<SoftmaxBackward0>)
tensor([0.4274, 0.5726], grad_fn=<SoftmaxBackward0>)
tensor([0.4224, 0.5776], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackw

 81%|████████  | 152/188 [00:20<00:04,  8.01it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4181, 0.5819], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackw

 81%|████████▏ | 153/188 [00:20<00:04,  7.88it/s]

tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackw

 83%|████████▎ | 156/188 [00:20<00:03,  8.86it/s]

tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4209, 0.5791], grad_fn=<SoftmaxBackward0>)
tensor([0.4246, 0.5754], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4233, 0.5767], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4225, 0.5775], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackw

 84%|████████▎ | 157/188 [00:20<00:03,  7.82it/s]

tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4213, 0.5787], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4172, 0.5828], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 84%|████████▍ | 158/188 [00:21<00:04,  7.28it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 85%|████████▍ | 159/188 [00:21<00:04,  6.95it/s]

tensor([0.4172, 0.5828], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4195, 0.5805], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4212, 0.5788], grad_fn=<SoftmaxBackw

 86%|████████▌ | 161/188 [00:21<00:03,  7.07it/s]

tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4201, 0.5799], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4190, 0.5810], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackw

 86%|████████▌ | 162/188 [00:21<00:03,  6.88it/s]

tensor([0.4239, 0.5761], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4233, 0.5767], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4236, 0.5764], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackw

 87%|████████▋ | 163/188 [00:21<00:03,  6.41it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 88%|████████▊ | 165/188 [00:22<00:03,  6.32it/s]

tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4166, 0.5834], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 88%|████████▊ | 166/188 [00:22<00:03,  6.23it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackw

 89%|████████▉ | 167/188 [00:22<00:03,  6.04it/s]

tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackw

 90%|█████████ | 170/188 [00:22<00:02,  7.69it/s]

tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4229, 0.5771], grad_fn=<SoftmaxBackward0>)
tensor([0.4273, 0.5727], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4247, 0.5753], grad_fn=<SoftmaxBackward0>)
tensor([0.4227, 0.5773], grad_fn=<SoftmaxBackward0>)
tensor([0.4249, 0.5751], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4202, 0.5798], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackw

 91%|█████████▏| 172/188 [00:23<00:01,  8.08it/s]

tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4222, 0.5778], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackward0>)
tensor([0.4181, 0.5819], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4173, 0.5827], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackw

 92%|█████████▏| 173/188 [00:23<00:01,  7.67it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4207, 0.5793], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4207, 0.5793], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackw

 93%|█████████▎| 174/188 [00:23<00:02,  6.99it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4181, 0.5819], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4181, 0.5819], grad_fn=<SoftmaxBackw

 93%|█████████▎| 175/188 [00:23<00:01,  6.72it/s]

tensor([0.4212, 0.5788], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4227, 0.5773], grad_fn=<SoftmaxBackward0>)
tensor([0.4244, 0.5756], grad_fn=<SoftmaxBackward0>)
tensor([0.4221, 0.5779], grad_fn=<SoftmaxBackward0>)
tensor([0.4232, 0.5768], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4224, 0.5776], grad_fn=<SoftmaxBackward0>)
tensor([0.4216, 0.5784], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackw

 94%|█████████▍| 177/188 [00:23<00:01,  7.55it/s]

tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4230, 0.5770], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4205, 0.5795], grad_fn=<SoftmaxBackward0>)
tensor([0.4215, 0.5785], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4218, 0.5782], grad_fn=<SoftmaxBackward0>)
tensor([0.4219, 0.5781], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4238, 0.5762], grad_fn=<SoftmaxBackw

 95%|█████████▌| 179/188 [00:23<00:01,  7.85it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4183, 0.5817], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4200, 0.5800], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4186, 0.5814], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4194, 0.5806], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4211, 0.5789], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4199, 0.5801], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackw

 96%|█████████▌| 180/188 [00:24<00:01,  6.73it/s]

tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

 97%|█████████▋| 182/188 [00:24<00:00,  7.62it/s]

tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackw

 97%|█████████▋| 183/188 [00:24<00:00,  6.92it/s]

tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4170, 0.5830], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4182, 0.5818], grad_fn=<SoftmaxBackward0>)
tensor([0.4191, 0.5809], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackw

 98%|█████████▊| 184/188 [00:24<00:00,  6.48it/s]

tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4184, 0.5816], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4196, 0.5804], grad_fn=<SoftmaxBackward0>)
tensor([0.4215, 0.5785], grad_fn=<SoftmaxBackward0>)
tensor([0.4228, 0.5772], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4225, 0.5775], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4210, 0.5790], grad_fn=<SoftmaxBackward0>)
tensor([0.4226, 0.5774], grad_fn=<SoftmaxBackward0>)
tensor([0.4223, 0.5777], grad_fn=<SoftmaxBackward0>)
tensor([0.4231, 0.5769], grad_fn=<SoftmaxBackward0>)
tensor([0.4248, 0.5752], grad_fn=<SoftmaxBackward0>)
tensor([0.4193, 0.5807], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4220, 0.5780], grad_fn=<SoftmaxBackw

 99%|█████████▉| 187/188 [00:25<00:00,  7.44it/s]

tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4197, 0.5803], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackward0>)
tensor([0.4192, 0.5808], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4165, 0.5835], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4167, 0.5833], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4169, 0.5831], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackward0>)
tensor([0.4171, 0.5829], grad_fn=<SoftmaxBackw

100%|██████████| 188/188 [00:25<00:00,  7.45it/s]

tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4187, 0.5813], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4185, 0.5815], grad_fn=<SoftmaxBackward0>)
tensor([0.4198, 0.5802], grad_fn=<SoftmaxBackward0>)
tensor([0.4204, 0.5796], grad_fn=<SoftmaxBackward0>)
tensor([0.4189, 0.5811], grad_fn=<SoftmaxBackward0>)
tensor([0.4203, 0.5797], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4195, 0.5805], grad_fn=<SoftmaxBackward0>)
tensor([0.4168, 0.5832], grad_fn=<SoftmaxBackward0>)
tensor([0.4188, 0.5812], grad_fn=<SoftmaxBackward0>)
tensor([0.4208, 0.5792], grad_fn=<SoftmaxBackward0>)
tensor([0.4206, 0.5794], grad_fn=<SoftmaxBackward0>)
tensor([0.4217, 0.5783], grad_fn=<SoftmaxBackw




In [22]:
for i in range(10):
  print(len(min_sufficient_explanation[i]))
  print(len(all_subgraphs[i]))
  print(len(min_sufficient_explanation[i]) < len(all_subgraphs[i]))
for data in all_subgraphs:
  print(data)
  break

for data in min_sufficient_explanation:
  print(len(data))
  break

#for example, the first three lines of outputs
#means 35 of the 43 subgraphs of the 0-th full graph
#are min sufficient explanations

35
43
True
18
32
True
21
32
True
38
44
True
16
28
True
33
56
True
30
37
True
35
46
True
8
29
True
35
43
True
[(0, Data(edge_index=[2, 14], x=[7, 7], edge_attr=[14, 4], y=[1])), (0, Data(edge_index=[2, 8], x=[5, 7], edge_attr=[8, 4], y=[1])), (0, Data(edge_index=[2, 6], x=[4, 7], edge_attr=[6, 4], y=[1])), (1, Data(edge_index=[2, 14], x=[7, 7], edge_attr=[14, 4], y=[1])), (1, Data(edge_index=[2, 8], x=[5, 7], edge_attr=[8, 4], y=[1])), (2, Data(edge_index=[2, 16], x=[8, 7], edge_attr=[16, 4], y=[1])), (2, Data(edge_index=[2, 6], x=[4, 7], edge_attr=[6, 4], y=[1])), (3, Data(edge_index=[2, 8], x=[5, 7], edge_attr=[8, 4], y=[1])), (3, Data(edge_index=[2, 2], x=[4, 7], edge_attr=[2, 4], y=[1])), (4, Data(edge_index=[2, 14], x=[8, 7], edge_attr=[14, 4], y=[1])), (4, Data(edge_index=[2, 8], x=[5, 7], edge_attr=[8, 4], y=[1])), (4, Data(edge_index=[2, 2], x=[4, 7], edge_attr=[2, 4], y=[1])), (5, Data(edge_index=[2, 6], x=[4, 7], edge_attr=[6, 4], y=[1])), (6, Data(edge_index=[2, 14], x=[8, 7]

In [15]:
#input: 
# pred_list: [tensor, tensor, tensor ...], len = len(TUDataset/Protein)
#   each element is a one-element tensor 
#   representing the predicted label of one graph in TUDataset/Protein
# all_subgraph: [[subgraph, subgraph..], [subgraph, subgraph..]...], len = len(TUDataset/Protein)
#   each element is a list of subgraphs obtained from one graph in TUDataset/Protein
#   each subgraph is a tuple in the form (k, Data(edge_index=[2, 24], x=[8, 0], y=[1]))
#   where k is the index of the node that this subgraph expanded from
#output:
# min_sufficient_explanation: [[subgraph, subgraph,..], [subgraph, subgraph]...]
#   len = len(TUDataset/Protein)
#   each element is a list of subgraphs which are the MSEs
#   each subgraph is a tuple in the form (k, Data(edge_index=[2, 24], x=[8, 0], y=[1]))
#   where k is the index of the node that this subgraph expanded from