# PyTorchでの学習・推論を高速化するコツ集
## MNISTでいろいろ試す
### 初期設定


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
# pytorch version 確認
torch.__version__  # 1.6.0+cu101


'1.6.0'

In [3]:
# GPU使用の確認
# Google Colaboratoryでは「ランタイム」→「ランタイムのタイプを変更」でGPUに

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)  # cuda(GPU)


cuda


In [4]:
# GPUの確認
!nvidia-smi

Mon Sep  7 10:35:30 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05    Driver Version: 450.51.05    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   29C    P0    22W / 300W |      2MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

### ネットワーク・モデルの設定

In [5]:
# 参考: https://github.com/pytorch/examples/blob/master/mnist/main.py

class Net(nn.Module):
    """Small CNN classifier for single-channel 28x28 images (MNIST).

    Layout: two 3x3 convolutions, a 2x2 max-pool, then two fully connected
    layers. ``forward`` returns log-probabilities over 10 classes.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Applied to the 4-D feature map after pooling: drops whole channels.
        self.dropout1 = nn.Dropout2d(0.25)
        # Applied to the 2-D (batch, features) output of fc1, so element-wise
        # nn.Dropout is the right module. Passing 2-D input to nn.Dropout2d is
        # deprecated in recent PyTorch releases and behaved element-wise anyway.
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12 (28 -> 26 -> 24 after the convs, 12 after the pool).
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Map a ``(batch, 1, 28, 28)`` tensor to ``(batch, 10)`` log-probabilities."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
        

### データセットと前処理の設定


In [6]:
# Preprocessing: convert each image to a tensor, then normalise it with the
# fixed mean/std pair below.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Training split (files are downloaded into the current directory if missing).
dataset1 = datasets.MNIST(root='.', train=True, download=True, transform=transform)

# Evaluation split; no download flag here, it reuses the files under the same root.
dataset2 = datasets.MNIST(root='.', train=False, transform=transform)


### 訓練と検証の関数作成

In [7]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch over ``train_loader``.

    Args:
        model: module whose forward pass returns log-probabilities
            (the loss used is ``F.nll_loss``).
        device: device the batches are moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches; must expose
            ``__len__`` and a ``dataset`` attribute (both used for logging).
        optimizer: optimizer that updates ``model``'s parameters.
        epoch: epoch number, used only in the progress message.

    Prints a progress line every 100 batches (including batch 0).
    """
    model.train()  # training mode (enables dropout)
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move the batch to the target device. non_blocking=True lets batches
        # that come from a pin_memory=True loader be copied asynchronously;
        # for pageable memory or a CPU device it is a harmless no-op.
        data = data.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass
        output = model(data)

        # Loss, back-propagation and parameter update
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


In [8]:
def test(model, device, test_loader):
    model.eval()  # 検証モードに
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            # データ取り出し
            data, target = data.to(device), target.to(device)
            output = model(data)
            
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

## 1. DataLoaderについて

In [9]:
# CPUのコア数を確認
import os
os.cpu_count()  # コア


8

In [10]:
# 関数化
import time

def MNIST_train(optimizer, model, device, train_loader, test_loader, epochs=1):
    """Train and evaluate ``model`` for ``epochs`` epochs and print the elapsed time.

    Args:
        optimizer: optimizer passed through to ``train``.
        model: model to train and evaluate.
        device: device passed through to ``train`` / ``test``.
        train_loader: loader passed to ``train``.
        test_loader: loader passed to ``test``.
        epochs: number of train+test rounds (default 1, the previously
            hard-coded value).

    Prints a banner followed by the elapsed wall-clock time in seconds.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()

    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)

    # Elapsed time
    print("=======かかった時間========")
    print(time.perf_counter() - start)


### 1.1.1 デフォルト

In [11]:
# ミニバッチのサイズ
mini_batch_size = 512

In [12]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [13]:
# デフォルト設定のDataLoaderの場合
train_loader_default = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size)
test_loader_default = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size)


In [14]:
MNIST_train(optimizer, model, device, train_loader_default, test_loader_default)


Test set: Average loss: 0.3339, Accuracy: 8959/10000 (90%)

14.73472261428833


### 1.1.2 DataLoaderの引数num_workersを設定

In [15]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [16]:
# データローダー
train_loader_nworker = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=os.cpu_count()) 
test_loader_nworker = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=os.cpu_count())

In [17]:
MNIST_train(optimizer, model, device, train_loader_nworker, test_loader_nworker)


Test set: Average loss: 0.2997, Accuracy: 9091/10000 (91%)

3.472299337387085


### 1.1.3 DataLoaderの引数pin_memoryをTrueに設定

In [18]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [19]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, pin_memory=True)

In [20]:
MNIST_train(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)


Test set: Average loss: 0.2946, Accuracy: 9080/10000 (91%)

13.656277179718018


In [21]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True)

In [22]:
MNIST_train(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)


Test set: Average loss: 0.1847, Accuracy: 9443/10000 (94%)

3.5092411041259766


## 2. torch.backends.cudnn.benchmark = True

In [23]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [24]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=False) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=False)

In [25]:
# 関数化

def MNIST_train_cudnn_benchmark_True(optimizer, model, device, train_loader, test_loader, epochs=1):
    """Same as ``MNIST_train`` but with ``torch.backends.cudnn.benchmark`` enabled.

    The flag is process-global. It is switched on only for the duration of
    this call and then restored to its previous value, so re-running another
    cell afterwards is not silently affected by this one.

    Args:
        optimizer: optimizer passed through to ``train``.
        model: model to train and evaluate.
        device: device passed through to ``train`` / ``test``.
        train_loader: loader passed to ``train``.
        test_loader: loader passed to ``test``.
        epochs: number of train+test rounds (default 1, the previously
            hard-coded value).

    Prints a banner followed by the elapsed wall-clock time in seconds.
    """
    previous_flag = torch.backends.cudnn.benchmark
    torch.backends.cudnn.benchmark = True
    try:
        # Monotonic clock: immune to system clock adjustments.
        start = time.perf_counter()

        for epoch in range(1, epochs + 1):
            train(model, device, train_loader, optimizer, epoch)
            test(model, device, test_loader)

        # Elapsed time
        print("=======かかった時間========")
        print(time.perf_counter() - start)
    finally:
        # Put the global flag back even if training raised.
        torch.backends.cudnn.benchmark = previous_flag
    

In [26]:
MNIST_train_cudnn_benchmark_True(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)


Test set: Average loss: 0.3198, Accuracy: 8977/10000 (90%)

14.470812797546387


In [27]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [28]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True)

In [29]:
MNIST_train_cudnn_benchmark_True(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)


Test set: Average loss: 0.2963, Accuracy: 9073/10000 (91%)

3.4936954975128174


## JITで単純な計算

In [91]:
x = torch.randn(2000, 30, 200)


In [92]:
def gelu(x):
    """Exact (erf-based) GELU: ``0.5 * x * (1 + erf(x / sqrt(2)))``.

    Args:
        x: input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    # sqrt(2) at full double precision. The earlier literal 1.41421 was
    # truncated and put the result off by up to ~6e-7 from the exact GELU.
    return x * 0.5 * (1.0 + torch.erf(x / 1.4142135623730951))

In [93]:
x = torch.randn(2000, 3000)


In [94]:
# Baseline: wall-clock time for 200 eager-mode calls of gelu on the same tensor x.
start = time.time()

for i in range(200):
    gelu(x)

# Elapsed time in seconds
print("=======かかった時間========")
print(time.time() - start)


9.846721410751343


In [95]:
@torch.jit.script
def fused_gelu(x):
    """TorchScript-compiled exact GELU: ``0.5 * x * (1 + erf(x / sqrt(2)))``."""
    # sqrt(2) at full double precision. The earlier literal 1.41421 was
    # truncated and put the result off by up to ~6e-7 from the exact GELU.
    return x * 0.5 * (1.0 + torch.erf(x / 1.4142135623730951))

In [96]:
# Warm-up: the first calls of a scripted function include TorchScript
# compilation/optimisation work. Run them outside the timed region so the
# measurement below reflects steady-state execution only.
for i in range(3):
    fused_gelu(x)

start = time.time()

for i in range(200):
    fused_gelu(x)

# Elapsed time in seconds
print("=======かかった時間========")
print(time.time() - start)

6.605190277099609


## PyTorch AMP

In [35]:
# https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples

In [36]:
def train_PyTorchAMP(model, device, train_loader, optimizer, epoch, scaler=None):
    """Run one training epoch with native PyTorch mixed precision (``torch.cuda.amp``).

    Args:
        model: module whose forward pass returns log-probabilities
            (the loss used is ``F.nll_loss``).
        device: device the batches are moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches; must expose
            ``__len__`` and a ``dataset`` attribute (both used for logging).
        optimizer: optimizer that updates ``model``'s parameters.
        epoch: epoch number, used only in the progress message.
        scaler: optional ``torch.cuda.amp.GradScaler`` to reuse. Pass the same
            instance on every epoch so the dynamic loss scale carries over;
            when omitted, a fresh scaler is created for this call, as before.

    Prints a progress line every 100 batches (including batch 0).
    """
    model.train()  # training mode (enables dropout)

    if scaler is None:
        scaler = torch.cuda.amp.GradScaler()

    for batch_idx, (data, target) in enumerate(train_loader):
        # non_blocking=True lets batches from a pin_memory=True loader be
        # copied asynchronously; otherwise it is a harmless no-op.
        data = data.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass and loss run under autocast.
        with torch.cuda.amp.autocast():
            output = model(data)
            loss = F.nll_loss(output, target)

        # Scale the loss and back-propagate the scaled gradients. backward()
        # is deliberately outside the autocast block: backward ops run in the
        # dtype autocast chose for the corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() unscales the gradients first and skips optimizer.step()
        # if they contain infs or NaNs.
        scaler.step(optimizer)

        # Update the scale factor for the next iteration.
        scaler.update()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [37]:
# 関数化

def MNIST_train_PyTorchAMP(optimizer, model, device, train_loader, test_loader, epochs=1):
    """Train with ``train_PyTorchAMP``, evaluate with ``test``, and print the elapsed time.

    ``torch.backends.cudnn.benchmark`` is a process-global flag. It is enabled
    only for the duration of this call and then restored to its previous
    value, so other cells are not silently affected.

    Args:
        optimizer: optimizer passed through to ``train_PyTorchAMP``.
        model: model to train and evaluate.
        device: device passed through to the train/test functions.
        train_loader: loader passed to ``train_PyTorchAMP``.
        test_loader: loader passed to ``test``.
        epochs: number of train+test rounds (default 1, the previously
            hard-coded value).

    Prints a banner followed by the elapsed wall-clock time in seconds.
    """
    previous_flag = torch.backends.cudnn.benchmark
    torch.backends.cudnn.benchmark = True
    try:
        # Monotonic clock: immune to system clock adjustments.
        start = time.perf_counter()

        for epoch in range(1, epochs + 1):
            train_PyTorchAMP(model, device, train_loader, optimizer, epoch)
            test(model, device, test_loader)

        # Elapsed time
        print("=======かかった時間========")
        print(time.perf_counter() - start)
    finally:
        # Put the global flag back even if training raised.
        torch.backends.cudnn.benchmark = previous_flag

In [38]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)


In [39]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=True)


In [40]:
MNIST_train_PyTorchAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)



Test set: Average loss: 0.3094, Accuracy: 9051/10000 (91%)

14.210430145263672


In [41]:
#

In [42]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)


In [43]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True)

In [44]:
MNIST_train_PyTorchAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)



Test set: Average loss: 0.2866, Accuracy: 9116/10000 (91%)

3.5828371047973633


## APEX (apex.amp)

In [45]:
# 以下は
# https://github.com/NVIDIA/apex
# を参考にAPEXをインストールしておく

#$ git clone https://github.com/NVIDIA/apex
#$ cd apex
#$ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

In [46]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [47]:
from apex import amp, optimizers

# Initialization
opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [48]:
def trainAMP(model, device, train_loader, optimizer, epoch):
    """Run one training epoch using NVIDIA Apex AMP loss scaling.

    Expects ``model`` and ``optimizer`` to have been passed through
    ``amp.initialize`` beforehand.

    Args:
        model: module whose forward pass returns log-probabilities
            (the loss used is ``F.nll_loss``).
        device: device the batches are moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches; must expose
            ``__len__`` and a ``dataset`` attribute (both used for logging).
        optimizer: optimizer that updates ``model``'s parameters.
        epoch: epoch number, used only in the progress message.

    Prints a progress line every 100 batches (including batch 0).
    """
    model.train()  # training mode (enables dropout)

    for batch_idx, (data, target) in enumerate(train_loader):
        # non_blocking=True lets batches from a pin_memory=True loader be
        # copied asynchronously; otherwise it is a harmless no-op.
        data = data.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass
        output = model(data)

        # Loss
        loss = F.nll_loss(output, target)

        # Back-propagate through the Apex-scaled loss rather than the raw loss.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


In [49]:
# 関数化

def MNIST_trainAMP(optimizer, model, device, train_loader, test_loader, epochs=1):
    """Train with ``trainAMP`` (Apex AMP), evaluate with ``test``, and print the elapsed time.

    ``torch.backends.cudnn.benchmark`` is a process-global flag. It is enabled
    only for the duration of this call and then restored to its previous
    value, so other cells are not silently affected.

    Args:
        optimizer: optimizer passed through to ``trainAMP``.
        model: model to train and evaluate.
        device: device passed through to the train/test functions.
        train_loader: loader passed to ``trainAMP``.
        test_loader: loader passed to ``test``.
        epochs: number of train+test rounds (default 1, the previously
            hard-coded value).

    Prints a banner followed by the elapsed wall-clock time in seconds.
    """
    previous_flag = torch.backends.cudnn.benchmark
    torch.backends.cudnn.benchmark = True
    try:
        # Monotonic clock: immune to system clock adjustments.
        start = time.perf_counter()

        for epoch in range(1, epochs + 1):
            trainAMP(model, device, train_loader, optimizer, epoch)
            test(model, device, test_loader)

        # Elapsed time
        print("=======かかった時間========")
        print(time.perf_counter() - start)
    finally:
        # Put the global flag back even if training raised.
        torch.backends.cudnn.benchmark = previous_flag
    

In [50]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=True)

In [51]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)


Test set: Average loss: 0.2930, Accuracy: 9081/10000 (91%)

20.411823987960815


In [52]:
# 

In [53]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [54]:
from apex import amp, optimizers

# Initialization
opt_level = 'O2'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


In [55]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=True)

In [56]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)


Test set: Average loss: 0.3044, Accuracy: 9072/10000 (91%)

19.5732102394104


In [57]:
#

In [58]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [59]:
from apex import amp, optimizers

# Initialization
opt_level = 'O3'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O3:  Pure FP16 training.
Defaults for this optimization level are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0


In [60]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=True)


In [61]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)



Test set: Average loss: 0.2752, Accuracy: 9165/10000 (92%)

19.248154401779175


In [62]:
#

In [63]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [64]:
from apex import amp, optimizers

# Initialization
opt_level = 'O0'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O0:  Pure FP32 training.

Defaults for this optimization level are:
enabled                : True
opt_level              : O0
cast_model_type        : torch.float32
patch_torch_functions  : False
keep_batchnorm_fp32    : None
master_weights         : False
loss_scale             : 1.0
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O0
cast_model_type        : torch.float32
patch_torch_functions  : False
keep_batchnorm_fp32    : None
master_weights         : False
loss_scale             : 1.0


In [65]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=True)


In [66]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)



Test set: Average loss: 2.3060, Accuracy: 899/10000 (9%)

20.49299645423889


In [67]:
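# 注: 直前の O0 の結果(正解率 9%、損失 2.3060)は学習が進んでいない値です。
# apex のドキュメントでは amp.initialize は1プロセスにつき1回だけ呼ぶことが想定されており、
# 同一カーネルで opt_level を変えて繰り返し呼んでいる影響の可能性があります(要確認)。
# 各 opt_level を比較する場合はカーネルを再起動してから実行するのが安全です。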

In [68]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = optim.Adadelta(model.parameters(), lr=lr_rate)

In [69]:
from apex import amp, optimizers

# Initialization
opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [70]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True)


In [71]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)



Test set: Average loss: 0.2769, Accuracy: 9172/10000 (92%)

5.2218098640441895


## APEX

In [72]:
import apex

# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = apex.optimizers.FusedSGD(model.parameters(), lr=lr_rate)

In [73]:
from apex import amp, optimizers

# Initialization
opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [74]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=True)


In [75]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)



Test set: Average loss: 0.2994, Accuracy: 9058/10000 (91%)

23.800689458847046


In [76]:
#

In [77]:
import apex

# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = apex.optimizers.FusedLAMB(model.parameters(), lr=lr_rate)


In [78]:
from apex import amp, optimizers

# Initialization
opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [79]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=0, pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=0, pin_memory=True)


In [80]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1024.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 512.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 256.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 128.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 64.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16.0
Gradient overflow.  Skipping step, loss scaler 0 reducing l

In [81]:
#

In [82]:
# モデル、学習率とoptimizerを設定
model = Net().to(device)
lr_rate = 0.1
optimizer = apex.optimizers.FusedSGD(model.parameters(), lr=lr_rate)

In [83]:
from apex import amp, optimizers

# Initialization
opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [84]:
# データローダー
train_loader_pin_memory = torch.utils.data.DataLoader(dataset1,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True) 
test_loader_pin_memory = torch.utils.data.DataLoader(dataset2,batch_size=mini_batch_size, num_workers=os.cpu_count(), pin_memory=True)

In [85]:
MNIST_trainAMP(optimizer, model, device, train_loader_pin_memory, test_loader_pin_memory)



Test set: Average loss: 0.3770, Accuracy: 8745/10000 (87%)

6.696407318115234


以上
